imw 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +15 -0
- data/CHANGELOG +0 -0
- data/LICENSE +674 -0
- data/README.rdoc +101 -0
- data/Rakefile +20 -0
- data/VERSION +1 -0
- data/etc/imwrc.rb +76 -0
- data/lib/imw.rb +42 -0
- data/lib/imw/boot.rb +58 -0
- data/lib/imw/dataset.rb +233 -0
- data/lib/imw/dataset/datamapper.rb +66 -0
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
- data/lib/imw/dataset/loaddump.rb +50 -0
- data/lib/imw/dataset/old/file_collection.rb +88 -0
- data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
- data/lib/imw/dataset/scaffold.rb +132 -0
- data/lib/imw/dataset/scraped_uri.rb +305 -0
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
- data/lib/imw/dataset/scrub/scrub.rb +147 -0
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
- data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
- data/lib/imw/dataset/scrub/slug.rb +101 -0
- data/lib/imw/dataset/stats.rb +73 -0
- data/lib/imw/dataset/stats/counter.rb +23 -0
- data/lib/imw/dataset/task.rb +38 -0
- data/lib/imw/dataset/workflow.rb +81 -0
- data/lib/imw/files.rb +110 -0
- data/lib/imw/files/archive.rb +113 -0
- data/lib/imw/files/basicfile.rb +122 -0
- data/lib/imw/files/binary.rb +28 -0
- data/lib/imw/files/compressed_file.rb +93 -0
- data/lib/imw/files/compressed_files_and_archives.rb +348 -0
- data/lib/imw/files/compressible.rb +103 -0
- data/lib/imw/files/csv.rb +112 -0
- data/lib/imw/files/json.rb +41 -0
- data/lib/imw/files/sgml.rb +65 -0
- data/lib/imw/files/text.rb +68 -0
- data/lib/imw/files/yaml.rb +46 -0
- data/lib/imw/packagers.rb +8 -0
- data/lib/imw/packagers/archiver.rb +108 -0
- data/lib/imw/packagers/s3_mover.rb +28 -0
- data/lib/imw/parsers.rb +7 -0
- data/lib/imw/parsers/html_parser.rb +382 -0
- data/lib/imw/parsers/html_parser/matchers.rb +306 -0
- data/lib/imw/parsers/line_parser.rb +87 -0
- data/lib/imw/parsers/regexp_parser.rb +72 -0
- data/lib/imw/utils.rb +24 -0
- data/lib/imw/utils/components.rb +61 -0
- data/lib/imw/utils/config.rb +46 -0
- data/lib/imw/utils/error.rb +54 -0
- data/lib/imw/utils/extensions/array.rb +125 -0
- data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
- data/lib/imw/utils/extensions/core.rb +43 -0
- data/lib/imw/utils/extensions/dir.rb +24 -0
- data/lib/imw/utils/extensions/file_core.rb +64 -0
- data/lib/imw/utils/extensions/hash.rb +218 -0
- data/lib/imw/utils/extensions/hpricot.rb +48 -0
- data/lib/imw/utils/extensions/string.rb +49 -0
- data/lib/imw/utils/extensions/struct.rb +42 -0
- data/lib/imw/utils/extensions/symbol.rb +28 -0
- data/lib/imw/utils/extensions/typed_struct.rb +22 -0
- data/lib/imw/utils/extensions/uri.rb +59 -0
- data/lib/imw/utils/log.rb +67 -0
- data/lib/imw/utils/misc.rb +63 -0
- data/lib/imw/utils/paths.rb +115 -0
- data/lib/imw/utils/uri.rb +59 -0
- data/lib/imw/utils/uuid.rb +33 -0
- data/lib/imw/utils/validate.rb +38 -0
- data/lib/imw/utils/version.rb +12 -0
- data/lib/imw/utils/view.rb +113 -0
- data/lib/imw/utils/view/dump_csv.rb +112 -0
- data/lib/imw/utils/view/dump_csv_older.rb +117 -0
- data/spec/data/sample.csv +131 -0
- data/spec/data/sample.tsv +131 -0
- data/spec/data/sample.txt +131 -0
- data/spec/data/sample.xml +653 -0
- data/spec/data/sample.yaml +652 -0
- data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
- data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
- data/spec/imw/files/archive_spec.rb +118 -0
- data/spec/imw/files/basicfile_spec.rb +121 -0
- data/spec/imw/files/bz2_spec.rb +32 -0
- data/spec/imw/files/compressed_file_spec.rb +96 -0
- data/spec/imw/files/compressible_spec.rb +100 -0
- data/spec/imw/files/file_spec.rb +144 -0
- data/spec/imw/files/gz_spec.rb +32 -0
- data/spec/imw/files/rar_spec.rb +33 -0
- data/spec/imw/files/tar_spec.rb +31 -0
- data/spec/imw/files/text_spec.rb +23 -0
- data/spec/imw/files/zip_spec.rb +31 -0
- data/spec/imw/files_spec.rb +38 -0
- data/spec/imw/packagers/archiver_spec.rb +125 -0
- data/spec/imw/packagers/s3_mover_spec.rb +7 -0
- data/spec/imw/parsers/line_parser_spec.rb +96 -0
- data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
- data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
- data/spec/imw/utils/extensions/find_spec.rb +113 -0
- data/spec/imw/utils/paths_spec.rb +38 -0
- data/spec/imw/workflow/rip/local_spec.rb +89 -0
- data/spec/imw/workflow/rip_spec.rb +27 -0
- data/spec/rcov.opts +1 -0
- data/spec/spec.opts +4 -0
- data/spec/spec_helper.rb +32 -0
- data/spec/support/archive_contents_matcher.rb +94 -0
- data/spec/support/custom_matchers.rb +21 -0
- data/spec/support/directory_contents_matcher.rb +61 -0
- data/spec/support/extensions.rb +18 -0
- data/spec/support/file_contents_matcher.rb +50 -0
- data/spec/support/random.rb +210 -0
- data/spec/support/without_regard_to_order_matcher.rb +58 -0
- metadata +196 -0
@@ -0,0 +1,66 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/dataset/datamapper.rb -- extensions to datamapper for datasets
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# The DataMapper[http://datamapper.org/] library is an ORM for Ruby
|
7
|
+
# which is lighter than ActiveRecord[http://ar.rubyonrails.com/] and
|
8
|
+
# the like. It is the ORM that IMW is designed to work natively with.
|
9
|
+
#
|
10
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
+
# License:: GPL 3.0
|
13
|
+
# Website:: http://infinitemonkeywrench.org/
|
14
|
+
#
|
15
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
16
|
+
|
17
|
+
require 'imw/utils'
|
18
|
+
require 'dm-core'
|
19
|
+
require 'dm-ar-finders'
|
20
|
+
require 'dm-aggregates'
|
21
|
+
require 'dm-serializer'
|
22
|
+
|
23
|
+
module DataMapper
|
24
|
+
# Connect to a remote database.
#
# Reads <tt>:protocol</tt>, <tt>:username</tt>, <tt>:password</tt>,
# <tt>:hostname</tt> and <tt>:dbname</tt> from +options+ and joins them into
# a <tt>protocol://username:password@hostname/dbname</tt> connection string.
# The repository is registered under <tt>options[:handle]</tt>, which falls
# back to <tt>:default</tt> when not given.
def self.setup_remote_connection options
  settings = { :handle => :default }.merge(options)
  protocol, username, password, hostname, dbname =
    settings.values_at(:protocol, :username, :password, :hostname, :dbname)
  DataMapper.setup(settings[:handle], "#{protocol}://#{username}:#{password}@#{hostname}/#{dbname}")
end
|
30
|
+
# Connect to a local database.
#
# Reads <tt>:protocol</tt>, <tt>:dbpath</tt> and <tt>:dbname</tt> from
# +options+ and joins them into a <tt>protocol://dbpath/dbname</tt>
# connection string.  The repository is registered under
# <tt>options[:handle]</tt>, which falls back to <tt>:default</tt>.
def self.setup_local_connection options
  settings = { :handle => :default }.merge(options)
  protocol, dbpath, dbname = settings.values_at(:protocol, :dbpath, :dbname)
  DataMapper.setup(settings[:handle], "#{protocol}://#{dbpath}/#{dbname}")
end
|
36
|
+
|
37
|
+
# KLUDGE
|
38
|
+
# Open one remote connection per entry of +repository_dbnames+.
#
# Each entry is yielded as a (handle, dbname) pair; the pair overrides the
# <tt>:handle</tt> and <tt>:dbname</tt> keys of the shared +params+ before
# they are handed to DataMapper.setup_remote_connection.
def self.open_repositories repository_dbnames, params
  repository_dbnames.each do |repo_handle, repo_dbname|
    overrides = { :handle => repo_handle, :dbname => repo_dbname }
    DataMapper.setup_remote_connection(params.merge(overrides))
  end
end
|
44
|
+
|
45
|
+
|
46
|
+
module Model
|
47
|
+
|
48
|
+
# Find the first resource matching +search_attributes+ and apply
# +updateable_attributes+ to it; when nothing matches, create a resource
# from both hashes merged together.  Returns the resource in either case.
def update_or_create(search_attributes, updateable_attributes = {})
  existing = first(search_attributes)
  return create(search_attributes.merge(updateable_attributes)) unless existing

  existing.update_attributes(updateable_attributes)
  existing
end
|
58
|
+
|
59
|
+
end
|
60
|
+
|
61
|
+
# Watch the SQL log -- must be called BEFORE the database is set up.
#
# A falsy +verbosity+ does nothing.  +true+ is shorthand for
# <tt>:debug</tt>; any other value is passed to DataMapper::Logger as the
# level.  Log output goes to STDERR.
def self.logging=(verbosity)
  return unless verbosity

  level = verbosity == true ? :debug : verbosity
  DataMapper::Logger.new(STDERR, level)
end
|
66
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
# gem 'dm-core', '=0.9.6'
|
4
|
+
require 'dm-core'
|
5
|
+
|
6
|
+
#
|
7
|
+
# Stolen from http://github.com/sam/dm-more/tree/master/dm-timestamps/lib/dm-timestamps.rb
|
8
|
+
#
|
9
|
+
|
10
|
+
module DataMapper
|
11
|
+
module Timestamp
  # Maps each stamp property name to a callable that assigns that property
  # on the record it is given.  The <tt>updated_*</tt> stamps are assigned
  # unconditionally; the <tt>created_*</tt> stamps are assigned only on a
  # new record whose stamp is still unset.  The <tt>*_by</tt> stamps take
  # their value from <tt>IMW::USER_INFO[:id]</tt>.
  TIMESTAMP_PROPERTIES = {
    :updated_at => lambda { |record| record.updated_at = DateTime.now },
    :updated_on => lambda { |record| record.updated_on = Date.today },
    :updated_by => lambda { |record| record.updated_by = IMW::USER_INFO[:id] },
    :created_at => lambda { |record| record.created_at = DateTime.now if record.new_record? && record.created_at.nil? },
    :created_on => lambda { |record| record.created_on = Date.today if record.new_record? && record.created_on.nil? },
    :created_by => lambda { |record| record.created_by = IMW::USER_INFO[:id] if record.new_record? && record.created_by.blank? },
  }

  # Registers #set_timestamp_properties as a before-save hook on the
  # including +model+.
  def self.included(model)
    model.before(:save, :set_timestamp_properties)
  end

  private

  # Does nothing unless the resource is dirty.  Otherwise slices the
  # model's properties by the stamp names, drops the nils, and runs the
  # matching callable for every property that is not itself dirty (so an
  # explicitly assigned stamp is left alone).
  def set_timestamp_properties
    return unless dirty?

    stamp_properties = self.class.properties.slice(*TIMESTAMP_PROPERTIES.keys).compact
    stamp_properties.each do |prop|
      next if attribute_dirty?(prop.name)
      TIMESTAMP_PROPERTIES[prop.name].call(self)
    end
  end
end # module Timestamp
|
35
|
+
|
36
|
+
Resource::append_inclusions Timestamp
|
37
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/dataset/loaddump.rb -- read and write datasets to resources
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# Implements methods to load a dataset from a resource and to write a
|
7
|
+
# dataset back to a resource.
|
8
|
+
#
|
9
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
+
# License:: GPL 3.0
|
12
|
+
# Website:: http://infinitemonkeywrench.org/
|
13
|
+
#
|
14
|
+
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
15
|
+
|
16
|
+
require 'imw/utils'
|
17
|
+
|
18
|
+
module IMW
|
19
|
+
class Dataset
|
20
|
+
|
21
|
+
# Return the data in +filename+ in an appropriate form.
#
# +filename+ is resolved with +path_to+, opened with <tt>IMW.open</tt>
# and loaded.  Without a block the loaded data is returned.  With a block
# each record of the data is yielded in turn and the opened file object
# is returned instead.
def self.load filename, &block
  resolved = path_to(filename)
  announce "Loading #{resolved}"
  file = IMW.open(resolved)
  data = file.load(resolved)
  return data unless block

  data.each { |record| yield record }
  file
end
|
36
|
+
|
37
|
+
# Dump +data+ to +filename+.
#
# +filename+ is resolved with +path_to+, opened for writing (mode 'w')
# through <tt>IMW.open</tt>, and the resulting object's +dump+ is called
# with +data+.
def self.dump data, filename
  target = path_to(filename)
  announce "Dumping to #{target}"
  writer = IMW.open(target, 'w')
  writer.dump(data)
end
|
43
|
+
|
44
|
+
# Dispatch to <tt>Dataset.dump</tt>, writing this dataset's +data+ to
# +filename+.
#
# Fixed: the body used to splat an undefined +args+, which raised
# NameError on every call; the +filename+ parameter is now forwarded.
def dump filename
  self.class.dump self.data, filename
end
|
48
|
+
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'imw/dataset'
|
2
|
+
require 'imw/dataset/uri'
|
3
|
+
|
4
|
+
#
|
5
|
+
# All the files associated with a given URL
|
6
|
+
#
|
7
|
+
# DataMapper resource keyed by a serial +id+, carrying a +category+ string
# and a has-many association to +ripped_file_collections+.
class DatasetFileCollection
|
8
|
+
include DataMapper::Resource
|
9
|
+
property :id, Integer, :serial => true
|
10
|
+
# category is declared non-nullable and belongs to the :category unique index
property :category, String, :nullable => false, :unique_index => :category
|
11
|
+
has n, :ripped_file_collections
|
12
|
+
|
13
|
+
end
|
14
|
+
|
15
|
+
#
|
16
|
+
# Collection of raw files retrieved from a spider based at a given URL
|
17
|
+
#
|
18
|
+
class RippedFileCollection
|
19
|
+
include DataMapper::Resource
|
20
|
+
property :id, Integer, :serial => true
|
21
|
+
belongs_to :url, :class_name => DM_URI, :child_key => [:url_id]
|
22
|
+
has n, :ripped_files
|
23
|
+
belongs_to :dataset_file_collection
|
24
|
+
|
25
|
+
# Find or create the collection for +url+.
#
# +url+ is first turned into a DM_URI record; that record's id is the
# search key, and +dataset_file_collection+ is the second attribute hash
# handed to +find_or_create+.
def self.find_or_create_from_url url, dataset_file_collection
  uri_record = DM_URI.find_or_create_from_url(url)
  self.find_or_create(
    { :url_id => uri_record.id },
    { :dataset_file_collection => dataset_file_collection })
end
|
31
|
+
|
32
|
+
# Path, under the :rawd segment, of the text file listing this
# collection's files; the name embeds the url's flat-filename form.
def listing_filename()
  listing_name = "listing-#{url.as_flat_filename}.txt"
  path_to(:rawd, listing_name)
end
|
35
|
+
|
36
|
+
# Write the listing file by running +find+ on the url's path from inside
# the :ripd_root directory.  Does nothing (and returns nil) when the
# listing file already exists.
def make_listing_file
  unless File.exist?(listing_filename)
    FileUtils.cd(path_to(:ripd_root)) do
      `find #{url.as_path} > #{listing_filename}`
    end
  end
end
|
42
|
+
|
43
|
+
# Mon Aug 11 08:59:00 -0500 2008 files: 0
|
44
|
+
# Mon Aug 11 09:05:34 -0500 2008 files: 100000 => so, 1M files/hr. not good.
|
45
|
+
# Index every path named in the listing file as a RippedFile.
#
# Ensures the listing exists, then reads it line by line from inside the
# :ripd_root directory.  The portion of each line that follows the url's
# path plus one separator character is the ripd-relative path; lines with
# no such remainder (e.g. the base directory itself) are skipped.
# Finishes by saving this collection and returns the result of +save+.
def index_from_listing
  make_listing_file
  self.ripped_files
  FileUtils.cd(path_to(:ripd_root)) do
    File.foreach(listing_filename) do |line|
      track_count :files
      line.chomp!
      # skip the url path and the character that follows it
      relative = line[(1 + url.as_path.length)..-1]
      RippedFile.from_file(self, line, relative) unless relative.blank?
    end
  end
  self.save
end
|
59
|
+
end
|
60
|
+
|
61
|
+
#
|
62
|
+
# Index the raw files retrieved from website
|
63
|
+
#
|
64
|
+
class RippedFile
|
65
|
+
include DataMapper::Resource
|
66
|
+
property :id, Integer, :serial => true
|
67
|
+
property :ripped_file_collection_id, Integer, :unique_index => :ripd_path
|
68
|
+
property :ripd_path, String, :length => 255, :nullable => false, :unique_index => :ripd_path
|
69
|
+
property :retrieval_date, DateTime
|
70
|
+
property :compressed_size, Integer
|
71
|
+
belongs_to :ripped_file_collection
|
72
|
+
|
73
|
+
# Find or create the RippedFile for +ripd_path+.
#
# The search key is <tt>:ripd_path</tt> alone.  The second hash carries
# the owning collection +clxn+ plus the modification time and size of
# the file at +full_path+, both read from disk.
def self.from_file clxn, full_path, ripd_path
  file_details = {
    :ripped_file_collection => clxn,
    :retrieval_date         => File.mtime(full_path),
    :compressed_size        => File.size(full_path),
  }
  self.find_or_create({ :ripd_path => ripd_path }, file_details)
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
86
|
+
# SELECT r.*, u.host, u.path FROM ripped_files r
|
87
|
+
# LEFT JOIN ripped_file_collections rfs ON r.ripped_file_collection_id = rfs.id
|
88
|
+
# LEFT JOIN dm_uris u ON rfs.url_id = u.id
|
@@ -0,0 +1,71 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'imw/utils'; include IMW
|
3
|
+
require 'imw/dataset/file_collection'
|
4
|
+
require 'tempfile'
|
5
|
+
|
6
|
+
# Fixed scratch path that +find+ writes the listing to and mysql loads from.
def bulk_listing_filename()
  '/tmp/listing_foo.txt'
end
|
7
|
+
# Name of the table the bulk loader writes to.
def table_name()
  'ripped_files'
end
|
8
|
+
|
9
|
+
# Run the SQL text +cmd+ through the +mysql+ command-line client.
#
# Connection settings come from the <tt>:username</tt>,
# <tt>:password</tt>, <tt>:hostname</tt> and <tt>:dbname</tt> keys of
# +db_params+.  The SQL is written to a temp file and fed to mysql on
# stdin; whatever the shell command prints on stdout is echoed with +puts+.
def run_mysql_cmd db_params, cmd
  user, pass, host, db = db_params.values_at(:username, :password, :hostname, :dbname)
  sql_file = Tempfile.new("qlstg")
  sql_file.puts cmd
  sql_file.close
  mysql_invocation = "time mysql -E -u#{user} -p#{pass} -h#{host} #{db} < #{sql_file.path}"
  puts `#{mysql_invocation}`
end
|
16
|
+
|
17
|
+
# Bulk-load the listing file into the table named by +table_name+ by
# handing a LOAD DATA LOCAL INFILE statement to run_mysql_cmd.
# +db_params+ is passed straight through; +ripd_base+ is only used in the
# announcement message.
def bulk_load_mysql db_params, ripd_base
|
18
|
+
announce "Calling mysql to bulk load #{ripd_base} (expect ~2s per 100k files)"
|
19
|
+
# %Q behaves like a double-quoted string, so the \n below is expanded by
# Ruby before the statement reaches mysql.  REPLACE overwrites rows that
# collide on a unique key.
run_mysql_cmd db_params, %Q{
|
20
|
+
LOAD DATA LOCAL INFILE '#{bulk_listing_filename}'
|
21
|
+
REPLACE INTO TABLE `#{table_name}`
|
22
|
+
FIELDS TERMINATED BY ','
|
23
|
+
LINES TERMINATED BY '\n'
|
24
|
+
(`ripped_file_collection_id`, `ripd_path`, `retrieval_date`, `compressed_size`)
|
25
|
+
;
|
26
|
+
}
|
27
|
+
end
|
28
|
+
|
29
|
+
# Empty the table named by +table_name+ with a TRUNCATE statement.
#
# Fixed: run_mysql_cmd takes (db_params, cmd), but this method passed
# only the SQL string, so every call raised ArgumentError.  The connection
# parameters are now accepted as an optional argument.
#
# NOTE(review): the default is the constant this script itself merges its
# <tt>:dbname</tt> into; it may not name a database on its own, so pass the
# merged +db_params+ explicitly -- confirm against the constant's definition.
def clear_table db_params = IMW::DEFAULT_DATABASE_CONNECTION_PARAMS
  run_mysql_cmd db_params, "TRUNCATE #{table_name}"
end
|
32
|
+
|
33
|
+
class RippedFileCollection
  # Build a CSV listing of this collection's files with +find+ and
  # bulk-load it into mysql.
  #
  # +extra_find_args+ is spliced verbatim into the +find+ command line.
  # Each listing row is: this collection's id, the path relative to the
  # starting point (%P), the modification timestamp, and the size (%s).
  #
  # NOTE(review): the listing is only generated when the fixed
  # +bulk_listing_filename+ does not exist yet, so a second collection
  # processed in the same run would load the first one's listing --
  # confirm whether the file is expected to be removed between calls.
  def bulk_load_listing db_params, extra_find_args=""
    announce "Indexing #{url.as_path} (expect ~10s per 100k files)"
    unless File.exist?(bulk_listing_filename)
      FileUtils.cd(path_to(:ripd_root)) do
        printf_format   = "#{self.id},%P,%TY-%Tm-%Td %TH:%TM:%TS,%s\n"
        listing_command = "find #{url.as_path} #{extra_find_args} -printf '#{printf_format}' > #{bulk_listing_filename}"
        puts `time #{listing_command}`
      end
    end
    bulk_load_mysql db_params, url.as_path
  end
end
|
44
|
+
|
45
|
+
|
46
|
+
# SELECT rf_yrs.*, dfc.*, url.scheme, url.host, url.path
|
47
|
+
# FROM (
|
48
|
+
# SELECT SUBSTR(ripd_path,1,4) AS yr, COUNT(*), r.*
|
49
|
+
# FROM ripped_files r
|
50
|
+
# GROUP BY ripped_file_collection_id, yr
|
51
|
+
# ORDER BY ripped_file_collection_id, yr
|
52
|
+
# ) rf_yrs
|
53
|
+
# LEFT JOIN ripped_file_collections rfc ON rfc.id = rf_yrs.ripped_file_collection_id
|
54
|
+
# LEFT JOIN dataset_file_collections dfc ON dfc.id = rfc.dataset_file_collection_id
|
55
|
+
# LEFT JOIN dm_uris url ON url.id = rfc.url_id
|
56
|
+
|
57
|
+
db_params = IMW::DEFAULT_DATABASE_CONNECTION_PARAMS.merge({ :dbname => 'imw_weather_ncdc' })
IMW::Dataset.setup_remote_connection db_params

# One row per collection, processed in order (daily, hourly, hourly-lite):
# dataset category, URL the files were ripped from, and extra arguments
# for +find+ (nil means call bulk_load_listing with its default).
[
  ['weather/ncdc/daily',       'ftp://ftp.ncdc.noaa.gov/pub/data/gsod',          nil],
  ['weather/ncdc/hourly',      'ftp://ftp.ncdc.noaa.gov/pub/data/noaa',          '\\! \\( -iname "isd-lite" -prune \\) '],
  ['weather/ncdc/hourly_lite', 'ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite', nil],
].each do |category, source_url, extra_find_args|
  dset_clxn = DatasetFileCollection.find_or_create({ :category => category })
  rf_clxn   = RippedFileCollection.find_or_create_from_url source_url, dset_clxn
  if extra_find_args
    rf_clxn.bulk_load_listing db_params, extra_find_args
  else
    rf_clxn.bulk_load_listing db_params
  end
end
|
@@ -0,0 +1,132 @@
|
|
1
|
+
#
|
2
|
+
# h2. lib/imw/workflow/scaffold.rb -- scaffold the directory structure for a dataset
|
3
|
+
#
|
4
|
+
# == About
|
5
|
+
#
|
6
|
+
# Defines workflow tasks for datasets to create directories and
|
7
|
+
# symlinks to ease the processing of a dataset.
|
8
|
+
#
|
9
|
+
# Right now this file contains code written by Flip as well as code
|
10
|
+
# written by Dhruv which accomplish basically the same task. Dhruv's
|
11
|
+
# code integrates with <tt>IMW::Dataset</tt> and Rake and should be
|
12
|
+
# used preferentially.
|
13
|
+
#
|
14
|
+
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
15
|
+
# Copyright:: Copyright (c) 2008 infochimps.org
|
16
|
+
# License:: GPL 3.0
|
17
|
+
# Website:: http://infinitemonkeywrench.org/
|
18
|
+
#
|
19
|
+
# puts "#{File.basename(__FILE__)}: POST NO BILLS. Is that funny to anyone but me? No?" # at bottom
|
20
|
+
|
21
|
+
require 'rake'
|
22
|
+
require 'fileutils'
|
23
|
+
|
24
|
+
require 'imw/utils'
|
25
|
+
require 'imw/dataset/task'
|
26
|
+
|
27
|
+
include FileUtils
|
28
|
+
|
29
|
+
module IMW
|
30
|
+
include FileUtils
|
31
|
+
|
32
|
+
################################################################
|
33
|
+
## FLIP'S CODE
|
34
|
+
################################################################
|
35
|
+
|
36
|
+
# Create the directory that +path_to(:me)+ resolves to, along with any
# missing parents.
def scaffold_script_dirs
  script_dir = path_to(:me)
  mkdir_p script_dir
end
|
39
|
+
|
40
|
+
#
|
41
|
+
# * creates a directory for the dataset in each of the top-level hierarchies
|
42
|
+
# (as given in ~/.imwrc)
|
43
|
+
# * links to that directory within the working directory
|
44
|
+
# in directory pool/foo/bar/baz we'd find
|
45
|
+
# rawd => /data/rawd/foo/bar/baz
|
46
|
+
#
|
47
|
+
# For each of the :rawd, :temp, :fixd and :log segments whose path does
# not exist yet: create the dataset's directory under that segment's
# root, then symlink the segment path to it.
def scaffold_dset_dirs
  [:rawd, :temp, :fixd, :log].each do |seg|
    next if File.exist?(path_to(seg))

    target_dir = path_to(pathseg_root(seg), :dset)
    mkdir_p target_dir
    ln_s target_dir, path_to(seg)
  end
end
|
56
|
+
|
57
|
+
|
58
|
+
#
|
59
|
+
# * creates a symlink within the working directory to the
|
60
|
+
# ripped directory, named after its url
|
61
|
+
#
|
62
|
+
# Create the ripped directory for +url+ under :ripd_root and symlink
# +path_to(:ripd)+ to it.  Does nothing when +path_to(:ripd)+ already
# exists.
#
# Fixed: the guard used to test <tt>path_to(seg)</tt>, but no +seg+ exists
# in this method (it was copied from scaffold_dset_dirs), so every call
# raised NameError.  The guard now checks the symlink location this
# method is about to create.
def scaffold_rip_dir url
  ripd_link = path_to(:ripd)
  return if File.exist?(ripd_link)

  ripd_dir = path_to(:ripd_root, url)
  mkdir_p ripd_dir
  ln_s ripd_dir, ripd_link
end
|
69
|
+
|
70
|
+
# Scaffold everything for a dataset: the script directory first, then
# the per-segment dataset directories and their symlinks.
def scaffold_dset
  scaffold_script_dirs
  scaffold_dset_dirs
end
|
74
|
+
|
75
|
+
|
76
|
+
################################################################
|
77
|
+
## DHRUV's CODE -- uses IMW::Dataset and Rake
|
78
|
+
################################################################
|
79
|
+
module Workflow
|
80
|
+
|
81
|
+
# Defines the workflow task <tt>:create_directories</tt>, which has no
# prerequisites.  When run it creates the directory for each of the
# :peel, :munge, :fix and :package steps that does not exist yet.
def create_directories_task
  @last_description = "Creates directories for this dataset in the peel through package steps."
  define_task(IMW::Task, {:create_directories => []}) do
    [:peel, :munge, :fix, :package].each do |step|
      next if File.exist?(path_to(step))
      FileUtils.mkdir_p(path_to(step))
    end
  end
end
|
91
|
+
|
92
|
+
# Defines the workflow task <tt>:create_symlinks</tt>, which depends on
# <tt>:create_directories</tt>.  When run it links each of the :peel,
# :munge, :fix and :package step directories into the script directory
# under the name given by <tt>IMW::Dataset::WORKFLOW_STEP_DIRS</tt>, and
# links :ripd_root there as "ripd".  Links that already exist are left
# alone.
def create_symlinks_task
  @last_description = "Creates symlinks pointing from the directory containing scripts for this dataset to the directories for the peel through package steps."
  define_task(IMW::Task, {:create_symlinks => [:create_directories]}) do
    [:peel, :munge, :fix, :package].each do |step|
      step_link = File.join(path_to(:script), IMW::Dataset::WORKFLOW_STEP_DIRS[step].to_s)
      next if File.exist?(step_link)
      FileUtils.ln_s(path_to(step), step_link)
    end
    ripd_link = File.join(path_to(:script), "ripd")
    FileUtils.ln_s(path_to(:ripd_root), ripd_link) unless File.exist?(ripd_link)
  end
end
|
105
|
+
|
106
|
+
# Defines the task <tt>:initialize</tt>.  It has no body of its own; it
# only pulls in <tt>:create_directories</tt> and <tt>:create_symlinks</tt>
# as prerequisites.
def create_initialize_task
  @last_description = "Set everything up to begin processing the dataset."
  prerequisites = [:create_directories, :create_symlinks]
  define_task(IMW::Task, :initialize => prerequisites)
end
|
112
|
+
|
113
|
+
# Defines the task <tt>:delete_data</tt>, which has no prerequisites.
# When run it removes the directory for each of the :peel, :munge, :fix
# and :package steps that currently exists.
def create_delete_data_task
  @last_description = "Deletes all data and directories for this dataset for the peel through package steps."
  define_task(IMW::Task, {:delete_data => []}) do
    [:peel, :munge, :fix, :package].each do |step|
      next unless File.exist?(path_to(step))
      FileUtils.remove_dir(path_to(step))
    end
  end
end
|
122
|
+
|
123
|
+
# Defines the task <tt>:destroy</tt>.  It has no body of its own; it only
# pulls in <tt>:delete_data</tt> as a prerequisite.
def create_destroy_task
  @last_description = "Get rid of all traces of this dataset."
  prerequisites = [:delete_data]
  define_task(IMW::Task, :destroy => prerequisites)
end
|
130
|
+
|
131
|
+
end
|
132
|
+
end
|