imw 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,66 @@
1
+ #
2
+ # h2. lib/imw/dataset/datamapper.rb -- extensions to datamapper for datasets
3
+ #
4
+ # == About
5
+ #
6
+ # The DataMapper[http://datamapper.org/] library is an ORM for Ruby
7
+ # which is lighter than ActiveRecord[http://ar.rubyonrails.com/] and
8
+ # the like. It is the ORM that IMW is designed to work natively with.
9
+ #
10
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
+ # Copyright:: Copyright (c) 2008 infochimps.org
12
+ # License:: GPL 3.0
13
+ # Website:: http://infinitemonkeywrench.org/
14
+ #
15
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
16
+
17
+ require 'imw/utils'
18
+ require 'dm-core'
19
+ require 'dm-ar-finders'
20
+ require 'dm-aggregates'
21
+ require 'dm-serializer'
22
+
23
# IMW's extensions to DataMapper: helpers that build a connection URI from
# an options Hash, a find-or-create-and-update finder for models, and a
# switch for SQL logging.
module DataMapper
  # Connect to a remote database.
  #
  # +options+ is a Hash.  The values under <tt>:protocol</tt>,
  # <tt>:username</tt>, <tt>:password</tt>, <tt>:hostname</tt> and
  # <tt>:dbname</tt> are interpolated, unescaped, into a
  # <tt>protocol://username:password@hostname/dbname</tt> URI (a missing
  # key interpolates as an empty string).  The connection is set up under
  # <tt>options[:handle]</tt>, which defaults to <tt>:default</tt>.
  #
  # Returns whatever <tt>DataMapper.setup</tt> returns.
  def self.setup_remote_connection(options)
    options = { :handle => :default }.merge(options)
    params  = options.values_at(:protocol, :username, :password, :hostname, :dbname)
    DataMapper.setup(options[:handle], "%s://%s:%s@%s/%s" % params)
  end

  # Connect to a local database.
  #
  # Works like +setup_remote_connection+ but builds a
  # <tt>protocol://dbpath/dbname</tt> URI from the <tt>:protocol</tt>,
  # <tt>:dbpath</tt> and <tt>:dbname</tt> options.
  def self.setup_local_connection(options)
    options = { :handle => :default }.merge(options)
    params  = options.values_at(:protocol, :dbpath, :dbname)
    DataMapper.setup(options[:handle], "%s://%s/%s" % params)
  end

  # KLUDGE
  #
  # Set up one remote connection per entry of +repository_dbnames+ (a
  # mapping of handle => database name); every connection shares the
  # remaining settings given in +params+.
  def self.open_repositories(repository_dbnames, params)
    repository_dbnames.each do |handle, dbname|
      repo_params = params.merge({ :handle => handle, :dbname => dbname })
      DataMapper.setup_remote_connection(repo_params)
    end
  end

  module Model
    # Find or create the resource matching search attributes and in
    # either case set the update-able attributes.
    #
    # An existing resource (looked up with +first+) is updated through
    # +update_attributes+; otherwise a new one is built with +create+ from
    # both hashes merged.  Returns the resource.
    def update_or_create(search_attributes, updateable_attributes = {})
      resource = first(search_attributes)
      if resource
        resource.update_attributes(updateable_attributes)
      else
        resource = create(search_attributes.merge(updateable_attributes))
      end
      resource
    end
  end

  # watch SQL log -- must be BEFORE call to db setup
  #
  # +true+ selects the <tt>:debug</tt> level, any other truthy value is
  # passed to the logger as the level, and +nil+/+false+ does nothing.
  # The log is written to STDERR.
  def self.logging=(verbosity)
    verbosity = :debug if verbosity == true
    DataMapper::Logger.new(STDERR, verbosity) if verbosity
  end
end
@@ -0,0 +1,37 @@
1
+ require 'rubygems'
2
+
3
+ # gem 'dm-core', '=0.9.6'
4
+ require 'dm-core'
5
+
6
+ #
7
+ # Stolen from http://github.com/sam/dm-more/tree/master/dm-timestamps/lib/dm-timestamps.rb
8
+ #
9
+
10
module DataMapper
  # Fills in time and user stamps on a resource just before it is saved.
  # A stamp is only set when the caller has not already changed that
  # attribute on the resource.
  module Timestamp
    # Property name => callable that sets that property on the resource
    # it is given.  The created_* stamps are only set on new records that
    # do not yet carry a value; the *_by stamps read IMW::USER_INFO[:id].
    TIMESTAMP_PROPERTIES = {
      :updated_at => lambda { |r| r.updated_at = DateTime.now },
      :updated_on => lambda { |r| r.updated_on = Date.today },
      :updated_by => lambda { |r| r.updated_by = IMW::USER_INFO[:id] },
      :created_at => lambda { |r| r.created_at = DateTime.now if r.new_record? && r.created_at.nil? },
      :created_on => lambda { |r| r.created_on = Date.today if r.new_record? && r.created_on.nil? },
      :created_by => lambda { |r| r.created_by = IMW::USER_INFO[:id] if r.new_record? && r.created_by.blank? },
    }

    # Registers +set_timestamp_properties+ as a before-save hook on every
    # model that includes this module.
    def self.included(model)
      model.before :save, :set_timestamp_properties
    end

    private

    # Runs the stamp callable for each of the model's properties named in
    # TIMESTAMP_PROPERTIES, skipping clean resources and attributes the
    # caller has explicitly modified.
    def set_timestamp_properties
      return unless dirty?

      # presumably +slice+ yields nil for names the model does not
      # declare, hence the +compact+ -- TODO confirm against dm-core
      self.class.properties.slice(*TIMESTAMP_PROPERTIES.keys).compact.each do |property|
        next if attribute_dirty?(property.name)

        TIMESTAMP_PROPERTIES[property.name].call(self)
      end
    end
  end # module Timestamp

  Resource.append_inclusions Timestamp
end
@@ -0,0 +1,50 @@
1
+ #
2
+ # h2. lib/imw/dataset/loaddump.rb -- read and write datasets to resources
3
+ #
4
+ # == About
5
+ #
6
+ # Implements methods to load a dataset from a resource and to write a
7
+ # dataset back to a resource.
8
+ #
9
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
+ # Copyright:: Copyright (c) 2008 infochimps.org
11
+ # License:: GPL 3.0
12
+ # Website:: http://infinitemonkeywrench.org/
13
+ #
14
+ # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
15
+
16
+ require 'imw/utils'
17
+
18
module IMW
  class Dataset

    # Return the data in +filename+ in an appropriate form.
    #
    # +filename+ is resolved with +path_to+ and opened with
    # <tt>IMW.open</tt>.  Without a block the loaded data is returned.
    # With a block, each record of the loaded data is passed to the block
    # and the opened file object is returned instead.
    def self.load(filename, &block)
      filename = path_to(filename)
      announce "Loading #{filename}"
      file = IMW.open(filename)
      data = file.load(filename)
      return data unless block

      data.each { |record| block.call(record) }
      file
    end

    # Dump +data+ to +filename+.
    #
    # +filename+ is resolved with +path_to+ and opened for writing;
    # returns whatever the opened file's +dump+ returns.
    def self.dump(data, filename)
      filename = path_to(filename)
      announce "Dumping to #{filename}"
      IMW.open(filename, 'w').dump(data)
    end

    # Dispatch to <tt>Dataset.dump</tt>, writing this dataset's +data+
    # to +filename+.
    def dump(filename)
      # The original forwarded an undefined +args+ here, which raised
      # NameError on every call.
      self.class.dump(data, filename)
    end

  end
end
@@ -0,0 +1,88 @@
1
+ require 'imw/dataset'
2
+ require 'imw/dataset/uri'
3
+
4
+ #
5
+ # All the files associated with a given URL
6
+ #
7
# DataMapper resource grouping ripped file collections under one
# category.  +category+ is required and carries a unique index.
class DatasetFileCollection
  include DataMapper::Resource
  property :id,       Integer, :serial => true
  property :category, String,  :nullable => false, :unique_index => :category
  has n, :ripped_file_collections
end
14
+
15
+ #
16
+ # Collection of raw files retrieved from a spider based at a given URL
17
+ #
18
# DataMapper resource for the files ripped from one URL.  It belongs to a
# DM_URI (through +url_id+) and to a DatasetFileCollection, and has many
# RippedFile records.
class RippedFileCollection
  include DataMapper::Resource
  property :id, Integer, :serial => true
  belongs_to :url, :class_name => DM_URI, :child_key => [:url_id]
  has n, :ripped_files
  belongs_to :dataset_file_collection

  # Find or create the collection for +url+, passing
  # +dataset_file_collection+ as the second attribute hash of
  # +find_or_create+.  +url+ is first resolved to a DM_URI record.
  def self.find_or_create_from_url(url, dataset_file_collection)
    url = DM_URI.find_or_create_from_url(url)
    find_or_create(
      { :url_id => url.id },
      { :dataset_file_collection => dataset_file_collection })
  end

  # Path of the text file that holds the `find` listing for this
  # collection's URL.
  def listing_filename
    path_to(:rawd, "listing-#{url.as_flat_filename}.txt")
  end

  # Write the `find` listing of the ripped directory unless the listing
  # file already exists.
  #
  # NOTE(review): the paths are interpolated into a shell command
  # unescaped -- confirm they never contain spaces or shell
  # metacharacters.
  def make_listing_file
    return if File.exist?(listing_filename)

    FileUtils.cd path_to(:ripd_root) do
      `find #{url.as_path} > #{listing_filename}`
    end
  end

  # Mon Aug 11 08:59:00 -0500 2008 files: 0
  # Mon Aug 11 09:05:34 -0500 2008 files: 100000 => so, 1M files/hr. not good.
  #
  # Create (or find) a RippedFile for every line of the listing, then
  # save this collection.
  def index_from_listing
    make_listing_file
    self.ripped_files
    # Characters to drop from each listed path: the URL path plus the
    # separator that follows it.
    prefix_length = 1 + url.as_path.length
    FileUtils.cd path_to(:ripd_root) do
      File.foreach(listing_filename) do |full_path|
        track_count :files
        full_path.chomp!
        ripd_path = full_path[prefix_length..-1]
        next if ripd_path.blank? # e.g. the line for the top directory itself

        RippedFile.from_file(self, full_path, ripd_path)
      end
    end
    self.save
  end
end
60
+
61
+ #
62
+ # Index the raw files retrieved from website
63
+ #
64
# DataMapper resource indexing one raw file within a
# RippedFileCollection.  +ripped_file_collection_id+ and +ripd_path+
# share the unique index named :ripd_path.
class RippedFile
  include DataMapper::Resource
  property :id,                        Integer, :serial => true
  property :ripped_file_collection_id, Integer, :unique_index => :ripd_path
  property :ripd_path,                 String,  :length => 255, :nullable => false, :unique_index => :ripd_path
  property :retrieval_date,            DateTime
  property :compressed_size,           Integer
  belongs_to :ripped_file_collection

  # Find or create the record for +ripd_path+, reading the retrieval
  # date (mtime) and size from the file at +full_path+.  Returns the
  # record.
  #
  # NOTE(review): the search uses +ripd_path+ alone although the unique
  # index also covers the collection id -- confirm the same path cannot
  # occur in two collections.
  def self.from_file(clxn, full_path, ripd_path)
    find_or_create({ :ripd_path => ripd_path }, {
      :ripped_file_collection => clxn,
      :retrieval_date         => File.mtime(full_path),
      :compressed_size        => File.size(full_path),
    })
  end

end
85
+
86
+ # SELECT r.*, u.host, u.path FROM ripped_files r
87
+ # LEFT JOIN ripped_file_collections rfs ON r.ripped_file_collection_id = rfs.id
88
+ # LEFT JOIN dm_uris u ON rfs.url_id = u.id
@@ -0,0 +1,71 @@
1
+ #!/usr/bin/env ruby
2
+ require 'imw/utils'; include IMW
3
+ require 'imw/dataset/file_collection'
4
+ require 'tempfile'
5
+
6
# Fixed path of the scratch file that holds the `find` output to be bulk
# loaded into mysql.
def bulk_listing_filename
  '/tmp/listing_foo.txt'
end
7
# Name of the mysql table the listing is loaded into.
def table_name
  'ripped_files'
end
8
+
9
# Run the SQL in +cmd+ through the `mysql` command-line client and print
# the client's (timed) output.
#
# +db_params+ is a Hash read for :username, :password, :hostname and
# :dbname.  The SQL is handed over in a temporary file, which is removed
# again once the client has finished (or failed).
#
# NOTE(review): the connection values are interpolated into the shell
# command unescaped and the password ends up on the command line.
def run_mysql_cmd(db_params, cmd)
  username, password, hostname, dbname = db_params.values_at(:username, :password, :hostname, :dbname)
  query_file = Tempfile.new("qlstg")
  begin
    query_file.puts cmd
    query_file.close # flush to disk; the path stays valid until unlinked
    puts `time mysql -E -u#{username} -p#{password} -h#{hostname} #{dbname} < #{query_file.path}`
  ensure
    query_file.close!
  end
end
16
+
17
# Bulk load the comma-separated listing file into the ripped-files table
# with a single LOAD DATA statement; rows that collide with existing ones
# replace them.
#
# +db_params+ is passed on to +run_mysql_cmd+; +ripd_base+ is only used
# in the progress message.
def bulk_load_mysql(db_params, ripd_base)
  announce "Calling mysql to bulk load #{ripd_base} (expect ~2s per 100k files)"
  run_mysql_cmd db_params, %Q{
    LOAD DATA LOCAL INFILE '#{bulk_listing_filename}'
      REPLACE INTO TABLE `#{table_name}`
      FIELDS TERMINATED BY ','
      LINES TERMINATED BY '\n'
      (`ripped_file_collection_id`, `ripd_path`, `retrieval_date`, `compressed_size`)
      ;
  }
end
28
+
29
# Empty the ripped-files table.
#
# +db_params+ is the connection Hash +run_mysql_cmd+ needs.  The original
# definition took no arguments and called +run_mysql_cmd+ with the SQL
# alone, which raised ArgumentError on every call.
def clear_table(db_params)
  run_mysql_cmd db_params, "TRUNCATE #{table_name}"
end
32
+
33
class RippedFileCollection
  # Index this collection's ripped directory with `find` and bulk load
  # the result into mysql.
  #
  # +db_params+ is handed to +bulk_load_mysql+; +extra_find_args+ is
  # spliced verbatim into the `find` command line.  Each listing line is
  # "<collection id>,<relative path>,<mtime>,<size>".
  #
  # NOTE(review): the `find` step is skipped whenever the shared listing
  # file already exists, even if another collection wrote it -- confirm
  # callers remove stale listings.
  def bulk_load_listing(db_params, extra_find_args = "")
    announce "Indexing #{url.as_path} (expect ~10s per 100k files)"
    unless File.exist?(bulk_listing_filename)
      FileUtils.cd path_to(:ripd_root) do
        find_fmt = "#{self.id},%P,%TY-%Tm-%Td %TH:%TM:%TS,%s\n"
        find_cmd = "find #{url.as_path} #{extra_find_args} -printf '#{find_fmt}' > #{bulk_listing_filename}"
        puts `time #{find_cmd}`
      end
    end
    bulk_load_mysql db_params, url.as_path
  end
end
44
+
45
+
46
+ # SELECT rf_yrs.*, dfc.*, url.scheme, url.host, url.path
47
+ # FROM (
48
+ # SELECT SUBSTR(ripd_path,1,4) AS yr, COUNT(*), r.*
49
+ # FROM ripped_files r
50
+ # GROUP BY ripped_file_collection_id, yr
51
+ # ORDER BY ripped_file_collection_id, yr
52
+ # ) rf_yrs
53
+ # LEFT JOIN ripped_file_collections rfc ON rfc.id = rf_yrs.ripped_file_collection_id
54
+ # LEFT JOIN dataset_file_collections dfc ON dfc.id = rfc.dataset_file_collection_id
55
+ # LEFT JOIN dm_uris url ON url.id = rfc.url_id
56
+
57
# Index the ripped NCDC weather files (daily, hourly, hourly-lite) into
# the imw_weather_ncdc database.
db_params = IMW::DEFAULT_DATABASE_CONNECTION_PARAMS.merge({ :dbname => 'imw_weather_ncdc' })
# NOTE(review): relies on IMW::Dataset responding to
# setup_remote_connection -- confirm it is defined there.
IMW::Dataset.setup_remote_connection db_params

# [category, URL of the rip, extra arguments for `find`]
[
  # Daily
  ['weather/ncdc/daily',       'ftp://ftp.ncdc.noaa.gov/pub/data/gsod',          ''],
  # Hourly -- the isd-lite subtree is indexed separately below
  ['weather/ncdc/hourly',      'ftp://ftp.ncdc.noaa.gov/pub/data/noaa',          '\\! \\( -iname "isd-lite" -prune \\) '],
  # Hourly-lite
  ['weather/ncdc/hourly_lite', 'ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-lite', ''],
].each do |category, url, extra_find_args|
  dset_clxn = DatasetFileCollection.find_or_create({ :category => category })
  rf_clxn   = RippedFileCollection.find_or_create_from_url url, dset_clxn
  # All collections share one listing file and bulk_load_listing reuses
  # it when present, so drop any earlier listing first; otherwise the
  # previous collection's rows would be loaded again.
  File.delete(bulk_listing_filename) if File.exist?(bulk_listing_filename)
  rf_clxn.bulk_load_listing db_params, extra_find_args
end
@@ -0,0 +1,132 @@
1
+ #
2
+ # h2. lib/imw/dataset/scaffold.rb -- scaffold the directory structure for a dataset
3
+ #
4
+ # == About
5
+ #
6
+ # Defines workflow tasks for datasets to create directories and
7
+ # symlinks to ease the processing of a dataset.
8
+ #
9
+ # Right now this file contains code written by Flip as well as code
10
+ # written by Dhruv which accomplish basically the same task. Dhruv's
11
+ # code integrates with <tt>IMW::Dataset</tt> and Rake and should be
12
+ # used preferentially.
13
+ #
14
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
15
+ # Copyright:: Copyright (c) 2008 infochimps.org
16
+ # License:: GPL 3.0
17
+ # Website:: http://infinitemonkeywrench.org/
18
+ #
19
+ # puts "#{File.basename(__FILE__)}: POST NO BILLS. Is that funny to anyone but me? No?" # at bottom
20
+
21
+ require 'rake'
22
+ require 'fileutils'
23
+
24
+ require 'imw/utils'
25
+ require 'imw/dataset/task'
26
+
27
+ include FileUtils
28
+
29
module IMW
  include FileUtils

  ################################################################
  ## FLIP'S CODE
  ################################################################

  # Create the directory that +path_to(:me)+ resolves to.
  def scaffold_script_dirs
    mkdir_p path_to(:me)
  end

  #
  # * creates a directory for the dataset in each of the top-level hierarchies
  #   (as given in ~/.imwrc)
  # * links to that directory within the working directory
  #   in directory pool/foo/bar/baz we'd find
  #     rawd => /data/rawd/foo/bar/baz
  #
  # Segments whose working-directory path already exists are left alone.
  #
  def scaffold_dset_dirs
    [:rawd, :temp, :fixd, :log].each do |seg|
      next if File.exist?(path_to(seg))

      seg_dir = path_to(pathseg_root(seg), :dset)
      mkdir_p seg_dir
      ln_s seg_dir, path_to(seg)
    end
  end

  #
  # * creates a symlink within the working directory to the
  #   ripped directory, named after its url
  #
  # Does nothing when +path_to(:ripd)+ already exists.  (The original
  # guard tested an undefined +seg+ and raised NameError on every call.)
  #
  def scaffold_rip_dir(url)
    return if File.exist?(path_to(:ripd))

    ripd_dir = path_to(:ripd_root, url)
    mkdir_p ripd_dir
    ln_s ripd_dir, path_to(:ripd)
  end

  # Scaffold both the script directory and the dataset directories.
  def scaffold_dset
    scaffold_script_dirs
    scaffold_dset_dirs
  end

  ################################################################
  ## DHRUV's CODE -- uses IMW::Dataset and Rake
  ################################################################
  module Workflow

    # Creates a workflow task <tt>:create_directories</tt> to create
    # the directory structure for this dataset: one directory per
    # workflow step, skipping those that already exist.
    def create_directories_task
      @last_description = "Creates directories for this dataset in the peel through package steps."
      define_task(IMW::Task, {:create_directories => []}) do
        [:peel, :munge, :fix, :package].each do |step|
          FileUtils.mkdir_p(path_to(step)) unless File.exist?(path_to(step))
        end
      end
    end

    # Creates a workflow task <tt>:create_symlinks</tt> to link each
    # workflow step's directory, and the ripd root as "ripd", into the
    # directory containing this dataset's scripts.  Existing links are
    # left alone.  Depends on <tt>:create_directories</tt>.
    def create_symlinks_task
      @last_description = "Creates symlinks pointing from the directory containing scripts for this dataset to the directories for the peel through package steps."
      define_task(IMW::Task, {:create_symlinks => [:create_directories]}) do
        [:peel, :munge, :fix, :package].each do |step|
          symlink = File.join(path_to(:script), IMW::Dataset::WORKFLOW_STEP_DIRS[step].to_s)
          FileUtils.ln_s(path_to(step), symlink) unless File.exist?(symlink)
        end
        symlink = File.join(path_to(:script), "ripd")
        FileUtils.ln_s(path_to(:ripd_root), symlink) unless File.exist?(symlink)
      end
    end

    # Creates a task <tt>:initialize</tt> which does nothing but
    # depends upon all the tasks required to initialize the dataset.
    def create_initialize_task
      @last_description = "Set everything up to begin processing the dataset."
      define_task(IMW::Task, :initialize => [:create_directories, :create_symlinks])
    end

    # Creates a task <tt>:delete_data</tt> which removes all data for
    # this dataset from the data directories of the workflow steps.
    def create_delete_data_task
      @last_description = "Deletes all data and directories for this dataset for the peel through package steps."
      define_task(IMW::Task, {:delete_data => []}) do
        [:peel, :munge, :fix, :package].each do |step|
          FileUtils.remove_dir(path_to(step)) if File.exist?(path_to(step))
        end
      end
    end

    # Creates a task <tt>:destroy</tt> which does nothing but depends
    # upon all the tasks required to delete the dataset's data and
    # remove its footprint from IMW.
    def create_destroy_task
      @last_description = "Get rid of all traces of this dataset."
      define_task(IMW::Task, :destroy => [:delete_data])
    end

  end
end