imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
data/README.rdoc CHANGED
@@ -38,7 +38,7 @@ right one to use. IMW is **not** designed for
38
38
 
39
39
  IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
40
40
 
41
- You'll have to set up Gemcutter
41
+ You'll have to set up Gemcutter if you haven't already
42
42
 
43
43
  $ sudo gem install gemcutter
44
44
  $ gem tumble
@@ -47,55 +47,218 @@ and then install IMW
47
47
 
48
48
  $ sudo gem install imw
49
49
 
50
- = Using IMW
50
+ = IMW Basics
51
51
 
52
52
  The central goal of IMW is to make the workflow involved in processing a
53
53
  dataset from a raw source to a finished product as simple as possible.
54
54
 
55
- So consider that there exist two datasets that I want to combine. The
56
- first details the historical price of bananas over the past century
57
- and the second
55
+ To help achieve this goal, IMW creates lots of convenient structures
56
+ and methods. The following sections provide a tour of these.
58
57
 
59
- == Working with paths and files
58
+ It is assumed that you've installed IMW and required it in a script
59
+ via
60
60
 
61
61
  require 'rubygems'
62
62
  require 'imw'
63
63
 
64
+ == Paths
65
+
64
66
  IMW holds a registry of paths that you can define on the fly or store
65
67
  in a configuration file.
66
68
 
67
- IMW.add_path :dropbox, "/var/www/public/dropbox"
68
- IMW.add_path :raw, "/mnt/data/raw"
69
- IMW.add_path :
69
+ IMW.add_path(:dropbox, "/var/www/public/dropbox")
70
+ IMW.path_to(:dropbox) #=> "/var/www/public/dropbox"
71
+
72
+ You can combine paths together dynamically.
70
73
 
71
- This makes it easeir
74
+ IMW.add_path(:raw, "/data/raw")
75
+ IMW.path_to(:raw, "my/dataset") #=> "/data/raw/my/dataset"
76
+ IMW.add_path(:rejects, :raw, "rejects")
77
+ IMW.path_to(:rejects) #=> "/data/raw/rejects"
72
78
 
73
- IMW.path_to :raw, "one/particular/dataset"
74
- #=> "/mnt/data/raw/one/particular/dataset"
79
+ Altering one path will update others
75
80
 
76
- IMW makes it easy to manipulate compressed files and archives.
81
+ IMW.add_path(:raw, "/data2/raw")
82
+ IMW.path_to(:rejects) #=> "/data2/raw/rejects", not "/data/raw/rejects"
77
83
 
84
+ == Files & Directories
78
85
 
79
- # Move a collection of files from a public dropbox to a processing directory
86
+ Use IMW.open to open files. The object returned by IMW.open obeys the
87
+ usual semantics of a File object but it has new methods to manipulate
88
+ and parse the file.
80
89
 
81
- raw
90
+ f1 = IMW.open("/path/to/file")
91
+ f1.read() # does what you think
82
92
 
83
- Dir["/public/*"].each do |path|
84
- file = IMW.open(path)
85
- case
86
- when file.compressed?
87
- file.decompress.mv_to_dir "/raw"
88
- when file.archive?
89
- FileUtils.cd("/raw") do
90
- file.extract
91
- end
92
- else
93
- file.mv_to_dir("/raw")
94
- end
93
+ # class methods from File are available
94
+ f1.size
95
+ f1.writeable?
96
+
97
+ # use a bang or a 'w' to write
98
+ writable_file = IMW.open!('/some/path') # similar to open('/some/path', 'w')
99
+
100
+ # as well as methods to manipulate the file on the filesystem
101
+ f2 = f1.cp("/new/path/to/file") # also try cp_to_dir
102
+ f1.exist? # true
103
+ f3 = f1.mv("/yet/another/path") # also try mv_to_dir
104
+ f1.exist? # false
105
+
106
+ IMW also knows about directories
107
+
108
+ d = IMW.open('/tmp')
109
+ d.directory? # true
110
+ d['*'] # Dir['/tmp/*']
111
+ d.mv('/parent/dir')
112
+
113
+ == Remote Files
114
+
115
+ Many operations defined for files are also defined for arbitrary URIs
116
+ through the <tt>open-uri</tt> library.
117
+
118
+ Files can readily be opened, read, and downloaded from the Internet
119
+
120
+ site = IMW.open('http://infochimps.org') #=> Recognized as an HTML document
121
+ site.read() # does what you think
122
+ site.cp('/some/local/path')
123
+ site.exist? # will work in many cases
124
+
125
+ (writing to remote sources isn't enabled yet).
126
+
127
+ == Archives & Compressed Files
128
+
129
+ IMW works with a variety of archiving and compression programs (see
130
+ IMW::EXTERNAL_PROGRAMS) to make packaging/unpackaging data easy.
131
+
132
+ bz2 = IMW.open('/path/to/big_file.bz2')
133
+ zip = IMW.open('/path/to/archive.zip')
134
+ targz = IMW.open('/path/to/archive.tar.gz')
135
+
136
+ # IMW recognizes files by extension
137
+ bz2.archive? # false
138
+ bz2.compressed? # true
139
+ zip.archive? # true
140
+ zip.compressed? # false
141
+ targz.archive? # true
142
+ targz.compressed? # true
143
+
144
+ # decompress or compress files
145
+ big_file = bz2.decompress! # skip the ! to preserve the original
146
+ new_bz2 = big_file.compress!
147
+
148
+ # extract and package archives
149
+ zip.extract # files show up in working directory
150
+ targz.extract # no need to decompress first
151
+ new_tarbz2 = IMW.open!('/new/archive.tar').create(['/path1', '/path/2']).compress!
152
+
153
+ == Data Formats
154
+
155
+ IMW encourages you to work with data as Ruby objects as much as
156
+ possible by providing methods to parse common data formats directly
157
+ into Ruby.
158
+
159
+ The actual parsing is always handled by a separate library appropriate
160
+ for the data format so it will be fast and, if you're familiar with
161
+ the library, you can use many functions of the library directly on the
162
+ object returned by IMW.open.
163
+
164
+ IMW uses classes (defined in IMW::Files) to interface with each data
165
+ type. The choice of class is determined by the extension of the path
166
+ supplied to IMW.open.
167
+
168
+ IMW.open('file.csv') #=> IMW::Files::Csv
169
+ IMW.open('file.xml') #=> IMW::Files::Xml
170
+ IMW.open('file.html') #=> IMW::Files::Html
171
+
172
+ # default choice will be a text file
173
+ IMW.open('strange_filename.wuzz') #=> IMW::Files::Text
174
+
175
+ # but you can force a particular choice
176
+ IMW.open('strange_filename.wuzz', :as => :csv) #=> IMW::Files::Csv
177
+
178
+ Some formats are extremely regular (CSV's, JSON, YAML, &c.) and can
179
+ immediately be converted to simple Ruby objects. Other formats (flat
180
+ files, HTML, XML, &c.) require parsing before they can be
181
+ unambiguously converted to Ruby objects.
182
+
183
+ As an example, consider flat, delimited files. They are extremely
184
+ regular and IMW uses FasterCSV to automatically parse them into nested
185
+ arrays, the only sensible and unambiguous Ruby representation of their
186
+ data:
187
+
188
+ delimit1 = IMW.open('/path/to/csv') # IMW::Files::Csv
189
+ delimit1.entries #=> array of arrays of entries
190
+ delimit1.each do |row|
191
+ # passes in parsed rows
192
+ ...
95
193
  end
194
+
195
+ # if there's a funny delimiter, it can be passed as an option (in
196
+ # this case identical to what would be passed to FasterCSV under the
197
+ # hood)
198
+ delimit2 = IMW.open('/path/to/file.csv', :col_sep => " ")
96
199
 
97
-
98
-
99
-
100
-
200
+ HTML files, on the other hand, are more complex and typically have to
201
+ be parsed before being converted to plain Ruby objects:
202
+
203
+ # Grab a tiny link from the bottom of Google's homepage
204
+ doc = IMW.open('http://www.google.com') # IMW::Files::Html
205
+ doc.parse('p a') # 'Privacy'
206
+
207
+ More complex parsers can also be built
208
+
209
+ # Grab each row from an HTML table
210
+ doc = IMW.open('/path/to/data.html')
211
+ doc.parse :employees => ["tr", { :name => "td.name", :address => "td.address" } ]
212
+ #=> [{:name => "John Chimpo", :address => "123 Fake St."}, {...}, ... ]
213
+
214
+ see IMW::Parsers::HtmlParser for details on parsing HTML (and similar)
215
+ files. Examine the other parsers in IMW::Parsers for details on
216
+ parsing other data formats.
217
+
218
+ = The IMW Workflow
219
+
220
+ The workflow of IMW can be roughly summarized as follows:
221
+
222
+ rip::
223
+
224
+ Data is obtained from a source. IMW allows you to download data
225
+ from the web, obtain it by querying databases, or use other services
226
+ like rsync, ftp, &c. to pull it in from another computer.
227
+
228
+ extract::
229
+
230
+ Ripped data is often compressed or otherwise archived and needs to
231
+ be extracted. It may also be sliced in many ways (excluding certain
232
+ years, say) to reduce the volume to only what is required.
233
+
234
+ parse::
235
+
236
+ Data is parsed into Ruby objects and stored.
237
+
238
+ munge::
239
+
240
+ All the parsed data is combined, reconciled, and further processed
241
+ into a final form.
242
+
243
+ package::
244
+
245
+ The data is archived and compressed as necessary and moved to an
246
+ outbox, staging server, S3 bucket, &c.
247
+
248
+ Not all datasets
249
+
250
+
251
+ == Datasets
252
+
253
+ == Tasks & Dependencies
254
+
255
+ == Directory Structure
256
+
257
+ == Records
258
+
259
+ = IMW on the Command Line
260
+
261
+ == Repositories
262
+
263
+ == Running Tasks
101
264
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.1.1
data/bin/imw ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+ $:.unshift File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'imw/runner'
4
+ exit IMW::Runner.new(*ARGV).run!
5
+
data/lib/imw/boot.rb CHANGED
@@ -1,18 +1,3 @@
1
- #
2
- # h2. lib/imw/boot.rb -- startup functions
3
- #
4
- # == About
5
- #
6
- # This file contains code necessary to boot the Infinite Monkeywrench
7
- # at a particular site.
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: You heft up your Infinite Monkeywrench for the first time and marvel at how something so powerful could be made so wondrous light!"
15
-
16
1
  module IMW
17
2
  module Config
18
3
 
@@ -0,0 +1,38 @@
1
+ module IMW
2
+
3
+ class Dataset
4
+ include IMW::Paths
5
+
6
+ # A dataset keeps track of its own collection of paths just like
7
+ # IMW itself. When an IMW::Dataset is instantiated in a script,
8
+ # that script's directory becomes the dataset's +self+ path and
9
+ # the default workflow directories (see IMW::Workflow) are created
10
+ # within this directory.
11
+ #
12
+ # You can change a dataset's paths the same way you can change
13
+ # IMW's paths: by calling +add_path+ and +remove_path+ on the
14
+ # dataset.
15
+ #
16
+ # To customize this behavior for all future datasets, create a
17
+ # subclass of IMW::Dataset and override the +set_paths+ method.
18
+ def paths
19
+ @paths
20
+ end
21
+
22
+ protected
23
+ # Sets the roots of various paths relative to this dataset.
24
+ def set_root_paths
25
+ @paths = {}
26
+ add_path :script, File.expand_path(eval('__FILE__'))
27
+ add_path :self, File.dirname(path_to(:script))
28
+ IMW::Workflow::DIRS.each do |dir|
29
+ add_path dir, :self, dir.to_s
30
+ end
31
+ end
32
+
33
+ # Overwrite this method to set additional paths for the dataset.
34
+ def set_paths
35
+ end
36
+ end
37
+
38
+ end
@@ -1,25 +1,10 @@
1
- #
2
- # h2. lib/imw/workflow/task.rb --
3
- #
4
- # == About
5
- #
6
- # This file defines a class <tt>IMW::Task</tt> which subclasses
7
- # <tt>Rake::Task</tt>. Tasks defined in IMW should be instances of
8
- # <tt>IMW::Task</tt>.
9
- #
10
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
- # Copyright:: Copyright (c) 2008 infochimps.org
12
- # License:: GPL 3.0
13
- # Website:: http://infinitemonkeywrench.org/
14
- #
15
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
16
-
17
1
  require 'rake'
18
2
 
19
3
  module IMW
20
4
 
21
- class Task < Rake::Task
22
- end
5
+ Task = Class.new(Rake::Task)
6
+ FileTask = Class.new(Rake::FileTask)
7
+ FileCreationTask = Class.new(Rake::FileCreationTask)
23
8
 
24
9
  class Dataset
25
10
  include Rake::TaskManager
@@ -31,6 +16,24 @@ module IMW
31
16
  self.define_task IMW::Task, name, &block
32
17
  end
33
18
 
19
+ # Return a new (or existing) <tt>IMW::FileTask</tt> with the given
20
+ # +name+. Dependencies can be declared and a block passed in just
21
+ # as in Rake.
22
+ def file name, &block
23
+ self.define_task IMW::FileTask, name, &block
24
+ end
25
+
26
+ # Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
27
+ # +name+. Dependencies can be declared and a block passed in just
28
+ # as in Rake.
29
+ def file_create name, &block
30
+ self.define_task IMW::FileCreationTask, name, &block
31
+ end
32
+
33
+ # Override this method to define default tasks for a subclass of
34
+ # IMW::Dataset.
35
+ def set_tasks
36
+ end
34
37
  end
35
38
  end
36
39
 
@@ -1,81 +1,142 @@
1
- #
2
- # lib/imw/workflow.rb -- implements the workflow class
3
- #
4
- # == About
5
- #
6
- # This file implements the <tt>IMW::Workflow</tt> class which tailors
7
- # the functionality of Rake for IMW objects.
8
- #
9
- # Author:: Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
-
15
- require 'imw/dataset/scaffold'
16
1
  require 'imw/dataset/task'
2
+ require 'ostruct'
17
3
 
18
4
  module IMW
19
5
 
20
- # The <tt>IMW::Workflow</tt> module is a collection of methods which
21
- # define Rake[http://rake.rubyforge.org/] tasks specialized for each
22
- # dataset.
6
+ # IMW encourages you to view a data transformation as a network of
7
+ # dependencies. By default, IMW defines five main steps:
8
+ #
9
+ # rip::
10
+ # Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c.
11
+ #
12
+ # extract::
13
+ # Extract data from its ripped form to a form which can be
14
+ # parsed.
15
+ #
16
+ # parse::
17
+ # Parse data into a structured form.
18
+ #
19
+ # munge::
20
+ # Combine, filter, reconcile, and transform already structured
21
+ # data into a desired form.
22
+ #
23
+ # package::
24
+ # Archive, compress, and deliver data in its final form to some
25
+ # location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.).
26
+ #
27
+ # Each step depends upon the one before it. The steps are blank by
28
+ # default so there's no need to write code for steps you don't need
29
+ # to use.
30
+ #
31
+ # Each step corresponds to a named directory in IMW::Workflow::DIRS.
23
32
  module Workflow
24
33
 
25
- # The functions called here define the default tasks associated
26
- # with each dataset.
27
- def create_default_tasks
28
- create_directories_task
29
- create_symlinks_task
30
- create_initialize_task
31
- create_delete_data_task
32
- create_destroy_task
33
- create_workflow_tasks
34
- end
34
+ # The <tt>Rake::TaskManager</tt> module allows the
35
+ # <tt>IMW::Dataset</tt> class to leverage the functionality of the
36
+ # Rake[http://rake.rubyforge.org/] library to manage tasks
37
+ # associated with the processing of this dataset.
38
+ include Rake::TaskManager
35
39
 
36
- # Sets the default tasks in this workflow.
37
- #
38
- # The default tasks constitute a set of consecutive actions that
39
- # must be taken in order: <tt>:rip</tt>, <tt>parse</tt>,
40
- # <tt>munge</tt>, <tt>fix</tt>, and <tt>package</tt>. Each task
41
- # is a <tt>Rake::Task</tt> which depends on the one before it.
40
+ # Default options passed to <tt>Rake</tt>. Any class including
41
+ # the <tt>Rake::TaskManager</tt> module must define a constant by
42
+ # this name.
43
+ DEFAULT_OPTIONS = {
44
+ :dry_run => false,
45
+ :trace => false,
46
+ :verbose => false
47
+ }
48
+
49
+ # The standard IMW workflow steps.
50
+ STEPS = [:rip, :extract, :parse, :munge, :package]
51
+
52
+ # The steps of the IMW workflow each correspond to a directory in
53
+ # which it is customary that they deposit their files <em>once
54
+ # they are finished processing</em> (so ripped files wind up in
55
+ # the +ripd+ directory, packaged files in the +pkgd+ directory,
56
+ # and so on).
57
+ DIRS = [:ripd, :xtrd, :prsd, :mungd, :pkgd ]
58
+
59
+ # Each workflow step can be configured to take default actions,
60
+ # each action being a proc in the array for the step in this hash.
42
61
  #
43
- # Each task does nothing by default other than create directories
44
- # to hold files for this dataset as it undergoes the workflow.
45
- def set_default_tasks
46
- define_task(Rake::Task, {:rip => []})
47
- define_task(Rake::Task, {:parse => :rip})
48
- define_task(Rake::Task, {:munge => :parse})
49
- define_task(Rake::Task, {:fix => :munge})
50
- define_task(Rake::Task, {:package => :fix})
51
- comment_default_tasks
62
+ # This allows classes which include IMW::Workflow to use class
63
+ # methods named after each step (+rip+, +parse+, &c.) to directly
64
+ # define tasks.
65
+ STEPS_TASKS = returning({}) do |steps_procs|
66
+ STEPS.each do |step|
67
+ steps_procs[step] = []
68
+ end
52
69
  end
53
70
 
54
- # Set the initial comments for each of the default tasks.
55
- def comment_default_tasks
56
- self[:rip].comment = "Rip dataset from an origin"
57
- self[:parse].comment = "Parse dataset into intermediate form"
58
- self[:munge].comment = "Munge dataset's structure into desired form"
59
- self[:fix].comment = "Fix and format dataset"
60
- self[:package].comment = "Package dataset into a final format"
71
+ protected
72
+ def self.included klass
73
+ STEPS.each do |step|
74
+ klass.class_eval <<EOF
75
+ def self.#{step}(deps=nil, &block)
76
+ STEPS_TASKS[:#{step}] << [deps, block]
77
+ end
78
+ EOF
79
+ end
80
+
81
+
61
82
  end
62
83
 
63
- # Creates the task dependency chain <tt>:package => :fix => :munge
64
- # => :peel => :rip => :initialize</tt>.
65
- def create_workflow_tasks
66
- @last_description = "Obtain data from some source."
67
- define_task(IMW::Task, :rip => [:initialize])
68
- @last_description = "Extract datafiles from ripped data."
69
- define_task(IMW::Task, :peel => [:rip])
70
- @last_description = "Transform records in a dataset."
71
- define_task(IMW::Task, :munge => [:peel])
72
- @last_description = "Reconcile records."
73
- define_task(IMW::Task, :fix => [:munge])
74
- @last_description = "Package dataset in final form."
75
- define_task(IMW::Task, :package => [:fix])
84
+ def define_workflow_task deps, comment
85
+ @last_description = comment
86
+ define_task(IMW::Task, deps)
87
+ step = deps.respond_to?(:keys) ? deps.keys.first : deps
88
+ STEPS_TASKS[step].each do |deps, block|
89
+ self[step].enhance(deps) do
90
+ self.instance_eval(&block)
91
+ end
92
+ end
93
+ end
94
+
95
+ # Create all the instance variables required by Rake::TaskManager
96
+ # and define default tasks for this dataset.
97
+ def initialize_workflow
98
+ @tasks = Hash.new
99
+ @rules = Array.new
100
+ @scope = Array.new
101
+ @last_description = nil
102
+ @options = OpenStruct.new(DEFAULT_OPTIONS)
103
+ define_create_directories_task
104
+ define_workflow_tasks
105
+ define_destroy_task
106
+ end
107
+
108
+ # Creates a task <tt>:create_directories</tt> to create the
109
+ # directory structure for this dataset.
110
+ def define_create_directories_task
111
+ @last_description = "Creates workflow directories for this dataset."
112
+ define_task(IMW::Task, {:create_directories => []}) do
113
+ DIRS.each do |dir|
114
+ FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
115
+ end
116
+ end
117
+ end
118
+
119
+ # Creates a task <tt>:destroy</tt> which removes dataset's
120
+ # workflow directories.
121
+ def define_destroy_task
122
+ @last_description = "Get rid of all traces of this dataset."
123
+ define_task(IMW::Task, :destroy => [:create_directories]) do
124
+ DIRS.each do |dir|
125
+ FileUtils.rm_rf(path_to(dir))
126
+ end
127
+ end
128
+ end
129
+
130
+ # Creates the task dependency chain <tt>:package => :munge =>
131
+ # :parse => :extract => :rip => :create_directories</tt> of the
132
+ # IMW::Workflow.
133
+ def define_workflow_tasks
134
+ define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
135
+ define_workflow_task({:extract => [:rip]}, "Extract data so it's ready to parse." )
136
+ define_workflow_task({:parse => [:extract]}, "Parse data into a structured form." )
137
+ define_workflow_task({:munge => [:parse]}, "Munge structured data into desired form.")
138
+ define_workflow_task({:package => [:munge]}, "Package dataset in final form." )
76
139
  end
77
140
 
78
141
  end
79
142
  end
80
-
81
- # puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree. Ahhhh."