imw 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
data/README.rdoc CHANGED
@@ -38,7 +38,7 @@ right one to use. IMW is **not** designed for
38
38
 
39
39
  IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
40
40
 
41
- You'll have to set up Gemcutter
41
+ You'll have to set up Gemcutter if you haven't already
42
42
 
43
43
  $ sudo gem install gemcutter
44
44
  $ gem tumble
@@ -47,55 +47,218 @@ and then install IMW
47
47
 
48
48
  $ sudo gem install imw
49
49
 
50
- = Using IMW
50
+ = IMW Basics
51
51
 
52
52
  The central goal of IMW is to make workflow involved in processing a
53
53
  dataset from a raw source to a finished product as simple as possible.
54
54
 
55
- So consider that there exist two datasets that I want to combine. The
56
- first details the historical price of bananas over the past century
57
- and the second
55
+ To help achieve this goal, IMW creates lots of convenient structures
56
+ and methods. The following sections provide a tour of these.
58
57
 
59
- == Working with paths and files
58
+ It is assumed that you've installed IMW and required it in a script
59
+ via
60
60
 
61
61
  require 'rubygems'
62
62
  require 'imw'
63
63
 
64
+ == Paths
65
+
64
66
  IMW holds a registry of paths that you can define on the fly or store
65
67
  in a configuration file.
66
68
 
67
- IMW.add_path :dropbox, "/var/www/public/dropbox"
68
- IMW.add_path :raw, "/mnt/data/raw"
69
- IMW.add_path :
69
+ IMW.add_path(:dropbox, "/var/www/public/dropbox")
70
+ IMW.path_to(:dropbox) #=> "/var/www/public/dropbox"
71
+
72
+ You can combine paths together dynamically.
70
73
 
71
- This makes it easeir
74
+ IMW.add_path(:raw, "/data/raw")
75
+ IMW.path_to(:raw, "my/dataset") #=> "/data/raw/my/dataset"
76
+ IMW.add_path(:rejects, :raw, "rejects")
77
+ IMW.path_to(:rejects) #=> "/data/raw/rejects"
72
78
 
73
- IMW.path_to :raw, "one/particular/dataset"
74
- #=> "/mnt/data/raw/one/particular/dataset"
79
+ Altering one path will update others
75
80
 
76
- IMW makes it easy to manipulate compressed files and archives.
81
+ IMW.add_path(:raw, "/data2/raw")
82
+ IMW.path_to(:rejects) #=> "/data2/raw/rejects", not "/data/raw/rejects"
77
83
 
84
+ == Files & Directories
78
85
 
79
- # Move a collection of files from a public dropbox to a processing directory
86
+ Use IMW.open to open files. The object returned by IMW.open obeys the
87
+ usual semantics of a File object but it has new methods to manipulate
88
+ and parse the file.
80
89
 
81
- raw
90
+ f1 = IMW.open("/path/to/file")
91
+ f1.read() # does what you think
82
92
 
83
- Dir["/public/*"].each do |path|
84
- file = IMW.open(path)
85
- case
86
- when file.compressed?
87
- file.decompress.mv_to_dir "/raw"
88
- when file.archive?
89
- FileUtils.cd("/raw") do
90
- file.extract
91
- end
92
- else
93
- file.mv_to_dir("/raw")
94
- end
93
+ # class methods from File are available
94
+ f1.size
95
+ f1.writable?
96
+
97
+ # use a bang or a 'w' to write
98
+ writable_file = IMW.open!('/some/path') # similar to open('/some/path', 'w')
99
+
100
+ # as well as methods to manipulate the file on the filesystem
101
+ f2 = f1.cp("/new/path/to/file") # also try cp_to_dir
102
+ f1.exist? # true
103
+ f3 = f1.mv("/yet/another/path") # also try mv_to_dir
104
+ f1.exist? # false
105
+
106
+ IMW also knows about directories
107
+
108
+ d = IMW.open('/tmp')
109
+ d.directory? # true
110
+ d['*'] # Dir['/tmp/*']
111
+ d.mv('/parent/dir')
112
+
113
+ == Remote Files
114
+
115
+ Many operations defined for files are also defined for arbitrary URIs
116
+ through the <tt>open-uri</tt> library.
117
+
118
+ Files can readily be opened, read, and downloaded from the Internet
119
+
120
+ site = IMW.open('http://infochimps.org') #=> Recognized as an HTML document
121
+ site.read() # does what you think
122
+ site.cp('/some/local/path')
123
+ site.exist? # will work in many cases
124
+
125
+ (writing to remote sources isn't enabled yet).
126
+
127
+ == Archives & Compressed Files
128
+
129
+ IMW works with a variety of archiving and compression programs (see
130
+ IMW::EXTERNAL_PROGRAMS) to make packaging/unpackaging data easy.
131
+
132
+ bz2 = IMW.open('/path/to/big_file.bz2')
133
+ zip = IMW.open('/path/to/archive.zip')
134
+ targz = IMW.open('/path/to/archive.tar.gz')
135
+
136
+ # IMW recognizes files by extension
137
+ bz2.archive? # false
138
+ bz2.compressed? # true
139
+ zip.archive? # true
140
+ zip.compressed? # false
141
+ targz.archive? # true
142
+ targz.compressed? # true
143
+
144
+ # decompress or compress files
145
+ big_file = bz2.decompress! # skip the ! to preserve the original
146
+ new_bz2 = big_file.compress!
147
+
148
+ # extract and package archives
149
+ zip.extract # files show up in working directory
150
+ targz.extract # no need to decompress first
151
+ new_tarbz2 = IMW.open!('/new/archive.tar').create(['/path1', '/path/2']).compress!
152
+
153
+ == Data Formats
154
+
155
+ IMW encourages you to work with data as Ruby objects as much as
156
+ possible by providing methods to parse common data formats directly
157
+ into Ruby.
158
+
159
+ The actual parsing is always handled by a separate library appropriate
160
+ for the data format so it will be fast and, if you're familiar with
161
+ the library, you can use many functions of the library directly on the
162
+ object returned by IMW.open.
163
+
164
+ IMW uses classes (defined in IMW::Files) to interface with each data
165
+ type. The choice of class is determined by the extension of the path
166
+ supplied to IMW.open.
167
+
168
+ IMW.open('file.csv') #=> IMW::Files::Csv
169
+ IMW.open('file.xml') #=> IMW::Files::Xml
170
+ IMW.open('file.html') #=> IMW::Files::Html
171
+
172
+ # default choice will be a text file
173
+ IMW.open('strange_filename.wuzz') #=> IMW::Files::Text
174
+
175
+ # but you can force a particular choice
176
+ IMW.open('strange_filename.wuzz', :as => :csv) #=> IMW::Files::Csv
177
+
178
+ Some formats are extremely regular (CSV's, JSON, YAML, &c.) and can
179
+ immediately be converted to simple Ruby objects. Other formats (flat
180
+ files, HTML, XML, &c.) require parsing before they can be
181
+ unambiguously converted to Ruby objects.
182
+
183
+ As an example, consider flat, delimited files. They are extremely
184
+ regular and IMW uses FasterCSV to automatically parse them into nested
185
+ arrays, the only sensible and unambiguous Ruby representation of their
186
+ data:
187
+
188
+ delimit1 = IMW.open('/path/to/csv') # IMW::Files::Csv
189
+ delimit1.entries #=> array of arrays of entries
190
+ delimit1.each do |row|
191
+ # passes in parsed rows
192
+ ...
95
193
  end
194
+
195
+ # if there's a funny delimiter, it can be passed as an option (in
196
+ # this case identical to what would be passed to FasterCSV under the
197
+ # hood)
198
+ delimit2 = IMW.open('/path/to/file.csv', :col_sep => " ")
96
199
 
97
-
98
-
99
-
100
-
200
+ HTML files, on the other hand, are more complex and typically have to
201
+ be parsed before being converted to plain Ruby objects:
202
+
203
+ # Grab a tiny link from the bottom of Google's homepage
204
+ doc = IMW.open('http://www.google.com') # IMW::Files::Html
205
+ doc.parse('p a') # 'Privacy'
206
+
207
+ More complex parsers can also be built
208
+
209
+ # Grab each row from an HTML table
210
+ doc = IMW.open('/path/to/data.html')
211
+ doc.parse :employees => ["tr", { :name => "td.name", :address => "td.address" } ]
212
+ #=> [{:name => "John Chimpo", :address => "123 Fake St."}, {...}, ... ]
213
+
214
+ See IMW::Parsers::HtmlParser for details on parsing HTML (and similar)
215
+ files. Examine the other parsers in IMW::Parsers for details on
216
+ parsing other data formats.
217
+
218
+ = The IMW Workflow
219
+
220
+ The workflow of IMW can be roughly summarized as follows:
221
+
222
+ rip::
223
+
224
+ Data is obtained from a source. IMW allows you to download data
225
+ from the web, obtain it by querying databases, or use other services
226
+ like rsync, ftp, &c. to pull it in from another computer.
227
+
228
+ extract::
229
+
230
+ Ripped data is often compressed or otherwise archived and needs to
231
+ be extracted. It may also be sliced in many ways (excluding certain
232
+ years, say) to reduce the volume to only what is required.
233
+
234
+ parse::
235
+
236
+ Data is parsed into Ruby objects and stored.
237
+
238
+ munge::
239
+
240
+ All the parsed data is combined, reconciled, and further processed
241
+ into a final form.
242
+
243
+ package::
244
+
245
+ The data is archived and compressed as necessary and moved to an
246
+ outbox, staging server, S3 bucket, &c.
247
+
248
+ Not all datasets
249
+
250
+
251
+ == Datasets
252
+
253
+ == Tasks & Dependencies
254
+
255
+ == Directory Structure
256
+
257
+ == Records
258
+
259
+ = IMW on the Command Line
260
+
261
+ == Repositories
262
+
263
+ == Running Tasks
101
264
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.0
1
+ 0.1.1
data/bin/imw ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+ $:.unshift File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
3
+ require 'imw/runner'
4
+ exit IMW::Runner.new(*ARGV).run!
5
+
data/lib/imw/boot.rb CHANGED
@@ -1,18 +1,3 @@
1
- #
2
- # h2. lib/imw/boot.rb -- startup functions
3
- #
4
- # == About
5
- #
6
- # This file contains code necessary to boot the Infinite Monkeywrench
7
- # at a particular site.
8
- #
9
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
- # puts "#{File.basename(__FILE__)}: You heft up your Infinite Monkeywrench for the first time and marvel at how something so powerful could be made so wondrous light!"
15
-
16
1
  module IMW
17
2
  module Config
18
3
 
@@ -0,0 +1,38 @@
1
+ module IMW
2
+
3
+ class Dataset
4
+ include IMW::Paths
5
+
6
+ # A dataset keeps track of its own collection of paths just like
7
+ # IMW itself. When an IMW::Dataset is instantiated in a script,
8
+ # that script's directory becomes the dataset's +self+ path and
9
+ # the default workflow directories (see IMW::Workflow) are created
10
+ # within this directory.
11
+ #
12
+ # You can change a dataset's paths the same way you can change
13
+ # IMW's paths; calling +add_path+ and +remove_path+ on the
14
+ # dataset.
15
+ #
16
+ # To customize this behavior for all future datasets, create a
17
+ # subclass of IMW::Dataset and override the +set_paths+ method.
18
+ def paths
19
+ @paths
20
+ end
21
+
22
+ protected
23
+ # Sets the roots of various paths relative to this dataset.
24
+ def set_root_paths
25
+ @paths = {}
26
+ add_path :script, File.expand_path(eval('__FILE__'))
27
+ add_path :self, File.dirname(path_to(:script))
28
+ IMW::Workflow::DIRS.each do |dir|
29
+ add_path dir, :self, dir.to_s
30
+ end
31
+ end
32
+
33
+ # Overwrite this method to set additional paths for the dataset.
34
+ def set_paths
35
+ end
36
+ end
37
+
38
+ end
@@ -1,25 +1,10 @@
1
- #
2
- # h2. lib/imw/workflow/task.rb --
3
- #
4
- # == About
5
- #
6
- # This file defines a class <tt>IMW::Task</tt> which subclasses
7
- # <tt>Rake::Task</tt>. Tasks defined in IMW should be instances of
8
- # <tt>IMW::Task</tt>.
9
- #
10
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
- # Copyright:: Copyright (c) 2008 infochimps.org
12
- # License:: GPL 3.0
13
- # Website:: http://infinitemonkeywrench.org/
14
- #
15
- # puts "#{File.basename(__FILE__)}: Something clever" # at bottom
16
-
17
1
  require 'rake'
18
2
 
19
3
  module IMW
20
4
 
21
- class Task < Rake::Task
22
- end
5
+ Task = Class.new(Rake::Task)
6
+ FileTask = Class.new(Rake::FileTask)
7
+ FileCreationTask = Class.new(Rake::FileCreationTask)
23
8
 
24
9
  class Dataset
25
10
  include Rake::TaskManager
@@ -31,6 +16,24 @@ module IMW
31
16
  self.define_task IMW::Task, name, &block
32
17
  end
33
18
 
19
+ # Return a new (or existing) <tt>IMW::FileTask</tt> with the given
20
+ # +name+. Dependencies can be declared and a block passed in just
21
+ # as in Rake.
22
+ def file name, &block
23
+ self.define_task IMW::FileTask, name, &block
24
+ end
25
+
26
+ # Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
27
+ # +name+. Dependencies can be declared and a block passed in just
28
+ # as in Rake.
29
+ def file_create name, &block
30
+ self.define_task IMW::FileCreationTask, name, &block
31
+ end
32
+
33
+ # Override this method to define default tasks for a subclass of
34
+ # IMW::Dataset.
35
+ def set_tasks
36
+ end
34
37
  end
35
38
  end
36
39
 
@@ -1,81 +1,142 @@
1
- #
2
- # lib/imw/workflow.rb -- implements the workflow class
3
- #
4
- # == About
5
- #
6
- # This file implements the <tt>IMW::Workflow</tt> class which tailors
7
- # the functionality of Rake for IMW objects.
8
- #
9
- # Author:: Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
10
- # Copyright:: Copyright (c) 2008 infochimps.org
11
- # License:: GPL 3.0
12
- # Website:: http://infinitemonkeywrench.org/
13
- #
14
-
15
- require 'imw/dataset/scaffold'
16
1
  require 'imw/dataset/task'
2
+ require 'ostruct'
17
3
 
18
4
  module IMW
19
5
 
20
- # The <tt>IMW::Workflow</tt> module is a collection of methods which
21
- # define Rake[http://rake.rubyforge.org/] tasks specialized for each
22
- # dataset.
6
+ # IMW encourages you to view a data transformation as a network of
7
+ # dependencies. By default, IMW defines five main steps:
8
+ #
9
+ # rip::
10
+ # Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c.
11
+ #
12
+ # extract::
13
+ # Extract data from its ripped form to a form which can be
14
+ # parsed.
15
+ #
16
+ # parse::
17
+ # Parse data into a structured form.
18
+ #
19
+ # munge::
20
+ # Combine, filter, reconcile, and transform already structured
21
+ # data into a desired form.
22
+ #
23
+ # package::
24
+ # Archive, compress, and deliver data in its final form to some
25
+ # location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.).
26
+ #
27
+ # Each step depends upon the one before it. The steps are blank by
28
+ # default so there's no need to write code for steps you don't need
29
+ # to use.
30
+ #
31
+ # Each step corresponds to a named directory in IMW::Workflow::DIRS.
23
32
  module Workflow
24
33
 
25
- # The functions called here define the default tasks associated
26
- # with each dataset.
27
- def create_default_tasks
28
- create_directories_task
29
- create_symlinks_task
30
- create_initialize_task
31
- create_delete_data_task
32
- create_destroy_task
33
- create_workflow_tasks
34
- end
34
+ # The <tt>Rake::TaskManager</tt> module allows the
35
+ # <tt>IMW::Dataset</tt> class to leverage the functionality of the
36
+ # Rake[http://rake.rubyforge.org/] library to manage tasks
37
+ # associated with the processing of this dataset.
38
+ include Rake::TaskManager
35
39
 
36
- # Sets the default tasks in this workflow.
37
- #
38
- # The default tasks constitute a set of consecutive actions that
39
- # must be taken in order: <tt>:rip</tt>, <tt>parse</tt>,
40
- # <tt>munge</tt>, <tt>fix</tt>, and <tt>package</tt>. Each task
41
- # is a <tt>Rake::Task</tt> which depends on the one before it.
40
+ # Default options passed to <tt>Rake</tt>. Any class including
41
+ # the <tt>Rake::TaskManager</tt> module must define a constant by
42
+ # this name.
43
+ DEFAULT_OPTIONS = {
44
+ :dry_run => false,
45
+ :trace => false,
46
+ :verbose => false
47
+ }
48
+
49
+ # The standard IMW workflow steps.
50
+ STEPS = [:rip, :extract, :parse, :munge, :package]
51
+
52
+ # The steps of the IMW workflow each correspond to a directory in
53
+ # which it is customary that they deposit their files <em>once
54
+ # they are finished processing</em> (so ripped files wind up in
55
+ # the +ripd+ directory, packaged files in the +pkgd+ directory,
56
+ # and so on).
57
+ DIRS = [:ripd, :xtrd, :prsd, :mungd, :pkgd ]
58
+
59
+ # Each workflow step can be configured to take default actions,
60
+ # each action being a proc in the array for the step in this hash.
42
61
  #
43
- # Each task does nothing by default other than create directories
44
- # to hold files for this dataset as it undergoes the workflow.
45
- def set_default_tasks
46
- define_task(Rake::Task, {:rip => []})
47
- define_task(Rake::Task, {:parse => :rip})
48
- define_task(Rake::Task, {:munge => :parse})
49
- define_task(Rake::Task, {:fix => :munge})
50
- define_task(Rake::Task, {:package => :fix})
51
- comment_default_tasks
62
+ # This allows classes which include IMW::Workflow to use class
63
+ # methods named after each step (+rip+, +parse+, &c.) to directly
64
+ # define tasks.
65
+ STEPS_TASKS = returning({}) do |steps_procs|
66
+ STEPS.each do |step|
67
+ steps_procs[step] = []
68
+ end
52
69
  end
53
70
 
54
- # Set the initial comments for each of the default tasks.
55
- def comment_default_tasks
56
- self[:rip].comment = "Rip dataset from an origin"
57
- self[:parse].comment = "Parse dataset into intermediate form"
58
- self[:munge].comment = "Munge dataset's structure into desired form"
59
- self[:fix].comment = "Fix and format dataset"
60
- self[:package].comment = "Package dataset into a final format"
71
+ protected
72
+ def self.included klass
73
+ STEPS.each do |step|
74
+ klass.class_eval <<EOF
75
+ def self.#{step}(deps=nil, &block)
76
+ STEPS_TASKS[:#{step}] << [deps, block]
77
+ end
78
+ EOF
79
+ end
80
+
81
+
61
82
  end
62
83
 
63
- # Creates the task dependency chain <tt>:package => :fix => :munge
64
- # => :peel => :rip => :initialize</tt>.
65
- def create_workflow_tasks
66
- @last_description = "Obtain data from some source."
67
- define_task(IMW::Task, :rip => [:initialize])
68
- @last_description = "Extract datafiles from ripped data."
69
- define_task(IMW::Task, :peel => [:rip])
70
- @last_description = "Transform records in a dataset."
71
- define_task(IMW::Task, :munge => [:peel])
72
- @last_description = "Reconcile records."
73
- define_task(IMW::Task, :fix => [:munge])
74
- @last_description = "Package dataset in final form."
75
- define_task(IMW::Task, :package => [:fix])
84
+ def define_workflow_task deps, comment
85
+ @last_description = comment
86
+ define_task(IMW::Task, deps)
87
+ step = deps.respond_to?(:keys) ? deps.keys.first : deps
88
+ STEPS_TASKS[step].each do |deps, block|
89
+ self[step].enhance(deps) do
90
+ self.instance_eval(&block)
91
+ end
92
+ end
93
+ end
94
+
95
+ # Create all the instance variables required by Rake::TaskManager
96
+ # and define default tasks for this dataset.
97
+ def initialize_workflow
98
+ @tasks = Hash.new
99
+ @rules = Array.new
100
+ @scope = Array.new
101
+ @last_description = nil
102
+ @options = OpenStruct.new(DEFAULT_OPTIONS)
103
+ define_create_directories_task
104
+ define_workflow_tasks
105
+ define_destroy_task
106
+ end
107
+
108
+ # Creates a task <tt>:create_directories</tt> to create the
109
+ # directory structure for this dataset.
110
+ def define_create_directories_task
111
+ @last_description = "Creates workflow directories for this dataset."
112
+ define_task(IMW::Task, {:create_directories => []}) do
113
+ DIRS.each do |dir|
114
+ FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
115
+ end
116
+ end
117
+ end
118
+
119
+ # Creates a task <tt>:destroy</tt> which removes dataset's
120
+ # workflow directories.
121
+ def define_destroy_task
122
+ @last_description = "Get rid of all traces of this dataset."
123
+ define_task(IMW::Task, :destroy => [:create_directories]) do
124
+ DIRS.each do |dir|
125
+ FileUtils.rm_rf(path_to(dir))
126
+ end
127
+ end
128
+ end
129
+
130
+ # Creates the task dependency chain <tt>:package => :munge =>
131
+ # :parse => :extract => :rip => :create_directories</tt> of the
132
+ # IMW::Workflow.
133
+ def define_workflow_tasks
134
+ define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
135
+ define_workflow_task({:extract => [:rip]}, "Extract data so it's ready to parse." )
136
+ define_workflow_task({:parse => [:extract]}, "Parse data into a structured form." )
137
+ define_workflow_task({:munge => [:parse]}, "Munge structured data into desired form.")
138
+ define_workflow_task({:package => [:munge]}, "Package dataset in final form." )
76
139
  end
77
140
 
78
141
  end
79
142
  end
80
-
81
- # puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree. Ahhhh."