imw 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
data/README.rdoc
CHANGED
@@ -38,7 +38,7 @@ right one to use. IMW is **not** designed for
|
|
38
38
|
|
39
39
|
IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
|
40
40
|
|
41
|
-
You'll have to set up Gemcutter
|
41
|
+
You'll have to set up Gemcutter if you haven't already
|
42
42
|
|
43
43
|
$ sudo gem install gemcutter
|
44
44
|
$ gem tumble
|
@@ -47,55 +47,218 @@ and then install IMW
|
|
47
47
|
|
48
48
|
$ sudo gem install imw
|
49
49
|
|
50
|
-
=
|
50
|
+
= IMW Basics
|
51
51
|
|
52
52
|
The central goal of IMW is to make the workflow involved in processing a
|
53
53
|
dataset from a raw source to a finished product as simple as possible.
|
54
54
|
|
55
|
-
|
56
|
-
|
57
|
-
and the second
|
55
|
+
To help achieve this goal, IMW creates lots of convenient structures
|
56
|
+
and methods. The following sections provide a tour of these.
|
58
57
|
|
59
|
-
|
58
|
+
It is assumed that you've installed IMW and required it in a script
|
59
|
+
via
|
60
60
|
|
61
61
|
require 'rubygems'
|
62
62
|
require 'imw'
|
63
63
|
|
64
|
+
== Paths
|
65
|
+
|
64
66
|
IMW holds a registry of paths that you can define on the fly or store
|
65
67
|
in a configuration file.
|
66
68
|
|
67
|
-
IMW.add_path
|
68
|
-
IMW.
|
69
|
-
|
69
|
+
IMW.add_path(:dropbox, "/var/www/public/dropbox")
|
70
|
+
IMW.path_to(:dropbox) #=> "/var/www/public/dropbox"
|
71
|
+
|
72
|
+
You can combine paths together dynamically.
|
70
73
|
|
71
|
-
|
74
|
+
IMW.add_path(:raw, "/data/raw")
|
75
|
+
IMW.path_to(:raw, "my/dataset") #=> "/data/raw/my/dataset"
|
76
|
+
IMW.add_path(:rejects, :raw, "rejects")
|
77
|
+
IMW.path_to(:rejects) #=> "/data/raw/rejects"
|
72
78
|
|
73
|
-
|
74
|
-
#=> "/mnt/data/raw/one/particular/dataset"
|
79
|
+
Altering one path will update others
|
75
80
|
|
76
|
-
IMW
|
81
|
+
IMW.add_path(:raw, "/data2/raw")
|
82
|
+
IMW.path_to(:rejects) #=> "/data2/raw/rejects", not "/data/raw/rejects"
|
77
83
|
|
84
|
+
== Files & Directories
|
78
85
|
|
79
|
-
|
86
|
+
Use IMW.open to open files. The object returned by IMW.open obeys the
|
87
|
+
usual semantics of a File object but it has new methods to manipulate
|
88
|
+
and parse the file.
|
80
89
|
|
81
|
-
|
90
|
+
f1 = IMW.open("/path/to/file")
|
91
|
+
f1.read() # does what you think
|
82
92
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
93
|
+
# class methods from File are available
|
94
|
+
f1.size
|
95
|
+
f1.writable?
|
96
|
+
|
97
|
+
# use a bang or a 'w' to write
|
98
|
+
writable_file = IMW.open!('/some/path') # similar to open('/some/path', 'w')
|
99
|
+
|
100
|
+
# as well as methods to manipulate the file on the filesystem
|
101
|
+
f2 = f1.cp("/new/path/to/file") # also try cp_to_dir
|
102
|
+
f1.exist? # true
|
103
|
+
f3 = f1.mv("/yet/another/path") # also try mv_to_dir
|
104
|
+
f1.exist? # false
|
105
|
+
|
106
|
+
IMW also knows about directories
|
107
|
+
|
108
|
+
d = IMW.open('/tmp')
|
109
|
+
d.directory? # true
|
110
|
+
d['*'] # Dir['/tmp/*']
|
111
|
+
d.mv('/parent/dir')
|
112
|
+
|
113
|
+
== Remote Files
|
114
|
+
|
115
|
+
Many operations defined for files are also defined for arbitrary URIs
|
116
|
+
through the <tt>open-uri</tt> library.
|
117
|
+
|
118
|
+
Files can readily be opened, read, and downloaded from the Internet
|
119
|
+
|
120
|
+
site = IMW.open('http://infochimps.org') #=> Recognized as an HTML document
|
121
|
+
site.read() # does what you think
|
122
|
+
site.cp('/some/local/path')
|
123
|
+
site.exist? # will work in many cases
|
124
|
+
|
125
|
+
(writing to remote sources isn't enabled yet).
|
126
|
+
|
127
|
+
== Archives & Compressed Files
|
128
|
+
|
129
|
+
IMW works with a variety of archiving and compression programs (see
|
130
|
+
IMW::EXTERNAL_PROGRAMS) to make packaging/unpackaging data easy.
|
131
|
+
|
132
|
+
bz2 = IMW.open('/path/to/big_file.bz2')
|
133
|
+
zip = IMW.open('/path/to/archive.zip')
|
134
|
+
targz = IMW.open('/path/to/archive.tar.gz')
|
135
|
+
|
136
|
+
# IMW recognizes files by extension
|
137
|
+
bz2.archive? # false
|
138
|
+
bz2.compressed? # true
|
139
|
+
zip.archive? # true
|
140
|
+
zip.compressed? # false
|
141
|
+
targz.archive? # true
|
142
|
+
targz.compressed? # true
|
143
|
+
|
144
|
+
# decompress or compress files
|
145
|
+
big_file = bz2.decompress! # skip the ! to preserve the original
|
146
|
+
new_bz2 = big_file.compress!
|
147
|
+
|
148
|
+
# extract and package archives
|
149
|
+
zip.extract # files show up in working directory
|
150
|
+
targz.extract # no need to decompress first
|
151
|
+
new_tarbz2 = IMW.open!('/new/archive.tar').create(['/path1', '/path/2']).compress!
|
152
|
+
|
153
|
+
== Data Formats
|
154
|
+
|
155
|
+
IMW encourages you to work with data as Ruby objects as much as
|
156
|
+
possible by providing methods to parse common data formats directly
|
157
|
+
into Ruby.
|
158
|
+
|
159
|
+
The actual parsing is always handled by a separate library appropriate
|
160
|
+
for the data format so it will be fast and, if you're familiar with
|
161
|
+
the library, you can use many functions of the library directly on the
|
162
|
+
object returned by IMW.open.
|
163
|
+
|
164
|
+
IMW uses classes (defined in IMW::Files) to interface with each data
|
165
|
+
type. The choice of class is determined by the extension of the path
|
166
|
+
supplied to IMW.open.
|
167
|
+
|
168
|
+
IMW.open('file.csv') #=> IMW::Files::Csv
|
169
|
+
IMW.open('file.xml') #=> IMW::Files::Xml
|
170
|
+
IMW.open('file.html') #=> IMW::Files::Html
|
171
|
+
|
172
|
+
# default choice will be a text file
|
173
|
+
IMW.open('strange_filename.wuzz') #=> IMW::Files::Text
|
174
|
+
|
175
|
+
# but you can force a particular choice
|
176
|
+
IMW.open('strange_filename.wuzz', :as => :csv) #=> IMW::Files::Csv
|
177
|
+
|
178
|
+
Some formats are extremely regular (CSVs, JSON, YAML, &c.) and can
|
179
|
+
immediately be converted to simple Ruby objects. Other formats (flat
|
180
|
+
files, HTML, XML, &c.) require parsing before they can be
|
181
|
+
unambiguously converted to Ruby objects.
|
182
|
+
|
183
|
+
As an example, consider flat, delimited files. They are extremely
|
184
|
+
regular and IMW uses FasterCSV to automatically parse them into nested
|
185
|
+
arrays, the only sensible and unambiguous Ruby representation of their
|
186
|
+
data:
|
187
|
+
|
188
|
+
delimit1 = IMW.open('/path/to/csv') # IMW::Files::Csv
|
189
|
+
delimit1.entries #=> array of arrays of entries
|
190
|
+
delimit1.each do |row|
|
191
|
+
# passes in parsed rows
|
192
|
+
...
|
95
193
|
end
|
194
|
+
|
195
|
+
# if there's a funny delimiter, it can be passed as an option (in
|
196
|
+
# this case identical to what would be passed to FasterCSV under the
|
197
|
+
# hood)
|
198
|
+
delimit2 = IMW.open('/path/to/file.csv', :col_sep => " ")
|
96
199
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
200
|
+
HTML files, on the other hand, are more complex and typically have to
|
201
|
+
be parsed before being converted to plain Ruby objects:
|
202
|
+
|
203
|
+
# Grab a tiny link from the bottom of Google's homepage
|
204
|
+
doc = IMW.open('http://www.google.com') # IMW::Files::Html
|
205
|
+
doc.parse('p a') # 'Privacy'
|
206
|
+
|
207
|
+
More complex parsers can also be built
|
208
|
+
|
209
|
+
# Grab each row from an HTML table
|
210
|
+
doc = IMW.open('/path/to/data.html')
|
211
|
+
doc.parse :employees => ["tr", { :name => "td.name", :address => "td.address" } ]
|
212
|
+
#=> [{:name => "John Chimpo", :address => "123 Fake St."}, {...}, ... ]
|
213
|
+
|
214
|
+
See IMW::Parsers::HtmlParser for details on parsing HTML (and similar)
|
215
|
+
files. Examine the other parsers in IMW::Parsers for details on
|
216
|
+
parsing other data formats.
|
217
|
+
|
218
|
+
= The IMW Workflow
|
219
|
+
|
220
|
+
The workflow of IMW can be roughly summarized as follows:
|
221
|
+
|
222
|
+
rip::
|
223
|
+
|
224
|
+
Data is obtained from a source. IMW allows you to download data
|
225
|
+
from the web, obtain it by querying databases, or use other services
|
226
|
+
like rsync, ftp, &c. to pull it in from another computer.
|
227
|
+
|
228
|
+
extract::
|
229
|
+
|
230
|
+
Ripped data is often compressed or otherwise archived and needs to
|
231
|
+
be extracted. It may also be sliced in many ways (excluding certain
|
232
|
+
years, say) to reduce the volume to only what is required.
|
233
|
+
|
234
|
+
parse::
|
235
|
+
|
236
|
+
Data is parsed into Ruby objects and stored.
|
237
|
+
|
238
|
+
munge::
|
239
|
+
|
240
|
+
All the parsed data is combined, reconciled, and further processed
|
241
|
+
into a final form.
|
242
|
+
|
243
|
+
package::
|
244
|
+
|
245
|
+
The data is archived and compressed as necessary and moved to an
|
246
|
+
outbox, staging server, S3 bucket, &c.
|
247
|
+
|
248
|
+
Not all datasets
|
249
|
+
|
250
|
+
|
251
|
+
== Datasets
|
252
|
+
|
253
|
+
== Tasks & Dependencies
|
254
|
+
|
255
|
+
== Directory Structure
|
256
|
+
|
257
|
+
== Records
|
258
|
+
|
259
|
+
= IMW on the Command Line
|
260
|
+
|
261
|
+
== Repositories
|
262
|
+
|
263
|
+
== Running Tasks
|
101
264
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.0
|
1
|
+
0.1.1
|
data/bin/imw
ADDED
data/lib/imw/boot.rb
CHANGED
@@ -1,18 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/boot.rb -- startup functions
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# This file contains code necessary to boot the Infinite Monkeywrench
|
7
|
-
# at a particular site.
|
8
|
-
#
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
# puts "#{File.basename(__FILE__)}: You heft up your Infinite Monkeywrench for the first time and marvel at how something so powerful could be made so wondrous light!"
|
15
|
-
|
16
1
|
module IMW
|
17
2
|
module Config
|
18
3
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module IMW
|
2
|
+
|
3
|
+
class Dataset
|
4
|
+
include IMW::Paths
|
5
|
+
|
6
|
+
# A dataset keeps track of its own collection of paths just like
|
7
|
+
# IMW itself. When an IMW::Dataset is instantiated in a script,
|
8
|
+
# that script's directory becomes the dataset's +self+ path and
|
9
|
+
# the default workflow directories (see IMW::Workflow) are created
|
10
|
+
# within this directory.
|
11
|
+
#
|
12
|
+
# You can change a dataset's paths the same way you can change
|
13
|
+
# IMW's paths: by calling +add_path+ and +remove_path+ on the
|
14
|
+
# dataset.
|
15
|
+
#
|
16
|
+
# To customize this behavior for all future datasets, create a
|
17
|
+
# subclass of IMW::Dataset and override the +set_paths+ method.
|
18
|
+
def paths
|
19
|
+
@paths
|
20
|
+
end
|
21
|
+
|
22
|
+
protected
|
23
|
+
# Sets the roots of various paths relative to this dataset.
|
24
|
+
def set_root_paths
|
25
|
+
@paths = {}
|
26
|
+
add_path :script, File.expand_path(eval('__FILE__'))
|
27
|
+
add_path :self, File.dirname(path_to(:script))
|
28
|
+
IMW::Workflow::DIRS.each do |dir|
|
29
|
+
add_path dir, :self, dir.to_s
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
# Override this method to set additional paths for the dataset.
|
34
|
+
def set_paths
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
data/lib/imw/dataset/task.rb
CHANGED
@@ -1,25 +1,10 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/workflow/task.rb --
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# This file defines a class <tt>IMW::Task</tt> which subclasses
|
7
|
-
# <tt>Rake::Task</tt>. Tasks defined in IMW should be instances of
|
8
|
-
# <tt>IMW::Task</tt>.
|
9
|
-
#
|
10
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
-
# License:: GPL 3.0
|
13
|
-
# Website:: http://infinitemonkeywrench.org/
|
14
|
-
#
|
15
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
16
|
-
|
17
1
|
require 'rake'
|
18
2
|
|
19
3
|
module IMW
|
20
4
|
|
21
|
-
|
22
|
-
|
5
|
+
Task = Class.new(Rake::Task)
|
6
|
+
FileTask = Class.new(Rake::FileTask)
|
7
|
+
FileCreationTask = Class.new(Rake::FileCreationTask)
|
23
8
|
|
24
9
|
class Dataset
|
25
10
|
include Rake::TaskManager
|
@@ -31,6 +16,24 @@ module IMW
|
|
31
16
|
self.define_task IMW::Task, name, &block
|
32
17
|
end
|
33
18
|
|
19
|
+
# Return a new (or existing) <tt>IMW::FileTask</tt> with the given
|
20
|
+
# +name+. Dependencies can be declared and a block passed in just
|
21
|
+
# as in Rake.
|
22
|
+
def file name, &block
|
23
|
+
self.define_task IMW::FileTask, name, &block
|
24
|
+
end
|
25
|
+
|
26
|
+
# Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
|
27
|
+
# +name+. Dependencies can be declared and a block passed in just
|
28
|
+
# as in Rake.
|
29
|
+
def file_create name, &block
|
30
|
+
self.define_task IMW::FileCreationTask, name, &block
|
31
|
+
end
|
32
|
+
|
33
|
+
# Override this method to define default tasks for a subclass of
|
34
|
+
# IMW::Dataset.
|
35
|
+
def set_tasks
|
36
|
+
end
|
34
37
|
end
|
35
38
|
end
|
36
39
|
|
data/lib/imw/dataset/workflow.rb
CHANGED
@@ -1,81 +1,142 @@
|
|
1
|
-
#
|
2
|
-
# lib/imw/workflow.rb -- implements the workflow class
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# This file implements the <tt>IMW::Workflow</tt> class which tailors
|
7
|
-
# the functionality of Rake for IMW objects.
|
8
|
-
#
|
9
|
-
# Author:: Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
|
15
|
-
require 'imw/dataset/scaffold'
|
16
1
|
require 'imw/dataset/task'
|
2
|
+
require 'ostruct'
|
17
3
|
|
18
4
|
module IMW
|
19
5
|
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
6
|
+
# IMW encourages you to view a data transformation as a network of
|
7
|
+
# dependencies. By default, IMW defines five main steps:
|
8
|
+
#
|
9
|
+
# rip::
|
10
|
+
# Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c.
|
11
|
+
#
|
12
|
+
# extract::
|
13
|
+
# Extract data from its ripped form to a form which can be
|
14
|
+
# parsed.
|
15
|
+
#
|
16
|
+
# parse::
|
17
|
+
# Parse data into a structured form.
|
18
|
+
#
|
19
|
+
# munge::
|
20
|
+
# Combine, filter, reconcile, and transform already structured
|
21
|
+
# data into a desired form.
|
22
|
+
#
|
23
|
+
# package::
|
24
|
+
# Archive, compress, and deliver data in its final form to some
|
25
|
+
# location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.).
|
26
|
+
#
|
27
|
+
# Each step depends upon the one before it. The steps are blank by
|
28
|
+
# default so there's no need to write code for steps you don't need
|
29
|
+
# to use.
|
30
|
+
#
|
31
|
+
# Each step corresponds to a named directory in IMW::Workflow::DIRS.
|
23
32
|
module Workflow
|
24
33
|
|
25
|
-
# The
|
26
|
-
#
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
create_initialize_task
|
31
|
-
create_delete_data_task
|
32
|
-
create_destroy_task
|
33
|
-
create_workflow_tasks
|
34
|
-
end
|
34
|
+
# The <tt>Rake::TaskManager</tt> module allows the
|
35
|
+
# <tt>IMW::Dataset</tt> class to leverage the functionality of the
|
36
|
+
# Rake[http://rake.rubyforge.org/] library to manage tasks
|
37
|
+
# associated with the processing of this dataset.
|
38
|
+
include Rake::TaskManager
|
35
39
|
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
|
40
|
-
|
41
|
-
|
40
|
+
# Default options passed to <tt>Rake</tt>. Any class including
|
41
|
+
# the <tt>Rake::TaskManager</tt> module must define a constant by
|
42
|
+
# this name.
|
43
|
+
DEFAULT_OPTIONS = {
|
44
|
+
:dry_run => false,
|
45
|
+
:trace => false,
|
46
|
+
:verbose => false
|
47
|
+
}
|
48
|
+
|
49
|
+
# The standard IMW workflow steps.
|
50
|
+
STEPS = [:rip, :extract, :parse, :munge, :package]
|
51
|
+
|
52
|
+
# The steps of the IMW workflow each correspond to a directory in
|
53
|
+
# which it is customary that they deposit their files <em>once
|
54
|
+
# they are finished processing</em> (so ripped files wind up in
|
55
|
+
# the +ripd+ directory, packaged files in the +pkgd+ directory,
|
56
|
+
# and so on).
|
57
|
+
DIRS = [:ripd, :xtrd, :prsd, :mungd, :pkgd ]
|
58
|
+
|
59
|
+
# Each workflow step can be configured to take default actions,
|
60
|
+
# each action being a proc in the array for the step in this hash.
|
42
61
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
define_task(Rake::Task, {:package => :fix})
|
51
|
-
comment_default_tasks
|
62
|
+
# This allows classes which include IMW::Workflow to use class
|
63
|
+
# methods named after each step (+rip+, +parse+, &c.) to directly
|
64
|
+
# define tasks.
|
65
|
+
STEPS_TASKS = returning({}) do |steps_procs|
|
66
|
+
STEPS.each do |step|
|
67
|
+
steps_procs[step] = []
|
68
|
+
end
|
52
69
|
end
|
53
70
|
|
54
|
-
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
71
|
+
protected
|
72
|
+
def self.included klass
|
73
|
+
STEPS.each do |step|
|
74
|
+
klass.class_eval <<EOF
|
75
|
+
def self.#{step}(deps=nil, &block)
|
76
|
+
STEPS_TASKS[:#{step}] << [deps, block]
|
77
|
+
end
|
78
|
+
EOF
|
79
|
+
end
|
80
|
+
|
81
|
+
|
61
82
|
end
|
62
83
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
84
|
+
def define_workflow_task deps, comment
|
85
|
+
@last_description = comment
|
86
|
+
define_task(IMW::Task, deps)
|
87
|
+
step = deps.respond_to?(:keys) ? deps.keys.first : deps
|
88
|
+
STEPS_TASKS[step].each do |deps, block|
|
89
|
+
self[step].enhance(deps) do
|
90
|
+
self.instance_eval(&block)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
# Create all the instance variables required by Rake::TaskManager
|
96
|
+
# and define default tasks for this dataset.
|
97
|
+
def initialize_workflow
|
98
|
+
@tasks = Hash.new
|
99
|
+
@rules = Array.new
|
100
|
+
@scope = Array.new
|
101
|
+
@last_description = nil
|
102
|
+
@options = OpenStruct.new(DEFAULT_OPTIONS)
|
103
|
+
define_create_directories_task
|
104
|
+
define_workflow_tasks
|
105
|
+
define_destroy_task
|
106
|
+
end
|
107
|
+
|
108
|
+
# Creates a task <tt>:create_directories</tt> to create the
|
109
|
+
# directory structure for this dataset.
|
110
|
+
def define_create_directories_task
|
111
|
+
@last_description = "Creates workflow directories for this dataset."
|
112
|
+
define_task(IMW::Task, {:create_directories => []}) do
|
113
|
+
DIRS.each do |dir|
|
114
|
+
FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
# Creates a task <tt>:destroy</tt> which removes the dataset's
|
120
|
+
# workflow directories.
|
121
|
+
def define_destroy_task
|
122
|
+
@last_description = "Get rid of all traces of this dataset."
|
123
|
+
define_task(IMW::Task, :destroy => [:create_directories]) do
|
124
|
+
DIRS.each do |dir|
|
125
|
+
FileUtils.rm_rf(path_to(dir))
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
# Creates the task dependency chain <tt>:package => :munge =>
|
131
|
+
# :parse => :extract => :rip => :create_directories</tt> of the
|
132
|
+
# IMW::Workflow.
|
133
|
+
def define_workflow_tasks
|
134
|
+
define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
|
135
|
+
define_workflow_task({:extract => [:rip]}, "Extract data so it's ready to parse." )
|
136
|
+
define_workflow_task({:parse => [:extract]}, "Parse data into a structured form." )
|
137
|
+
define_workflow_task({:munge => [:parse]}, "Munge structured data into desired form.")
|
138
|
+
define_workflow_task({:package => [:munge]}, "Package dataset in final form." )
|
76
139
|
end
|
77
140
|
|
78
141
|
end
|
79
142
|
end
|
80
|
-
|
81
|
-
# puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree. Ahhhh."
|