imw 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
data/README.rdoc
CHANGED
@@ -38,7 +38,7 @@ right one to use. IMW is **not** designed for
|
|
38
38
|
|
39
39
|
IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
|
40
40
|
|
41
|
-
You'll have to set up Gemcutter
|
41
|
+
You'll have to set up Gemcutter if you haven't already
|
42
42
|
|
43
43
|
$ sudo gem install gemcutter
|
44
44
|
$ gem tumble
|
@@ -47,55 +47,218 @@ and then install IMW
|
|
47
47
|
|
48
48
|
$ sudo gem install imw
|
49
49
|
|
50
|
-
=
|
50
|
+
= IMW Basics
|
51
51
|
|
52
52
|
The central goal of IMW is to make the workflow involved in processing a
|
53
53
|
dataset from a raw source to a finished product as simple as possible.
|
54
54
|
|
55
|
-
|
56
|
-
|
57
|
-
and the second
|
55
|
+
To help achieve this goal, IMW creates lots of convenient structures
|
56
|
+
and methods. The following sections provide a tour of these.
|
58
57
|
|
59
|
-
|
58
|
+
It is assumed that you've installed IMW and required it in a script
|
59
|
+
via
|
60
60
|
|
61
61
|
require 'rubygems'
|
62
62
|
require 'imw'
|
63
63
|
|
64
|
+
== Paths
|
65
|
+
|
64
66
|
IMW holds a registry of paths that you can define on the fly or store
|
65
67
|
in a configuration file.
|
66
68
|
|
67
|
-
IMW.add_path
|
68
|
-
IMW.
|
69
|
-
|
69
|
+
IMW.add_path(:dropbox, "/var/www/public/dropbox")
|
70
|
+
IMW.path_to(:dropbox) #=> "/var/www/public/dropbox"
|
71
|
+
|
72
|
+
You can combine paths together dynamically.
|
70
73
|
|
71
|
-
|
74
|
+
IMW.add_path(:raw, "/data/raw")
|
75
|
+
IMW.path_to(:raw, "my/dataset") #=> "/data/raw/my/dataset"
|
76
|
+
IMW.add_path(:rejects, :raw, "rejects")
|
77
|
+
IMW.path_to(:rejects) #=> "/data/raw/rejects"
|
72
78
|
|
73
|
-
|
74
|
-
#=> "/mnt/data/raw/one/particular/dataset"
|
79
|
+
Altering one path will update others
|
75
80
|
|
76
|
-
IMW
|
81
|
+
IMW.add_path(:raw, "/data2/raw")
|
82
|
+
IMW.path_to(:rejects) #=> "/data2/raw/rejects", not "/data/raw/rejects"
|
77
83
|
|
84
|
+
== Files & Directories
|
78
85
|
|
79
|
-
|
86
|
+
Use IMW.open to open files. The object returned by IMW.open obeys the
|
87
|
+
usual semantics of a File object but it has new methods to manipulate
|
88
|
+
and parse the file.
|
80
89
|
|
81
|
-
|
90
|
+
f1 = IMW.open("/path/to/file")
|
91
|
+
f1.read() # does what you think
|
82
92
|
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
93
|
+
# class methods from File are available
|
94
|
+
f1.size
|
95
|
+
f1.writable?
|
96
|
+
|
97
|
+
# use a bang or a 'w' to write
|
98
|
+
writable_file = IMW.open!('/some/path') # similar to open('/some/path', 'w')
|
99
|
+
|
100
|
+
# as well as methods to manipulate the file on the filesystem
|
101
|
+
f2 = f1.cp("/new/path/to/file") # also try cp_to_dir
|
102
|
+
f1.exist? # true
|
103
|
+
f3 = f1.mv("/yet/another/path") # also try mv_to_dir
|
104
|
+
f1.exist? # false
|
105
|
+
|
106
|
+
IMW also knows about directories
|
107
|
+
|
108
|
+
d = IMW.open('/tmp')
|
109
|
+
d.directory? # true
|
110
|
+
d['*'] # Dir['/tmp/*']
|
111
|
+
d.mv('/parent/dir')
|
112
|
+
|
113
|
+
== Remote Files
|
114
|
+
|
115
|
+
Many operations defined for files are also defined for arbitrary URIs
|
116
|
+
through the <tt>open-uri</tt> library.
|
117
|
+
|
118
|
+
Files can readily be opened, read, and downloaded from the Internet
|
119
|
+
|
120
|
+
site = IMW.open('http://infochimps.org') #=> Recognized as an HTML document
|
121
|
+
site.read() # does what you think
|
122
|
+
site.cp('/some/local/path')
|
123
|
+
site.exist? # will work in many cases
|
124
|
+
|
125
|
+
(writing to remote sources isn't enabled yet).
|
126
|
+
|
127
|
+
== Archives & Compressed Files
|
128
|
+
|
129
|
+
IMW works with a variety of archiving and compression programs (see
|
130
|
+
IMW::EXTERNAL_PROGRAMS) to make packaging/unpackaging data easy.
|
131
|
+
|
132
|
+
bz2 = IMW.open('/path/to/big_file.bz2')
|
133
|
+
zip = IMW.open('/path/to/archive.zip')
|
134
|
+
targz = IMW.open('/path/to/archive.tar.gz')
|
135
|
+
|
136
|
+
# IMW recognizes files by extension
|
137
|
+
bz2.archive? # false
|
138
|
+
bz2.compressed? # true
|
139
|
+
zip.archive? # true
|
140
|
+
zip.compressed? # false
|
141
|
+
targz.archive? # true
|
142
|
+
targz.compressed? # true
|
143
|
+
|
144
|
+
# decompress or compress files
|
145
|
+
big_file = bz2.decompress! # skip the ! to preserve the original
|
146
|
+
new_bz2 = big_file.compress!
|
147
|
+
|
148
|
+
# extract and package archives
|
149
|
+
zip.extract # files show up in working directory
|
150
|
+
targz.extract # no need to decompress first
|
151
|
+
new_tarbz2 = IMW.open!('/new/archive.tar').create(['/path1', '/path/2']).compress!
|
152
|
+
|
153
|
+
== Data Formats
|
154
|
+
|
155
|
+
IMW encourages you to work with data as Ruby objects as much as
|
156
|
+
possible by providing methods to parse common data formats directly
|
157
|
+
into Ruby.
|
158
|
+
|
159
|
+
The actual parsing is always handled by a separate library appropriate
|
160
|
+
for the data format so it will be fast and, if you're familiar with
|
161
|
+
the library, you can use many functions of the library directly on the
|
162
|
+
object returned by IMW.open.
|
163
|
+
|
164
|
+
IMW uses classes (defined in IMW::Files) to interface with each data
|
165
|
+
type. The choice of class is determined by the extension of the path
|
166
|
+
supplied to IMW.open.
|
167
|
+
|
168
|
+
IMW.open('file.csv') #=> IMW::Files::Csv
|
169
|
+
IMW.open('file.xml') #=> IMW::Files::Xml
|
170
|
+
IMW.open('file.html') #=> IMW::Files::Html
|
171
|
+
|
172
|
+
# default choice will be a text file
|
173
|
+
IMW.open('strange_filename.wuzz') #=> IMW::Files::Text
|
174
|
+
|
175
|
+
# but you can force a particular choice
|
176
|
+
IMW.open('strange_filename.wuzz', :as => :csv) #=> IMW::Files::Csv
|
177
|
+
|
178
|
+
Some formats are extremely regular (CSV's, JSON, YAML, &c.) and can
|
179
|
+
immediately be converted to simple Ruby objects. Other formats (flat
|
180
|
+
files, HTML, XML, &c.) require parsing before they can be
|
181
|
+
unambiguously converted to Ruby objects.
|
182
|
+
|
183
|
+
As an example, consider flat, delimited files. They are extremely
|
184
|
+
regular and IMW uses FasterCSV to automatically parse them into nested
|
185
|
+
arrays, the only sensible and unambiguous Ruby representation of their
|
186
|
+
data:
|
187
|
+
|
188
|
+
delimit1 = IMW.open('/path/to/csv') # IMW::Files::Csv
|
189
|
+
delimit1.entries #=> array of arrays of entries
|
190
|
+
delimit1.each do |row|
|
191
|
+
# passes in parsed rows
|
192
|
+
...
|
95
193
|
end
|
194
|
+
|
195
|
+
# if there's a funny delimiter, it can be passed as an option (in
|
196
|
+
# this case identical to what would be passed to FasterCSV under the
|
197
|
+
# hood)
|
198
|
+
delimit2 = IMW.open('/path/to/file.csv', :col_sep => " ")
|
96
199
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
200
|
+
HTML files, on the other hand, are more complex and typically have to
|
201
|
+
be parsed before being converted to plain Ruby objects:
|
202
|
+
|
203
|
+
# Grab a tiny link from the bottom of Google's homepage
|
204
|
+
doc = IMW.open('http://www.google.com') # IMW::Files::Html
|
205
|
+
doc.parse('p a') # 'Privacy'
|
206
|
+
|
207
|
+
More complex parsers can also be built
|
208
|
+
|
209
|
+
# Grab each row from an HTML table
|
210
|
+
doc = IMW.open('/path/to/data.html')
|
211
|
+
doc.parse :employees => ["tr", { :name => "td.name", :address => "td.address" } ]
|
212
|
+
#=> [{:name => "John Chimpo", :address => "123 Fake St."}, {...}, ... ]
|
213
|
+
|
214
|
+
See IMW::Parsers::HtmlParser for details on parsing HTML (and similar)
|
215
|
+
files. Examine the other parsers in IMW::Parsers for details on
|
216
|
+
parsing other data formats.
|
217
|
+
|
218
|
+
= The IMW Workflow
|
219
|
+
|
220
|
+
The workflow of IMW can be roughly summarized as follows:
|
221
|
+
|
222
|
+
rip::
|
223
|
+
|
224
|
+
Data is obtained from a source. IMW allows you to download data
|
225
|
+
from the web, obtain it by querying databases, or use other services
|
226
|
+
like rsync, ftp, &c. to pull it in from another computer.
|
227
|
+
|
228
|
+
extract::
|
229
|
+
|
230
|
+
Ripped data is often compressed or otherwise archived and needs to
|
231
|
+
be extracted. It may also be sliced in many ways (excluding certain
|
232
|
+
years, say) to reduce the volume to only what is required.
|
233
|
+
|
234
|
+
parse::
|
235
|
+
|
236
|
+
Data is parsed into Ruby objects and stored.
|
237
|
+
|
238
|
+
munge::
|
239
|
+
|
240
|
+
All the parsed data is combined, reconciled, and further processed
|
241
|
+
into a final form.
|
242
|
+
|
243
|
+
package::
|
244
|
+
|
245
|
+
The data is archived and compressed as necessary and moved to an
|
246
|
+
outbox, staging server, S3 bucket, &c.
|
247
|
+
|
248
|
+
Not all datasets
|
249
|
+
|
250
|
+
|
251
|
+
== Datasets
|
252
|
+
|
253
|
+
== Tasks & Dependencies
|
254
|
+
|
255
|
+
== Directory Structure
|
256
|
+
|
257
|
+
== Records
|
258
|
+
|
259
|
+
= IMW on the Command Line
|
260
|
+
|
261
|
+
== Repositories
|
262
|
+
|
263
|
+
== Running Tasks
|
101
264
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.
|
1
|
+
0.1.1
|
data/bin/imw
ADDED
data/lib/imw/boot.rb
CHANGED
@@ -1,18 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/boot.rb -- startup functions
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# This file contains code necessary to boot the Infinite Monkeywrench
|
7
|
-
# at a particular site.
|
8
|
-
#
|
9
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
# puts "#{File.basename(__FILE__)}: You heft up your Infinite Monkeywrench for the first time and marvel at how something so powerful could be made so wondrous light!"
|
15
|
-
|
16
1
|
module IMW
|
17
2
|
module Config
|
18
3
|
|
@@ -0,0 +1,38 @@
|
|
1
|
+
module IMW
|
2
|
+
|
3
|
+
class Dataset
|
4
|
+
include IMW::Paths
|
5
|
+
|
6
|
+
# A dataset keeps track of its own collection of paths just like
|
7
|
+
# IMW itself. When an IMW::Dataset is instantiated in a script,
|
8
|
+
# that script's directory becomes the dataset's +self+ path and
|
9
|
+
# the default workflow directories (see IMW::Workflow) are created
|
10
|
+
# within this directory.
|
11
|
+
#
|
12
|
+
# You can change a dataset's paths the same way you can change
|
13
|
+
# IMW's paths; calling +add_path+ and +remove_path+ on the
|
14
|
+
# dataset.
|
15
|
+
#
|
16
|
+
# To customize this behavior for all future datasets, create a
|
17
|
+
# subclass of IMW::Dataset and override the +set_paths+ method.
|
18
|
+
def paths
|
19
|
+
@paths
|
20
|
+
end
|
21
|
+
|
22
|
+
protected
|
23
|
+
# Sets the roots of various paths relative to this dataset.
|
24
|
+
def set_root_paths
|
25
|
+
@paths = {}
|
26
|
+
add_path :script, File.expand_path(eval('__FILE__'))
|
27
|
+
add_path :self, File.dirname(path_to(:script))
|
28
|
+
IMW::Workflow::DIRS.each do |dir|
|
29
|
+
add_path dir, :self, dir.to_s
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
# Overwrite this method to set additional paths for the dataset.
|
34
|
+
def set_paths
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
data/lib/imw/dataset/task.rb
CHANGED
@@ -1,25 +1,10 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/workflow/task.rb --
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# This file defines a class <tt>IMW::Task</tt> which subclasses
|
7
|
-
# <tt>Rake::Task</tt>. Tasks defined in IMW should be instances of
|
8
|
-
# <tt>IMW::Task</tt>.
|
9
|
-
#
|
10
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
-
# License:: GPL 3.0
|
13
|
-
# Website:: http://infinitemonkeywrench.org/
|
14
|
-
#
|
15
|
-
# puts "#{File.basename(__FILE__)}: Something clever" # at bottom
|
16
|
-
|
17
1
|
require 'rake'
|
18
2
|
|
19
3
|
module IMW
|
20
4
|
|
21
|
-
|
22
|
-
|
5
|
+
Task = Class.new(Rake::Task)
|
6
|
+
FileTask = Class.new(Rake::FileTask)
|
7
|
+
FileCreationTask = Class.new(Rake::FileCreationTask)
|
23
8
|
|
24
9
|
class Dataset
|
25
10
|
include Rake::TaskManager
|
@@ -31,6 +16,24 @@ module IMW
|
|
31
16
|
self.define_task IMW::Task, name, &block
|
32
17
|
end
|
33
18
|
|
19
|
+
# Return a new (or existing) <tt>IMW::FileTask</tt> with the given
|
20
|
+
# +name+. Dependencies can be declared and a block passed in just
|
21
|
+
# as in Rake.
|
22
|
+
def file name, &block
|
23
|
+
self.define_task IMW::FileTask, name, &block
|
24
|
+
end
|
25
|
+
|
26
|
+
# Return a new (or existing) <tt>IMW::FileCreationTask</tt> with the given
|
27
|
+
# +name+. Dependencies can be declared and a block passed in just
|
28
|
+
# as in Rake.
|
29
|
+
def file_create name, &block
|
30
|
+
self.define_task IMW::FileCreationTask, name, &block
|
31
|
+
end
|
32
|
+
|
33
|
+
# Override this method to define default tasks for a subclass of
|
34
|
+
# IMW::Dataset.
|
35
|
+
def set_tasks
|
36
|
+
end
|
34
37
|
end
|
35
38
|
end
|
36
39
|
|
data/lib/imw/dataset/workflow.rb
CHANGED
@@ -1,81 +1,142 @@
|
|
1
|
-
#
|
2
|
-
# lib/imw/workflow.rb -- implements the workflow class
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# This file implements the <tt>IMW::Workflow</tt> class which tailors
|
7
|
-
# the functionality of Rake for IMW objects.
|
8
|
-
#
|
9
|
-
# Author:: Philip flip Kromer for infochimps.org (mailto:coders@infochimps.org)
|
10
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
11
|
-
# License:: GPL 3.0
|
12
|
-
# Website:: http://infinitemonkeywrench.org/
|
13
|
-
#
|
14
|
-
|
15
|
-
require 'imw/dataset/scaffold'
|
16
1
|
require 'imw/dataset/task'
|
2
|
+
require 'ostruct'
|
17
3
|
|
18
4
|
module IMW
|
19
5
|
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
6
|
+
# IMW encourages you to view a data transformation as a network of
|
7
|
+
# dependencies. By default, IMW defines five main steps:
|
8
|
+
#
|
9
|
+
# rip::
|
10
|
+
# Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c.
|
11
|
+
#
|
12
|
+
# extract::
|
13
|
+
# Extract data from its ripped form to a form which can be
|
14
|
+
# parsed.
|
15
|
+
#
|
16
|
+
# parse::
|
17
|
+
# Parse data into a structured form.
|
18
|
+
#
|
19
|
+
# munge::
|
20
|
+
# Combine, filter, reconcile, and transform already structured
|
21
|
+
# data into a desired form.
|
22
|
+
#
|
23
|
+
# package::
|
24
|
+
# Archive, compress, and deliver data in its final form to some
|
25
|
+
# location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.).
|
26
|
+
#
|
27
|
+
# Each step depends upon the one before it. The steps are blank by
|
28
|
+
# default so there's no need to write code for steps you don't need
|
29
|
+
# to use.
|
30
|
+
#
|
31
|
+
# Each step corresponds to a named directory in IMW::Workflow::DIRS.
|
23
32
|
module Workflow
|
24
33
|
|
25
|
-
# The
|
26
|
-
#
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
create_initialize_task
|
31
|
-
create_delete_data_task
|
32
|
-
create_destroy_task
|
33
|
-
create_workflow_tasks
|
34
|
-
end
|
34
|
+
# The <tt>Rake::TaskManager</tt> module allows the
|
35
|
+
# <tt>IMW::Dataset</tt> class to leverage the functionality of the
|
36
|
+
# Rake[http://rake.rubyforge.org/] library to manage tasks
|
37
|
+
# associated with the processing of this dataset.
|
38
|
+
include Rake::TaskManager
|
35
39
|
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
|
40
|
-
|
41
|
-
|
40
|
+
# Default options passed to <tt>Rake</tt>. Any class including
|
41
|
+
# the <tt>Rake::TaskManager</tt> module must define a constant by
|
42
|
+
# this name.
|
43
|
+
DEFAULT_OPTIONS = {
|
44
|
+
:dry_run => false,
|
45
|
+
:trace => false,
|
46
|
+
:verbose => false
|
47
|
+
}
|
48
|
+
|
49
|
+
# The standard IMW workflow steps.
|
50
|
+
STEPS = [:rip, :extract, :parse, :munge, :package]
|
51
|
+
|
52
|
+
# The steps of the IMW workflow each correspond to a directory in
|
53
|
+
# which it is customary that they deposit their files <em>once
|
54
|
+
# they are finished processing</em> (so ripped files wind up in
|
55
|
+
# the +ripd+ directory, packaged files in the +pkgd+ directory,
|
56
|
+
# and so on).
|
57
|
+
DIRS = [:ripd, :xtrd, :prsd, :mungd, :pkgd ]
|
58
|
+
|
59
|
+
# Each workflow step can be configured to take default actions,
|
60
|
+
# each action being a proc in the array for the step in this hash.
|
42
61
|
#
|
43
|
-
#
|
44
|
-
#
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
define_task(Rake::Task, {:package => :fix})
|
51
|
-
comment_default_tasks
|
62
|
+
# This allows classes which include IMW::Workflow to use class
|
63
|
+
# methods named after each step (+rip+, +parse+, &c.) to directly
|
64
|
+
# define tasks.
|
65
|
+
STEPS_TASKS = returning({}) do |steps_procs|
|
66
|
+
STEPS.each do |step|
|
67
|
+
steps_procs[step] = []
|
68
|
+
end
|
52
69
|
end
|
53
70
|
|
54
|
-
|
55
|
-
def
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
71
|
+
protected
|
72
|
+
def self.included klass
|
73
|
+
STEPS.each do |step|
|
74
|
+
klass.class_eval <<EOF
|
75
|
+
def self.#{step}(deps=nil, &block)
|
76
|
+
STEPS_TASKS[:#{step}] << [deps, block]
|
77
|
+
end
|
78
|
+
EOF
|
79
|
+
end
|
80
|
+
|
81
|
+
|
61
82
|
end
|
62
83
|
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
84
|
+
def define_workflow_task deps, comment
|
85
|
+
@last_description = comment
|
86
|
+
define_task(IMW::Task, deps)
|
87
|
+
step = deps.respond_to?(:keys) ? deps.keys.first : deps
|
88
|
+
STEPS_TASKS[step].each do |deps, block|
|
89
|
+
self[step].enhance(deps) do
|
90
|
+
self.instance_eval(&block)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
|
95
|
+
# Create all the instance variables required by Rake::TaskManager
|
96
|
+
# and define default tasks for this dataset.
|
97
|
+
def initialize_workflow
|
98
|
+
@tasks = Hash.new
|
99
|
+
@rules = Array.new
|
100
|
+
@scope = Array.new
|
101
|
+
@last_description = nil
|
102
|
+
@options = OpenStruct.new(DEFAULT_OPTIONS)
|
103
|
+
define_create_directories_task
|
104
|
+
define_workflow_tasks
|
105
|
+
define_destroy_task
|
106
|
+
end
|
107
|
+
|
108
|
+
# Creates a task <tt>:create_directories</tt> to create the
|
109
|
+
# directory structure for this dataset.
|
110
|
+
def define_create_directories_task
|
111
|
+
@last_description = "Creates workflow directories for this dataset."
|
112
|
+
define_task(IMW::Task, {:create_directories => []}) do
|
113
|
+
DIRS.each do |dir|
|
114
|
+
FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
# Creates a task <tt>:destroy</tt> which removes dataset's
|
120
|
+
# workflow directories.
|
121
|
+
def define_destroy_task
|
122
|
+
@last_description = "Get rid of all traces of this dataset."
|
123
|
+
define_task(IMW::Task, :destroy => [:create_directories]) do
|
124
|
+
DIRS.each do |dir|
|
125
|
+
FileUtils.rm_rf(path_to(dir))
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
|
130
|
+
# Creates the task dependency chain <tt>:package => :munge =>
|
131
|
+
# :parse => :extract => :rip => :create_directories</tt> of the
|
132
|
+
# IMW::Workflow.
|
133
|
+
def define_workflow_tasks
|
134
|
+
define_workflow_task({:rip => [:create_directories]}, "Obtain data from some source." )
|
135
|
+
define_workflow_task({:extract => [:rip]}, "Extract data so it's ready to parse." )
|
136
|
+
define_workflow_task({:parse => [:extract]}, "Parse data into a structured form." )
|
137
|
+
define_workflow_task({:munge => [:parse]}, "Munge structured data into desired form.")
|
138
|
+
define_workflow_task({:package => [:munge]}, "Package dataset in final form." )
|
76
139
|
end
|
77
140
|
|
78
141
|
end
|
79
142
|
end
|
80
|
-
|
81
|
-
# puts "#{File.basename(__FILE__)}: You find your flow next to a tall tree. Ahhhh."
|