imw 0.2.18 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
data/README.rdoc
DELETED
@@ -1,371 +0,0 @@
|
|
1
|
-
= What is the Infinite Monkeywrench?
|
2
|
-
|
3
|
-
The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
|
4
|
-
tasks of acquiring, extracting, transforming, loading, and packaging
|
5
|
-
data. It has the following goals:
|
6
|
-
|
7
|
-
* Minimize programmer time even at the expense of increasing run
|
8
|
-
time.
|
9
|
-
|
10
|
-
* Take data through a full transformation from raw source to packaged
|
11
|
-
purity in as few lines of code as possible.
|
12
|
-
|
13
|
-
* Treat data records as objects as much as possible.
|
14
|
-
|
15
|
-
* Use instead of repeat better code that already exists in other
|
16
|
-
libraries (FasterCSV, I'm talkin' to you).
|
17
|
-
|
18
|
-
* Make what's common easy without making what's uncommon impossible.
|
19
|
-
|
20
|
-
* Work with messy data as well as clean data.
|
21
|
-
|
22
|
-
* Let you incorporate your own tools wherever you choose to.
|
23
|
-
|
24
|
-
The Infinite Monkeywrench is a powerful tool but it is not always the
|
25
|
-
right tool. IMW is **not** designed for
|
26
|
-
|
27
|
-
* Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan] and Monkeyshines[http://github.com/infochimps/monkeyshines])
|
28
|
-
|
29
|
-
* Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
|
30
|
-
|
31
|
-
* Data mining or statistical analysis
|
32
|
-
|
33
|
-
* Visualization
|
34
|
-
|
35
|
-
= Installation
|
36
|
-
|
37
|
-
IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
|
38
|
-
|
39
|
-
You'll have to add <tt>http://gemcutter.org</tt> to your gem sources
|
40
|
-
if it isn't there already:
|
41
|
-
|
42
|
-
$ gem sources -a http://gemcutter.org
|
43
|
-
|
44
|
-
and then install IMW
|
45
|
-
|
46
|
-
$ sudo gem install imw
|
47
|
-
|
48
|
-
In all the examples that follow it is assumed that you've installed
|
49
|
-
IMW and required it in a script via
|
50
|
-
|
51
|
-
require 'rubygems'
|
52
|
-
require 'imw'
|
53
|
-
|
54
|
-
= Resources
|
55
|
-
|
56
|
-
IMW is centered around processing resources. A resource can be
|
57
|
-
_anything_ with a URI and you create one using IMW.open.
|
58
|
-
|
59
|
-
csv = IMW.open('/path/to/my_data.csv')
|
60
|
-
html = IMW.open('http://www.example.com/history/march_2007')
|
61
|
-
|
62
|
-
IMW dynamically extends a resource with modules appropriate to it when
|
63
|
-
you open it. In the above case, +csv+ would be automatically extended
|
64
|
-
by the IMW::Resources::Formats::Csv module, among others:
|
65
|
-
|
66
|
-
csv.modules
|
67
|
-
=> [IMW::Schemes::Local::Base, IMW::Schemes::Local::LocalFile, IMW::CompressedFiles::Compressible, IMW::Formats::Csv]
|
68
|
-
|
69
|
-
while +html+ will use a different set
|
70
|
-
|
71
|
-
html.modules
|
72
|
-
=> [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
|
73
|
-
|
74
|
-
Consult the documentation for the modules a resource uses to learn
|
75
|
-
what it can do.
|
76
|
-
|
77
|
-
== Including/Excluding Resource Modules
|
78
|
-
|
79
|
-
You can exercise finer control of the resource modules IMW will extend
|
80
|
-
a given resource with by passing the <tt>:as</tt> and <tt>:without</tt> options.
|
81
|
-
|
82
|
-
IMW.open('http://www.infochimps.com/some_raw_data', :without => [IMW::Formats::Html]).resource_modules
|
83
|
-
=> [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP]
|
84
|
-
|
85
|
-
IMW.open('http://www.infochimps.com', :as => [IMW::Formats::Json]).resource_modules
|
86
|
-
=> [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Json]
|
87
|
-
|
88
|
-
You can also pass <tt>:no_modules</tt> to not use any resource
|
89
|
-
modules.
|
90
|
-
|
91
|
-
== Handlers and Custom Resource Modules
|
92
|
-
|
93
|
-
IMW chooses which resource modules to extend an IMW::Resource by
|
94
|
-
iterating through an array of handlers, passing the resource to the
|
95
|
-
handler, and letting the handler's response (true/false) determine
|
96
|
-
whether or not to extend the resource with the module accompanying the
|
97
|
-
handler.
|
98
|
-
|
99
|
-
You can hook into this process by defining your own handlers. To
|
100
|
-
define a handler which should extend with +MyModule+ any resource with
|
101
|
-
a URI ending with <tt>.xxx</tt>
|
102
|
-
|
103
|
-
IMW::Resource.register_handler MyModule, /\.xxx$/
|
104
|
-
|
105
|
-
You can also use a Proc instead of a Regexp for more control. If the
|
106
|
-
result of the Proc called with a resource evaluates to true
|
107
|
-
then the resource will be extended by +MyModule+.
|
108
|
-
|
109
|
-
IMW::Resource.register_handler MyModule, Proc.new { |resource| resource.is_local? && resource.path =~ /\.xxx$/ }
|
110
|
-
|
111
|
-
= Manipulating Paths
|
112
|
-
|
113
|
-
IMW holds a registry of paths that you can define on the fly or store
|
114
|
-
in a configuration file. Defining paths once in the registry and then
|
115
|
-
referring to them forever after by name helps keep your code flexible
|
116
|
-
as well as portable.
|
117
|
-
|
118
|
-
IMW.add_path(:dropbox, "/var/www/public")
|
119
|
-
IMW.path_to(:dropbox)
|
120
|
-
=> "/var/www/public"
|
121
|
-
|
122
|
-
You can combine named references together dynamically.
|
123
|
-
|
124
|
-
IMW.add_path(:raw, :dropbox, "raw")
|
125
|
-
IMW.path_to(:raw)
|
126
|
-
=> "/var/www/public/raw"
|
127
|
-
IMW.path_to(:raw, "my/dataset")
|
128
|
-
=> "/var/www/public/raw/my/dataset"
|
129
|
-
|
130
|
-
Altering one path will update others
|
131
|
-
|
132
|
-
IMW.add_path(:dropbox, "/data") # redefines :raw
|
133
|
-
IMW.path_to(:raw, "my/dataset")
|
134
|
-
=> "/data/raw/my/dataset" # not /var/www/public/raw/my/dataset
|
135
|
-
|
136
|
-
= Files & Directories
|
137
|
-
|
138
|
-
Use IMW.open to open files. The object returned by IMW.open obeys the
|
139
|
-
usual semantics of a File object but it has new methods to manipulate
|
140
|
-
and parse the file.
|
141
|
-
|
142
|
-
f1 = IMW.open("/path/to/file")
|
143
|
-
f1.read() # does what you think
|
144
|
-
|
145
|
-
# class methods from File are available
|
146
|
-
f1.size
|
147
|
-
f1.writeable?
|
148
|
-
|
149
|
-
# use a bang or a 'w' to write
|
150
|
-
writable_file = IMW.open!('/some/path') # similar to open('/some/path', 'w')
|
151
|
-
|
152
|
-
# as well as methods to manipulate the file on the filesystem
|
153
|
-
f2 = f1.cp("/new/path/to/file") # also try cp_to_dir
|
154
|
-
f1.exist? # true
|
155
|
-
f3 = f1.mv("/yet/another/path") # also try mv_to_dir
|
156
|
-
f1.exist? # false
|
157
|
-
|
158
|
-
IMW also knows about directories
|
159
|
-
|
160
|
-
d = IMW.open('/tmp')
|
161
|
-
d.directory? # true
|
162
|
-
d['*'] # Dir['/tmp/*']
|
163
|
-
d.mv('/parent/dir')
|
164
|
-
|
165
|
-
== Remote Files
|
166
|
-
|
167
|
-
Many operations defined for files are also defined for arbitrary URIs
|
168
|
-
through the <tt>open-uri</tt> library.
|
169
|
-
|
170
|
-
Files can readily be opened, read, and downloaded from the Internet
|
171
|
-
|
172
|
-
site = IMW.open('http://infochimps.org') #=> Recognized as an HTML document
|
173
|
-
site.read() # does what you think
|
174
|
-
site.cp('/some/local/path')
|
175
|
-
site.exist? # will work in many cases
|
176
|
-
|
177
|
-
(writing to remote sources isn't enabled yet).
|
178
|
-
|
179
|
-
== Archives & Compressed Files
|
180
|
-
|
181
|
-
IMW works with a variety of archiving and compression programs to make
|
182
|
-
packaging/unpackaging data easy.
|
183
|
-
|
184
|
-
bz2 = IMW.open('/path/to/big_file.bz2')
|
185
|
-
zip = IMW.open('/path/to/archive.zip')
|
186
|
-
targz = IMW.open('/path/to/archive.tar.gz')
|
187
|
-
|
188
|
-
IMW recognizes file properties by extension
|
189
|
-
|
190
|
-
bz2.is_archive? # false
|
191
|
-
bz2.is_compressed? # true
|
192
|
-
zip.is_archive? # true
|
193
|
-
zip.is_compressed? # false
|
194
|
-
targz.is_archive? # true
|
195
|
-
targz.is_compressed? # true
|
196
|
-
|
197
|
-
# decompress or compress files
|
198
|
-
big_file = bz2.decompress! # skip the ! to preserve the original
|
199
|
-
new_bz2 = big_file.compress!
|
200
|
-
|
201
|
-
# extract and package archives
|
202
|
-
zip.extract # files show up in working directory
|
203
|
-
tarbz2.extract # no need to decompress first
|
204
|
-
new_tarbz2 = IMW.open!('/new/archive.tar').create(['/path1', '/path/2']).compress!
|
205
|
-
|
206
|
-
== Parsing and Emitting Data
|
207
|
-
|
208
|
-
IMW encourages you to work with native Ruby data structures as much as
|
209
|
-
possible by providing methods to parse common data formats directly
|
210
|
-
into Arrays, Hashes and Strings.
|
211
|
-
|
212
|
-
Some data formats (CSV, JSON, YAML) have a structure which trivially
|
213
|
-
maps to Arrays, Hashes, and Strings and so these formats can
|
214
|
-
immediately be parsed.
|
215
|
-
|
216
|
-
Other formats (XML, HTML, flat files, &c.) use data structures which
|
217
|
-
do not map as readily to Arrays, Hashes, and Strings and so these will
|
218
|
-
have to be parsed first.
|
219
|
-
|
220
|
-
=== Ruby-like Data Formats
|
221
|
-
|
222
|
-
These include delimited formats such as CSV and TSV as well as
|
223
|
-
"restricted tree-like" formats like JSON and YAML.
|
224
|
-
|
225
|
-
For the case of delimited data, consider the following CSV file:
|
226
|
-
|
227
|
-
ID,Name,Genus,Species
|
228
|
-
001,Gray-bellied Night Monkey,Aotus,lemurinus
|
229
|
-
002,Panamanian Night Monkey,Aotus,zonalis
|
230
|
-
003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
|
231
|
-
004,Gray-handed Night Monkey,Aotus,griseimembra
|
232
|
-
005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
|
233
|
-
006,Brumback's Night Monkey,Aotus,brumbacki
|
234
|
-
007,Three-striped Night Monkey,Aotus,trivirgatus
|
235
|
-
008,Spix's Night Monkey,Aotus,vociferans
|
236
|
-
009,Malaysian Lar Gibbon,Hylobates,lar lar
|
237
|
-
010,Carpenter's Lar Gibbon,Hylobates,lar carpenteri
|
238
|
-
|
239
|
-
It trivially maps to an Array of Arrays:
|
240
|
-
|
241
|
-
data = IMW.open('/path/to/monkeys.csv').load
|
242
|
-
puts data.class
|
243
|
-
=> Array
|
244
|
-
puts data.first.class
|
245
|
-
=> Array
|
246
|
-
data.each { |row| puts row.inspect }
|
247
|
-
=> ["ID", "Name", "Genus", "Species"]
|
248
|
-
["001", "Gray-bellied Night Monkey", "Aotus", "lemurinus"]
|
249
|
-
["002", "Panamanian Night Monkey", "Aotus", "zonalis"]
|
250
|
-
...
|
251
|
-
["010", "Carpenter's Lar Gibbon", "Hylobates", "lar carpenteri"]
|
252
|
-
|
253
|
-
Conversely, any array of arrays trivially maps to a delimited file.
|
254
|
-
Here we write out all rows where the genus is _Hylobates_ to a TSV
|
255
|
-
file:
|
256
|
-
|
257
|
-
hylobates = data.find_all { |row| row[2] == 'Hylobates' }
|
258
|
-
hylobates.dump('/path/to/monkeys.tsv')
|
259
|
-
|
260
|
-
IMW automatically formats the output as TSV and writes it to the
|
261
|
-
specified path.
|
262
|
-
|
263
|
-
Similarly, restricted tree-like formats like JSON and YAML, which map
|
264
|
-
cleanly onto Hashes, Arrays, and Strings, can also be automatically
|
265
|
-
parsed and emitted by IMW.
|
266
|
-
|
267
|
-
Consider a YAML version of the above CSV data:
|
268
|
-
|
269
|
-
- id: 001
|
270
|
-
name: Gray-bellied Night Monkey
|
271
|
-
genus: Aotus
|
272
|
-
species: lemurinus
|
273
|
-
- id: 002
|
274
|
-
name: Panamanian Night Monkey
|
275
|
-
genus: Aotus
|
276
|
-
species: zonalis
|
277
|
-
- id: 003
|
278
|
-
name: Hernández-Camacho's Night Monkey
|
279
|
-
genus: Aotus
|
280
|
-
species: jorgehernandezi
|
281
|
-
...
|
282
|
-
- id: 010
|
283
|
-
name: Carpenter's Lar Gibbon
|
284
|
-
genus: Hylobates
|
285
|
-
species: lar carpenteri
|
286
|
-
|
287
|
-
This trivially maps to an Array of Hashes and so we can perform the
|
288
|
-
exact same filtration for YAML and JSON as we did for CSV and TSV (in
|
289
|
-
a one-liner!):
|
290
|
-
|
291
|
-
data = IMW.open('/path/to/monkeys.yaml').load
|
292
|
-
hylobates = data.find_all { |monkey| monkey['genus'] == 'Hylobates' }
|
293
|
-
hylobates.dump('/path/to/monkeys.json')
|
294
|
-
|
295
|
-
Resources in these Ruby-like data formats also extend themselves with
|
296
|
-
Enumerable so goodies like +map+, +find_all+, &c. are available. This
|
297
|
-
enables converting YAML to JSON with a one-liner:
|
298
|
-
|
299
|
-
IMW.open('/path/to/monkeys.yaml').find_all { |monkey| monkey['genus'] == 'Hylobates' }.dump('/path/to/monkeys.json')
|
300
|
-
|
301
|
-
=== Parsing More General Data Formats
|
302
|
-
|
303
|
-
Some data formats are structured but do not map readily to Hashes,
|
304
|
-
Arrays, and Strings (XML, HTML, &c.) while other data formats lack
|
305
|
-
structure or have a peculiar structure (flat files in arbitrary
|
306
|
-
syntax).
|
307
|
-
|
308
|
-
In both these cases the data needs to be parsed before it's usable.
|
309
|
-
For the XML and HTML type data formats, IMW uses Hpricot and the
|
310
|
-
IMW::Parsers::HtmlParser for parsing. For flat files, IMW provides
|
311
|
-
the IMW::Parsers::LineParser and the IMW::Parsers::RegexpParser.
|
312
|
-
|
313
|
-
HTML files, on the other hand, are more complex and typically have to
|
314
|
-
be parsed before being converted to plain Ruby objects:
|
315
|
-
|
316
|
-
# Grab a tiny link from the bottom of Google's homepage
|
317
|
-
doc = IMW.open('http://www.google.com') # IMW::Files::Html
|
318
|
-
doc.parse('p a') # 'Privacy'
|
319
|
-
|
320
|
-
More complex parsers can also be built
|
321
|
-
|
322
|
-
# Grab each row from an HTML table
|
323
|
-
doc = IMW.open('/path/to/data.html')
|
324
|
-
doc.parse :employees => ["tr", { :name => "td.name", :address => "td.address" } ]
|
325
|
-
#=> [{:name => "John Chimpo", :address => "123 Fake St."}, {...}, ... ]
|
326
|
-
|
327
|
-
see IMW::Parsers::HtmlParser for details on parsing HTML (and similar)
|
328
|
-
files. Examine the other parsers in IMW::Parsers for details on
|
329
|
-
parsing other data formats.
|
330
|
-
|
331
|
-
= The IMW Workflow
|
332
|
-
|
333
|
-
The workflow of IMW can be roughly summarized as follows:
|
334
|
-
|
335
|
-
rip::
|
336
|
-
|
337
|
-
Data is obtained from a source. IMW allows you to download data
|
338
|
-
from the web, obtain it by querying databases, or use other services
|
339
|
-
like rsync, ftp, &c. to pull it in from another computer.
|
340
|
-
|
341
|
-
parse::
|
342
|
-
|
343
|
-
Data is parsed into Ruby objects and stored.
|
344
|
-
|
345
|
-
fix::
|
346
|
-
|
347
|
-
All the parsed data is combined, reconciled, and further processed
|
348
|
-
into a final form.
|
349
|
-
|
350
|
-
package::
|
351
|
-
|
352
|
-
The data is archived and compressed as necessary and moved to an
|
353
|
-
outbox, staging server, S3 bucket, &c.
|
354
|
-
|
355
|
-
Not all datasets
|
356
|
-
|
357
|
-
|
358
|
-
= Datasets
|
359
|
-
|
360
|
-
== Tasks & Dependencies
|
361
|
-
|
362
|
-
== Directory Structure
|
363
|
-
|
364
|
-
== Records
|
365
|
-
|
366
|
-
= IMW on the Command Line
|
367
|
-
|
368
|
-
== Repositories
|
369
|
-
|
370
|
-
== Running Tasks
|
371
|
-
|
data/bin/imw
DELETED
data/bin/tsv_to_json.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
require 'rubygems'
|
3
|
-
require 'json/ext'
|
4
|
-
require 'configliere'
|
5
|
-
|
6
|
-
Settings.use :commandline, :define
|
7
|
-
Settings.define :json_keys, :description => "A comma separated list of keys, in the order to be read from source."
|
8
|
-
# Settings.resolve!
|
9
|
-
|
10
|
-
module TSVtoJSON
|
11
|
-
|
12
|
-
# def initialize
|
13
|
-
# keys unless Settings.keys.nil?
|
14
|
-
# end
|
15
|
-
|
16
|
-
def keys
|
17
|
-
@keys ||= Settings.json_keys.split(",")
|
18
|
-
end
|
19
|
-
|
20
|
-
def into_json record, exclude=[]
|
21
|
-
json_hash = Hash.new
|
22
|
-
keys.each_with_index do |key, index|
|
23
|
-
next if exclude.include?(key)
|
24
|
-
json_hash[key] = record[index]
|
25
|
-
end
|
26
|
-
return JSON.generate(json_hash)
|
27
|
-
end
|
28
|
-
|
29
|
-
end
|
data/etc/imwrc.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
#-*- mode: ruby -*-
|
2
|
-
#
|
3
|
-
# h2. etc/imwrc -- default site-wide imw configuration file
|
4
|
-
#
|
5
|
-
# == About
|
6
|
-
#
|
7
|
-
# This file contains the site-wide configuration settings for this
|
8
|
-
# installation of the Infinite Monkeywrench. Settings here override
|
9
|
-
# the defaults in <tt>lib/imw/utils/config.rb</tt> (see the
|
10
|
-
# documentation for that file for more detail on the variables that
|
11
|
-
# can be configured here) but will in turn be overwritten by settings
|
12
|
-
# in the <tt>~/.imwrc</tt> file in each user's directory (though the
|
13
|
-
# location of this file can be customized).
|
14
|
-
#
|
15
|
-
# At the present moment, all settings are stored as plain Ruby files
|
16
|
-
# (though they may lack the <tt>.rb</tt> extension). As the IMW
|
17
|
-
# develops, these will be replaced by YAML files which will be parsed
|
18
|
-
# by <tt>lib/imw/utils/config.rb</tt>.
|
19
|
-
#
|
20
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
21
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
22
|
-
# License:: GPL 3.0
|
23
|
-
# Website:: http://infinitemonkeywrench.org/
|
24
|
-
|
25
|
-
module IMW
|
26
|
-
end
|
data/examples/dataset.rb
DELETED