imw 0.2.18 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +7 -26
- data/Gemfile.lock +13 -38
- data/{LICENSE → LICENSE.txt} +1 -1
- data/README.textile +35 -0
- data/Rakefile +45 -22
- data/VERSION +1 -1
- data/examples/foo.rb +19 -0
- data/examples/html_selector.rb +22 -0
- data/examples/nes_game_list.csv +625 -0
- data/examples/nes_gamespot.csv +1371 -0
- data/examples/nes_nintendo.csv +624 -0
- data/examples/nes_unlicensed.csv +89 -0
- data/examples/nes_wikipedia.csv +710 -0
- data/examples/nibbler_test.rb +24 -0
- data/examples/script.rb +19 -0
- data/lib/imw.rb +28 -140
- data/lib/imw/error.rb +9 -0
- data/lib/imw/recordizer.rb +8 -0
- data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
- data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
- data/lib/imw/resource.rb +3 -119
- data/lib/imw/serializer.rb +7 -0
- data/lib/imw/serializer/json_serializer.rb +17 -0
- data/lib/imw/uri.rb +41 -0
- data/spec/resource_spec.rb +78 -0
- data/spec/uri_spec.rb +55 -0
- metadata +81 -232
- data/README.rdoc +0 -371
- data/bin/imw +0 -5
- data/bin/tsv_to_json.rb +0 -29
- data/etc/imwrc.rb +0 -26
- data/examples/dataset.rb +0 -12
- data/examples/metadata.yml +0 -10
- data/lib/imw/archives.rb +0 -120
- data/lib/imw/archives/rar.rb +0 -19
- data/lib/imw/archives/tar.rb +0 -19
- data/lib/imw/archives/tarbz2.rb +0 -73
- data/lib/imw/archives/targz.rb +0 -73
- data/lib/imw/archives/zip.rb +0 -51
- data/lib/imw/boot.rb +0 -87
- data/lib/imw/compressed_files.rb +0 -94
- data/lib/imw/compressed_files/bz2.rb +0 -16
- data/lib/imw/compressed_files/compressible.rb +0 -75
- data/lib/imw/compressed_files/gz.rb +0 -16
- data/lib/imw/dataset.rb +0 -125
- data/lib/imw/dataset/paths.rb +0 -29
- data/lib/imw/dataset/workflow.rb +0 -195
- data/lib/imw/formats.rb +0 -33
- data/lib/imw/formats/delimited.rb +0 -170
- data/lib/imw/formats/excel.rb +0 -100
- data/lib/imw/formats/json.rb +0 -41
- data/lib/imw/formats/pdf.rb +0 -71
- data/lib/imw/formats/sgml.rb +0 -69
- data/lib/imw/formats/yaml.rb +0 -41
- data/lib/imw/metadata.rb +0 -83
- data/lib/imw/metadata/contains_metadata.rb +0 -54
- data/lib/imw/metadata/dsl.rb +0 -111
- data/lib/imw/metadata/field.rb +0 -37
- data/lib/imw/metadata/has_metadata.rb +0 -98
- data/lib/imw/metadata/has_summary.rb +0 -57
- data/lib/imw/metadata/schema.rb +0 -17
- data/lib/imw/parsers.rb +0 -8
- data/lib/imw/parsers/flat.rb +0 -44
- data/lib/imw/parsers/html_parser.rb +0 -387
- data/lib/imw/parsers/html_parser/matchers.rb +0 -289
- data/lib/imw/parsers/line_parser.rb +0 -87
- data/lib/imw/parsers/regexp_parser.rb +0 -72
- data/lib/imw/repository.rb +0 -12
- data/lib/imw/runner.rb +0 -118
- data/lib/imw/schemes.rb +0 -23
- data/lib/imw/schemes/ftp.rb +0 -142
- data/lib/imw/schemes/hdfs.rb +0 -251
- data/lib/imw/schemes/http.rb +0 -165
- data/lib/imw/schemes/local.rb +0 -409
- data/lib/imw/schemes/remote.rb +0 -119
- data/lib/imw/schemes/s3.rb +0 -143
- data/lib/imw/schemes/sql.rb +0 -129
- data/lib/imw/tools.rb +0 -12
- data/lib/imw/tools/aggregator.rb +0 -148
- data/lib/imw/tools/archiver.rb +0 -220
- data/lib/imw/tools/downloader.rb +0 -63
- data/lib/imw/tools/extension_analyzer.rb +0 -114
- data/lib/imw/tools/summarizer.rb +0 -83
- data/lib/imw/tools/transferer.rb +0 -167
- data/lib/imw/utils.rb +0 -74
- data/lib/imw/utils/dynamically_extendable.rb +0 -137
- data/lib/imw/utils/error.rb +0 -59
- data/lib/imw/utils/extensions/hpricot.rb +0 -34
- data/lib/imw/utils/has_uri.rb +0 -131
- data/lib/imw/utils/log.rb +0 -92
- data/lib/imw/utils/misc.rb +0 -57
- data/lib/imw/utils/paths.rb +0 -146
- data/lib/imw/utils/uri.rb +0 -59
- data/lib/imw/utils/uuid.rb +0 -33
- data/lib/imw/utils/validate.rb +0 -38
- data/lib/imw/utils/version.rb +0 -11
- data/spec/data/formats/delimited/sample.csv +0 -131
- data/spec/data/formats/delimited/sample.tsv +0 -131
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +0 -1
- data/spec/data/formats/none/sample +0 -650
- data/spec/data/formats/sgml/sample.xml +0 -617
- data/spec/data/formats/text/sample.txt +0 -650
- data/spec/data/formats/yaml/sample.yaml +0 -410
- data/spec/data/schema-tabular.yaml +0 -11
- data/spec/imw/archives/rar_spec.rb +0 -16
- data/spec/imw/archives/tar_spec.rb +0 -16
- data/spec/imw/archives/tarbz2_spec.rb +0 -24
- data/spec/imw/archives/targz_spec.rb +0 -21
- data/spec/imw/archives/zip_spec.rb +0 -16
- data/spec/imw/archives_spec.rb +0 -77
- data/spec/imw/compressed_files/bz2_spec.rb +0 -15
- data/spec/imw/compressed_files/compressible_spec.rb +0 -36
- data/spec/imw/compressed_files/gz_spec.rb +0 -15
- data/spec/imw/compressed_files_spec.rb +0 -47
- data/spec/imw/dataset/paths_spec.rb +0 -32
- data/spec/imw/dataset/workflow_spec.rb +0 -41
- data/spec/imw/formats/delimited_spec.rb +0 -44
- data/spec/imw/formats/excel_spec.rb +0 -55
- data/spec/imw/formats/json_spec.rb +0 -18
- data/spec/imw/formats/sgml_spec.rb +0 -24
- data/spec/imw/formats/yaml_spec.rb +0 -19
- data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
- data/spec/imw/metadata/field_spec.rb +0 -25
- data/spec/imw/metadata/has_metadata_spec.rb +0 -58
- data/spec/imw/metadata/has_summary_spec.rb +0 -32
- data/spec/imw/metadata/schema_spec.rb +0 -24
- data/spec/imw/metadata_spec.rb +0 -86
- data/spec/imw/parsers/line_parser_spec.rb +0 -96
- data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
- data/spec/imw/resource_spec.rb +0 -32
- data/spec/imw/schemes/hdfs_spec.rb +0 -67
- data/spec/imw/schemes/http_spec.rb +0 -19
- data/spec/imw/schemes/local_spec.rb +0 -165
- data/spec/imw/schemes/remote_spec.rb +0 -38
- data/spec/imw/schemes/s3_spec.rb +0 -31
- data/spec/imw/schemes/sql_spec.rb +0 -3
- data/spec/imw/tools/aggregator_spec.rb +0 -71
- data/spec/imw/tools/archiver_spec.rb +0 -120
- data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
- data/spec/imw/tools/summarizer_spec.rb +0 -8
- data/spec/imw/tools/transferer_spec.rb +0 -195
- data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
- data/spec/imw/utils/has_uri_spec.rb +0 -61
- data/spec/imw/utils/paths_spec.rb +0 -10
- data/spec/imw/utils/shared_paths_spec.rb +0 -29
- data/spec/imw_spec.rb +0 -14
- data/spec/rcov.opts +0 -1
- data/spec/spec_helper.rb +0 -31
- data/spec/support/custom_matchers.rb +0 -28
- data/spec/support/file_contents_matcher.rb +0 -30
- data/spec/support/paths_matcher.rb +0 -66
- data/spec/support/random.rb +0 -213
- data/spec/support/without_regard_to_order_matcher.rb +0 -41
data/README.rdoc
DELETED
|
@@ -1,371 +0,0 @@
|
|
|
1
|
-
= What is the Infinite Monkeywrench?
|
|
2
|
-
|
|
3
|
-
The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
|
|
4
|
-
tasks of acquiring, extracting, transforming, loading, and packaging
|
|
5
|
-
data. It has the following goals:
|
|
6
|
-
|
|
7
|
-
* Minimize programmer time even at the expense of increasing run
|
|
8
|
-
time.
|
|
9
|
-
|
|
10
|
-
* Take data through a full transformation from raw source to packaged
|
|
11
|
-
purity in as few lines of code as possible.
|
|
12
|
-
|
|
13
|
-
* Treat data records as objects as much as possible.
|
|
14
|
-
|
|
15
|
-
* Reuse, rather than reimplement, better code that already exists in other
|
|
16
|
-
libraries (FasterCSV, I'm talkin' to you).
|
|
17
|
-
|
|
18
|
-
* Make what's common easy without making what's uncommon impossible.
|
|
19
|
-
|
|
20
|
-
* Work with messy data as well as clean data.
|
|
21
|
-
|
|
22
|
-
* Let you incorporate your own tools wherever you choose to.
|
|
23
|
-
|
|
24
|
-
The Infinite Monkeywrench is a powerful tool but it is not always the
|
|
25
|
-
right tool. IMW is **not** designed for
|
|
26
|
-
|
|
27
|
-
* Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan] and Monkeyshines[http://github.com/infochimps/monkeyshines])
|
|
28
|
-
|
|
29
|
-
* Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
|
|
30
|
-
|
|
31
|
-
* Data mining or statistical analysis
|
|
32
|
-
|
|
33
|
-
* Visualization
|
|
34
|
-
|
|
35
|
-
= Installation
|
|
36
|
-
|
|
37
|
-
IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
|
|
38
|
-
|
|
39
|
-
You'll have to add <tt>http://gemcutter.org</tt> to your gem sources
|
|
40
|
-
if it isn't there already:
|
|
41
|
-
|
|
42
|
-
$ gem sources -a http://gemcutter.org
|
|
43
|
-
|
|
44
|
-
and then install IMW
|
|
45
|
-
|
|
46
|
-
$ sudo gem install imw
|
|
47
|
-
|
|
48
|
-
In all the examples that follow it is assumed that you've installed
|
|
49
|
-
IMW and required it in a script via
|
|
50
|
-
|
|
51
|
-
require 'rubygems'
|
|
52
|
-
require 'imw'
|
|
53
|
-
|
|
54
|
-
= Resources
|
|
55
|
-
|
|
56
|
-
IMW is centered around processing resources. A resource can be
|
|
57
|
-
_anything_ with a URI and you create one using IMW.open.
|
|
58
|
-
|
|
59
|
-
csv = IMW.open('/path/to/my_data.csv')
|
|
60
|
-
html = IMW.open('http://www.example.com/history/march_2007')
|
|
61
|
-
|
|
62
|
-
IMW dynamically extends a resource with modules appropriate to it when
|
|
63
|
-
you open it. In the above case, +csv+ would be automatically extended
|
|
64
|
-
by the IMW::Formats::Csv module, among others:
|
|
65
|
-
|
|
66
|
-
csv.modules
|
|
67
|
-
=> [IMW::Schemes::Local::Base, IMW::Schemes::Local::LocalFile, IMW::CompressedFiles::Compressible, IMW::Formats::Csv]
|
|
68
|
-
|
|
69
|
-
while +html+ will use a different set
|
|
70
|
-
|
|
71
|
-
html.modules
|
|
72
|
-
=> [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
|
|
73
|
-
|
|
74
|
-
Consult the documentation for the modules a resource uses to learn
|
|
75
|
-
what it can do.
|
|
76
|
-
|
|
77
|
-
== Including/Excluding Resource Modules
|
|
78
|
-
|
|
79
|
-
You can exercise finer control of the resource modules IMW will extend
|
|
80
|
-
a given resource with by passing the <tt>:as</tt> and <tt>:without</tt> options.
|
|
81
|
-
|
|
82
|
-
IMW.open('http://www.infochimps.com/some_raw_data', :without => [IMW::Formats::Html]).resource_modules
|
|
83
|
-
=> [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP]
|
|
84
|
-
|
|
85
|
-
IMW.open('http://www.infochimps.com', :as => [IMW::Formats::Json]).resource_modules
|
|
86
|
-
=> [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Json]
|
|
87
|
-
|
|
88
|
-
You can also pass <tt>:no_modules</tt> to not use any resource
|
|
89
|
-
modules.
|
|
90
|
-
|
|
91
|
-
== Handlers and Custom Resource Modules
|
|
92
|
-
|
|
93
|
-
IMW chooses which resource modules to extend an IMW::Resource by
|
|
94
|
-
iterating through an array of handlers, passing the resource to the
|
|
95
|
-
handler, and letting the handler's response (true/false) determine
|
|
96
|
-
whether or not to extend the resource with the module accompanying the
|
|
97
|
-
handler.
|
|
98
|
-
|
|
99
|
-
You can hook into this process by defining your own handlers. To
|
|
100
|
-
define a handler which should extend with +MyModule+ any resource with
|
|
101
|
-
a URI ending with <tt>.xxx</tt>
|
|
102
|
-
|
|
103
|
-
IMW::Resource.register_handler MyModule, /\.xxx$/
|
|
104
|
-
|
|
105
|
-
You can also use a Proc instead of a Regexp for more control. If the
|
|
106
|
-
result of the Proc called with a resource evaluates to true
|
|
107
|
-
then the resource will be extended by +MyModule+.
|
|
108
|
-
|
|
109
|
-
IMW::Resource.register_handler MyModule, Proc.new { |resource| resource.is_local? && resource.path =~ /\.xxx$/ }
|
|
110
|
-
|
|
111
|
-
= Manipulating Paths
|
|
112
|
-
|
|
113
|
-
IMW holds a registry of paths that you can define on the fly or store
|
|
114
|
-
in a configuration file. Defining paths once in the registry and then
|
|
115
|
-
referring to them forever after by name helps keep your code flexible
|
|
116
|
-
as well as portable.
|
|
117
|
-
|
|
118
|
-
IMW.add_path(:dropbox, "/var/www/public")
|
|
119
|
-
IMW.path_to(:dropbox)
|
|
120
|
-
=> "/var/www/public"
|
|
121
|
-
|
|
122
|
-
You can combine named references together dynamically.
|
|
123
|
-
|
|
124
|
-
IMW.add_path(:raw, :dropbox, "raw")
|
|
125
|
-
IMW.path_to(:raw)
|
|
126
|
-
=> "/var/www/public/raw"
|
|
127
|
-
IMW.path_to(:raw, "my/dataset")
|
|
128
|
-
=> "/var/www/public/raw/my/dataset"
|
|
129
|
-
|
|
130
|
-
Altering one path will update others
|
|
131
|
-
|
|
132
|
-
IMW.add_path(:dropbox, "/data") # redefines :raw
|
|
133
|
-
IMW.path_to(:raw, "my/dataset")
|
|
134
|
-
=> "/data/raw/my/dataset" # not /var/www/public/raw/my/dataset
|
|
135
|
-
|
|
136
|
-
= Files & Directories
|
|
137
|
-
|
|
138
|
-
Use IMW.open to open files. The object returned by IMW.open obeys the
|
|
139
|
-
usual semantics of a File object but it has new methods to manipulate
|
|
140
|
-
and parse the file.
|
|
141
|
-
|
|
142
|
-
f1 = IMW.open("/path/to/file")
|
|
143
|
-
f1.read() # does what you think
|
|
144
|
-
|
|
145
|
-
# class methods from File are available
|
|
146
|
-
f1.size
|
|
147
|
-
f1.writeable?
|
|
148
|
-
|
|
149
|
-
# use a bang or a 'w' to write
|
|
150
|
-
writable_file = IMW.open!('/some/path') # similar to open('/some/path', 'w')
|
|
151
|
-
|
|
152
|
-
# as well as methods to manipulate the file on the filesystem
|
|
153
|
-
f2 = f1.cp("/new/path/to/file") # also try cp_to_dir
|
|
154
|
-
f1.exist? # true
|
|
155
|
-
f3 = f1.mv("/yet/another/path") # also try mv_to_dir
|
|
156
|
-
f1.exist? # false
|
|
157
|
-
|
|
158
|
-
IMW also knows about directories
|
|
159
|
-
|
|
160
|
-
d = IMW.open('/tmp')
|
|
161
|
-
d.directory? # true
|
|
162
|
-
d['*'] # Dir['/tmp/*']
|
|
163
|
-
d.mv('/parent/dir')
|
|
164
|
-
|
|
165
|
-
== Remote Files
|
|
166
|
-
|
|
167
|
-
Many operations defined for files are also defined for arbitrary URIs
|
|
168
|
-
through the <tt>open-uri</tt> library.
|
|
169
|
-
|
|
170
|
-
Files can readily be opened, read, and downloaded from the Internet
|
|
171
|
-
|
|
172
|
-
site = IMW.open('http://infochimps.org') #=> Recognized as an HTML document
|
|
173
|
-
site.read() # does what you think
|
|
174
|
-
site.cp('/some/local/path')
|
|
175
|
-
site.exist? # will work in many cases
|
|
176
|
-
|
|
177
|
-
(writing to remote sources isn't enabled yet).
|
|
178
|
-
|
|
179
|
-
== Archives & Compressed Files
|
|
180
|
-
|
|
181
|
-
IMW works with a variety of archiving and compression programs to make
|
|
182
|
-
packaging/unpackaging data easy.
|
|
183
|
-
|
|
184
|
-
bz2 = IMW.open('/path/to/big_file.bz2')
|
|
185
|
-
zip = IMW.open('/path/to/archive.zip')
|
|
186
|
-
targz = IMW.open('/path/to/archive.tar.gz')
|
|
187
|
-
|
|
188
|
-
IMW recognizes file properties by extension
|
|
189
|
-
|
|
190
|
-
bz2.is_archive? # false
|
|
191
|
-
bz2.is_compressed? # true
|
|
192
|
-
zip.is_archive? # true
|
|
193
|
-
zip.is_compressed? # false
|
|
194
|
-
targz.is_archive? # true
|
|
195
|
-
targz.is_compressed? # true
|
|
196
|
-
|
|
197
|
-
# decompress or compress files
|
|
198
|
-
big_file = bz2.decompress! # skip the ! to preserve the original
|
|
199
|
-
new_bz2 = big_file.compress!
|
|
200
|
-
|
|
201
|
-
# extract and package archives
|
|
202
|
-
zip.extract # files show up in working directory
|
|
203
|
-
targz.extract # no need to decompress first
|
|
204
|
-
new_tarbz2 = IMW.open!('/new/archive.tar').create(['/path1', '/path/2']).compress!
|
|
205
|
-
|
|
206
|
-
== Parsing and Emitting Data
|
|
207
|
-
|
|
208
|
-
IMW encourages you to work with native Ruby data structures as much as
|
|
209
|
-
possible by providing methods to parse common data formats directly
|
|
210
|
-
into Arrays, Hashes and Strings.
|
|
211
|
-
|
|
212
|
-
Some data formats (CSV, JSON, YAML) have a structure which trivially
|
|
213
|
-
maps to Arrays, Hashes, and Strings and so these formats can
|
|
214
|
-
immediately be parsed.
|
|
215
|
-
|
|
216
|
-
Other formats (XML, HTML, flat files, &c.) use data structures which
|
|
217
|
-
do not map as readily to Arrays, Hashes, and Strings and so these will
|
|
218
|
-
have to be parsed first.
|
|
219
|
-
|
|
220
|
-
=== Ruby-like Data Formats
|
|
221
|
-
|
|
222
|
-
These include delimited formats such as CSV and TSV as well as
|
|
223
|
-
"restricted tree-like" formats like JSON and YAML.
|
|
224
|
-
|
|
225
|
-
For the case of delimited data, consider the following CSV file:
|
|
226
|
-
|
|
227
|
-
ID,Name,Genus,Species
|
|
228
|
-
001,Gray-bellied Night Monkey,Aotus,lemurinus
|
|
229
|
-
002,Panamanian Night Monkey,Aotus,zonalis
|
|
230
|
-
003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
|
|
231
|
-
004,Gray-handed Night Monkey,Aotus,griseimembra
|
|
232
|
-
005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
|
|
233
|
-
006,Brumback's Night Monkey,Aotus,brumbacki
|
|
234
|
-
007,Three-striped Night Monkey,Aotus,trivirgatus
|
|
235
|
-
008,Spix's Night Monkey,Aotus,vociferans
|
|
236
|
-
009,Malaysian Lar Gibbon,Hylobates,lar lar
|
|
237
|
-
010,Carpenter's Lar Gibbon,Hylobates,lar carpenteri
|
|
238
|
-
|
|
239
|
-
It trivially maps to an Array of Arrays:
|
|
240
|
-
|
|
241
|
-
data = IMW.open('/path/to/monkeys.csv').load
|
|
242
|
-
puts data.class
|
|
243
|
-
=> Array
|
|
244
|
-
puts data.first.class
|
|
245
|
-
=> Array
|
|
246
|
-
data.each { |row| puts row.inspect }
|
|
247
|
-
=> ["ID", "Name", "Genus", "Species"]
|
|
248
|
-
["001", "Gray-bellied Night Monkey", "Aotus", "lemurinus"]
|
|
249
|
-
["002", "Panamanian Night Monkey", "Aotus", "zonalis"]
|
|
250
|
-
...
|
|
251
|
-
["010", "Carpenter's Lar Gibbon", "Hylobates", "lar carpenteri"]
|
|
252
|
-
|
|
253
|
-
Conversely, any array of arrays trivially maps to a delimited file.
|
|
254
|
-
Here we write out all rows where the genus is _Hylobates_ to a TSV
|
|
255
|
-
file:
|
|
256
|
-
|
|
257
|
-
hylobates = data.find_all { |row| row[2] == 'Hylobates' }
|
|
258
|
-
hylobates.dump('/path/to/monkeys.tsv')
|
|
259
|
-
|
|
260
|
-
IMW automatically formats the output as TSV and writes it to the
|
|
261
|
-
specified path.
|
|
262
|
-
|
|
263
|
-
Similarly, restricted tree-like formats like JSON and YAML, which map
|
|
264
|
-
cleanly onto Hashes, Arrays, and Strings, can also be automatically
|
|
265
|
-
parsed and emitted by IMW.
|
|
266
|
-
|
|
267
|
-
Consider a YAML version of the above CSV data:
|
|
268
|
-
|
|
269
|
-
- id: 001
|
|
270
|
-
name: Gray-bellied Night Monkey
|
|
271
|
-
genus: Aotus
|
|
272
|
-
species: lemurinus
|
|
273
|
-
- id: 002
|
|
274
|
-
name: Panamanian Night Monkey
|
|
275
|
-
genus: Aotus
|
|
276
|
-
species: zonalis
|
|
277
|
-
- id: 003
|
|
278
|
-
name: Hernández-Camacho's Night Monkey
|
|
279
|
-
genus: Aotus
|
|
280
|
-
species: jorgehernandezi
|
|
281
|
-
...
|
|
282
|
-
- id: 010
|
|
283
|
-
name: Carpenter's Lar Gibbon
|
|
284
|
-
genus: Hylobates
|
|
285
|
-
species: lar carpenteri
|
|
286
|
-
|
|
287
|
-
This trivially maps to an Array of Hashes and so we can perform the
|
|
288
|
-
exact same filtration for YAML and JSON as we did for CSV and TSV (in
|
|
289
|
-
a one-liner!):
|
|
290
|
-
|
|
291
|
-
data = IMW.open('/path/to/monkeys.yaml').load
|
|
292
|
-
hylobates = data.find_all { |monkey| monkey['genus'] == 'Hylobates' }
|
|
293
|
-
hylobates.dump('/path/to/monkeys.json')
|
|
294
|
-
|
|
295
|
-
Resources in these Ruby-like data formats also extend themselves with
|
|
296
|
-
Enumerable so goodies like +map+, +find_all+, &c. are available. This
|
|
297
|
-
enables converting YAML to JSON with a one-liner:
|
|
298
|
-
|
|
299
|
-
IMW.open('/path/to/monkeys.yaml').find_all { |monkey| monkey['genus'] == 'Hylobates' }.dump('/path/to/monkeys.json')
|
|
300
|
-
|
|
301
|
-
=== Parsing More General Data Formats
|
|
302
|
-
|
|
303
|
-
Some data formats are structured but do not map readily to Hashes,
|
|
304
|
-
Arrays, and Strings (XML, HTML, &c.) while other data formats lack
|
|
305
|
-
structure or have a peculiar structure (flat files in arbitrary
|
|
306
|
-
syntax).
|
|
307
|
-
|
|
308
|
-
In both these cases the data needs to be parsed before it's usable.
|
|
309
|
-
For the XML and HTML type data formats, IMW uses Hpricot and the
|
|
310
|
-
IMW::Parsers::HtmlParser for parsing. For flat files, IMW provides
|
|
311
|
-
the IMW::Parsers::LineParser and the IMW::Parsers::RegexpParser.
|
|
312
|
-
|
|
313
|
-
HTML files, on the other hand, are more complex and typically have to
|
|
314
|
-
be parsed before being converted to plain Ruby objects:
|
|
315
|
-
|
|
316
|
-
# Grab a tiny link from the bottom of Google's homepage
|
|
317
|
-
doc = IMW.open('http://www.google.com') # IMW::Files::Html
|
|
318
|
-
doc.parse('p a') # 'Privacy'
|
|
319
|
-
|
|
320
|
-
More complex parsers can also be built
|
|
321
|
-
|
|
322
|
-
# Grab each row from an HTML table
|
|
323
|
-
doc = IMW.open('/path/to/data.html')
|
|
324
|
-
doc.parse :employees => ["tr", { :name => "td.name", :address => "td.address" } ]
|
|
325
|
-
#=> [{:name => "John Chimpo", :address => "123 Fake St."}, {...}, ... ]
|
|
326
|
-
|
|
327
|
-
see IMW::Parsers::HtmlParser for details on parsing HTML (and similar)
|
|
328
|
-
files. Examine the other parsers in IMW::Parsers for details on
|
|
329
|
-
parsing other data formats.
|
|
330
|
-
|
|
331
|
-
= The IMW Workflow
|
|
332
|
-
|
|
333
|
-
The workflow of IMW can be roughly summarized as follows:
|
|
334
|
-
|
|
335
|
-
rip::
|
|
336
|
-
|
|
337
|
-
Data is obtained from a source. IMW allows you to download data
|
|
338
|
-
from the web, obtain it by querying databases, or use other services
|
|
339
|
-
like rsync, ftp, &c. to pull it in from another computer.
|
|
340
|
-
|
|
341
|
-
parse::
|
|
342
|
-
|
|
343
|
-
Data is parsed into Ruby objects and stored.
|
|
344
|
-
|
|
345
|
-
fix::
|
|
346
|
-
|
|
347
|
-
All the parsed data is combined, reconciled, and further processed
|
|
348
|
-
into a final form.
|
|
349
|
-
|
|
350
|
-
package::
|
|
351
|
-
|
|
352
|
-
The data is archived and compressed as necessary and moved to an
|
|
353
|
-
outbox, staging server, S3 bucket, &c.
|
|
354
|
-
|
|
355
|
-
Not all datasets
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
= Datasets
|
|
359
|
-
|
|
360
|
-
== Tasks & Dependencies
|
|
361
|
-
|
|
362
|
-
== Directory Structure
|
|
363
|
-
|
|
364
|
-
== Records
|
|
365
|
-
|
|
366
|
-
= IMW on the Command Line
|
|
367
|
-
|
|
368
|
-
== Repositories
|
|
369
|
-
|
|
370
|
-
== Running Tasks
|
|
371
|
-
|
data/bin/imw
DELETED
data/bin/tsv_to_json.rb
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env ruby
|
|
2
|
-
require 'rubygems'
|
|
3
|
-
require 'json/ext'
|
|
4
|
-
require 'configliere'
|
|
5
|
-
|
|
6
|
-
Settings.use :commandline, :define
|
|
7
|
-
Settings.define :json_keys, :description => "A comma separated list of keys, in the order to be read from source."
|
|
8
|
-
# Settings.resolve!
|
|
9
|
-
|
|
10
|
-
module TSVtoJSON
|
|
11
|
-
|
|
12
|
-
# def initialize
|
|
13
|
-
# keys unless Settings.keys.nil?
|
|
14
|
-
# end
|
|
15
|
-
|
|
16
|
-
def keys
|
|
17
|
-
@keys ||= Settings.json_keys.split(",")
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
def into_json record, exclude=[]
|
|
21
|
-
json_hash = Hash.new
|
|
22
|
-
keys.each_with_index do |key, index|
|
|
23
|
-
next if exclude.include?(key)
|
|
24
|
-
json_hash[key] = record[index]
|
|
25
|
-
end
|
|
26
|
-
return JSON.generate(json_hash)
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
end
|
data/etc/imwrc.rb
DELETED
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
#-*- mode: ruby -*-
|
|
2
|
-
#
|
|
3
|
-
# h2. etc/imwrc -- default site-wide imw configuration file
|
|
4
|
-
#
|
|
5
|
-
# == About
|
|
6
|
-
#
|
|
7
|
-
# This file contains the site-wide configuration settings for this
|
|
8
|
-
# installation of the Infinite Monkeywrench. Settings here override
|
|
9
|
-
# the defaults in <tt>lib/imw/utils/config.rb</tt> (see the
|
|
10
|
-
# documentation for that file for more detail on the variables that
|
|
11
|
-
# can be configured here) but will in turn be overwritten by settings
|
|
12
|
-
# in the <tt>~/.imwrc</tt> file in each user's directory (though the
|
|
13
|
-
# location of this file can be customized).
|
|
14
|
-
#
|
|
15
|
-
# At the present moment, all settings are stored as plain Ruby files
|
|
16
|
-
# (though they may lack the <tt>.rb</tt> extension). As the IMW
|
|
17
|
-
# develops, these will be replaced by YAML files which will be parsed
|
|
18
|
-
# by <tt>lib/imw/utils/config.rb</tt>.
|
|
19
|
-
#
|
|
20
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
|
21
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
|
22
|
-
# License:: GPL 3.0
|
|
23
|
-
# Website:: http://infinitemonkeywrench.org/
|
|
24
|
-
|
|
25
|
-
module IMW
|
|
26
|
-
end
|
data/examples/dataset.rb
DELETED