imw 0.2.18 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,371 +0,0 @@
1
- = What is the Infinite Monkeywrench?
2
-
3
- The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
4
- tasks of acquiring, extracting, transforming, loading, and packaging
5
- data. It has the following goals:
6
-
7
- * Minimize programmer time even at the expense of increasing run
8
- time.
9
-
10
- * Take data through a full transformation from raw source to packaged
11
- purity in as few lines of code as possible.
12
-
13
- * Treat data records as objects as much as possible.
14
-
15
- * Reuse, rather than reimplement, better code that already exists in other
16
- libraries (FasterCSV, I'm talkin' to you).
17
-
18
- * Make what's common easy without making what's uncommon impossible.
19
-
20
- * Work with messy data as well as clean data.
21
-
22
- * Let you incorporate your own tools wherever you choose to.
23
-
24
- The Infinite Monkeywrench is a powerful tool but it is not always the
25
- right tool. IMW is **not** designed for
26
-
27
- * Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan] and Monkeyshines[http://github.com/infochimps/monkeyshines])
28
-
29
- * Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
30
-
31
- * Data mining or statistical analysis
32
-
33
- * Visualization
34
-
35
- = Installation
36
-
37
- IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
38
-
39
- You'll have to add <tt>http://gemcutter.org</tt> to your gem sources
40
- if it isn't there already:
41
-
42
- $ gem sources -a http://gemcutter.org
43
-
44
- and then install IMW
45
-
46
- $ sudo gem install imw
47
-
48
- In all the examples that follow it is assumed that you've installed
49
- IMW and required it in a script via
50
-
51
- require 'rubygems'
52
- require 'imw'
53
-
54
- = Resources
55
-
56
- IMW is centered around processing resources. A resource can be
57
- _anything_ with a URI and you create one using IMW.open.
58
-
59
- csv = IMW.open('/path/to/my_data.csv')
60
- html = IMW.open('http://www.example.com/history/march_2007')
61
-
62
- IMW dynamically extends a resource with modules appropriate to it when
63
- you open it. In the above case, +csv+ would be automatically extended
64
- by the IMW::Formats::Csv module, among others:
65
-
66
- csv.modules
67
- => [IMW::Schemes::Local::Base, IMW::Schemes::Local::LocalFile, IMW::CompressedFiles::Compressible, IMW::Formats::Csv]
68
-
69
- while +html+ will use a different set
70
-
71
- html.modules
72
- => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
73
-
74
- Consult the documentation for the modules a resource uses to learn
75
- what it can do.
76
-
77
- == Including/Excluding Resource Modules
78
-
79
- You can exercise finer control of the resource modules IMW will extend
80
- a given resource with by passing the <tt>:as</tt> and <tt>:without</tt> options.
81
-
82
- IMW.open('http://www.infochimps.com/some_raw_data', :without => [IMW::Formats::Html]).resource_modules
83
- => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP]
84
-
85
- IMW.open('http://www.infochimps.com', :as => [IMW::Formats::Json]).resource_modules
86
- => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Json]
87
-
88
- You can also pass <tt>:no_modules</tt> to not use any resource
89
- modules.
90
-
91
- == Handlers and Custom Resource Modules
92
-
93
- IMW chooses which resource modules to extend an IMW::Resource by
94
- iterating through an array of handlers, passing the resource to the
95
- handler, and letting the handler's response (true/false) determine
96
- whether or not to extend the resource with the module accompanying the
97
- handler.
98
-
99
- You can hook into this process by defining your own handlers. To
100
- define a handler which should extend with +MyModule+ any resource with
101
- a URI ending with <tt>.xxx</tt>
102
-
103
- IMW::Resource.register_handler MyModule, /\.xxx$/
104
-
105
- You can also use a Proc instead of a Regexp for more control. If the
106
- result of calling the Proc with a resource evaluates to true
107
- then the resource will be extended by +MyModule+.
108
-
109
- IMW::Resource.register_handler MyModule, Proc.new { |resource| resource.is_local? && resource.path =~ /\.xxx$/ }
110
-
111
- = Manipulating Paths
112
-
113
- IMW holds a registry of paths that you can define on the fly or store
114
- in a configuration file. Defining paths once in the registry and then
115
- referring to them forever after by name helps keep your code flexible
116
- as well as portable.
117
-
118
- IMW.add_path(:dropbox, "/var/www/public")
119
- IMW.path_to(:dropbox)
120
- => "/var/www/public"
121
-
122
- You can combine named references together dynamically.
123
-
124
- IMW.add_path(:raw, :dropbox, "raw")
125
- IMW.path_to(:raw)
126
- => "/var/www/public/raw"
127
- IMW.path_to(:raw, "my/dataset")
128
- => "/var/www/public/raw/my/dataset"
129
-
130
- Altering one path will update others
131
-
132
- IMW.add_path(:dropbox, "/data") # redefines :raw
133
- IMW.path_to(:raw, "my/dataset")
134
- => "/data/raw/my/dataset" # not /var/www/public/raw/my/dataset
135
-
136
- = Files & Directories
137
-
138
- Use IMW.open to open files. The object returned by IMW.open obeys the
139
- usual semantics of a File object but it has new methods to manipulate
140
- and parse the file.
141
-
142
- f1 = IMW.open("/path/to/file")
143
- f1.read() # does what you think
144
-
145
- # class methods from File are available
146
- f1.size
147
- f1.writeable?
148
-
149
- # use a bang or a 'w' to write
150
- writable_file = IMW.open!('/some/path') # similar to open('/some/path', 'w')
151
-
152
- # as well as methods to manipulate the file on the filesystem
153
- f2 = f1.cp("/new/path/to/file") # also try cp_to_dir
154
- f1.exist? # true
155
- f3 = f1.mv("/yet/another/path") # also try mv_to_dir
156
- f1.exist? # false
157
-
158
- IMW also knows about directories
159
-
160
- d = IMW.open('/tmp')
161
- d.directory? # true
162
- d['*'] # Dir['/tmp/*']
163
- d.mv('/parent/dir')
164
-
165
- == Remote Files
166
-
167
- Many operations defined for files are also defined for arbitrary URIs
168
- through the <tt>open-uri</tt> library.
169
-
170
- Files can readily be opened, read, and downloaded from the Internet
171
-
172
- site = IMW.open('http://infochimps.org') #=> Recognized as an HTML document
173
- site.read() # does what you think
174
- site.cp('/some/local/path')
175
- site.exist? # will work in many cases
176
-
177
- (writing to remote sources isn't enabled yet).
178
-
179
- == Archives & Compressed Files
180
-
181
- IMW works with a variety of archiving and compression programs to make
182
- packaging/unpackaging data easy.
183
-
184
- bz2 = IMW.open('/path/to/big_file.bz2')
185
- zip = IMW.open('/path/to/archive.zip')
186
- targz = IMW.open('/path/to/archive.tar.gz')
187
-
188
- IMW recognizes file properties by extension
189
-
190
- bz2.is_archive? # false
191
- bz2.is_compressed? # true
192
- zip.is_archive? # true
193
- zip.is_compressed? # false
194
- targz.is_archive? # true
195
- targz.is_compressed? # true
196
-
197
- # decompress or compress files
198
- big_file = bz2.decompress! # skip the ! to preserve the original
199
- new_bz2 = big_file.compress!
200
-
201
- # extract and package archives
202
- zip.extract # files show up in working directory
203
- tarbz2.extract # no need to decompress first
204
- new_tarbz2 = IMW.open!('/new/archive.tar').create(['/path1', '/path/2']).compress!
205
-
206
- == Parsing and Emitting Data
207
-
208
- IMW encourages you to work with native Ruby data structures as much as
209
- possible by providing methods to parse common data formats directly
210
- into Arrays, Hashes and Strings.
211
-
212
- Some data formats (CSV, JSON, YAML) have a structure which trivially
213
- maps to Arrays, Hashes, and Strings and so these formats can
214
- immediately be parsed.
215
-
216
- Other formats (XML, HTML, flat files, &c.) use data structures which
217
- do not map as readily to Arrays, Hashes, and Strings and so these will
218
- have to be parsed first.
219
-
220
- === Ruby-like Data Formats
221
-
222
- These include delimited formats such as CSV and TSV as well as
223
- "restricted tree-like" formats like JSON and YAML.
224
-
225
- For the case of delimited data, consider the following CSV file:
226
-
227
- ID,Name,Genus,Species
228
- 001,Gray-bellied Night Monkey,Aotus,lemurinus
229
- 002,Panamanian Night Monkey,Aotus,zonalis
230
- 003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
231
- 004,Gray-handed Night Monkey,Aotus,griseimembra
232
- 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
233
- 006,Brumback's Night Monkey,Aotus,brumbacki
234
- 007,Three-striped Night Monkey,Aotus,trivirgatus
235
- 008,Spix's Night Monkey,Aotus,vociferans
236
- 009,Malaysian Lar Gibbon,Hylobates,lar lar
237
- 010,Carpenter's Lar Gibbon,Hylobates,lar carpenteri
238
-
239
- It trivially maps to an Array of Arrays:
240
-
241
- data = IMW.open('/path/to/monkeys.csv').load
242
- puts data.class
243
- => Array
244
- puts data.first.class
245
- => Array
246
- data.each { |row| puts row.inspect }
247
- => ["ID", "Name", "Genus", "Species"]
248
- ["001", "Gray-bellied Night Monkey", "Aotus", "lemurinus"]
249
- ["002", "Panamanian Night Monkey", "Aotus", "zonalis"]
250
- ...
251
- ["010", "Carpenter's Lar Gibbon", "Hylobates", "lar carpenteri"]
252
-
253
- Conversely, any array of arrays trivially maps to a delimited file.
254
- Here we write out all rows where the genus is _Hylobates_ to a TSV
255
- file:
256
-
257
- hylobates = data.find_all { |row| row[2] == 'Hylobates' }
258
- hylobates.dump('/path/to/monkeys.tsv')
259
-
260
- IMW automatically formats the output as TSV and writes it to the
261
- specified path.
262
-
263
- Similarly, restricted tree-like formats like JSON and YAML, which map
264
- cleanly onto Hashes, Arrays, and Strings, can also be automatically
265
- parsed and emitted by IMW.
266
-
267
- Consider a YAML version of the above CSV data:
268
-
269
- - id: 001
270
- name: Gray-bellied Night Monkey
271
- genus: Aotus
272
- species: lemurinus
273
- - id: 002
274
- name: Panamanian Night Monkey
275
- genus: Aotus
276
- species: zonalis
277
- - id: 003
278
- name: Hernández-Camacho's Night Monkey
279
- genus: Aotus
280
- species: jorgehernandezi
281
- ...
282
- - id: 010
283
- name: Carpenter's Lar Gibbon
284
- genus: Hylobates
285
- species: lar carpenteri
286
-
287
- This trivially maps to an Array of Hashes and so we can perform the
288
- exact same filtration for YAML and JSON as we did for CSV and TSV (in
289
- a one-liner!):
290
-
291
- data = IMW.open('/path/to/monkeys.yaml').load
292
- hylobates = data.find_all { |monkey| monkey['genus'] == 'Hylobates' }
293
- hylobates.dump('/path/to/monkeys.json')
294
-
295
- Resources in these Ruby-like data formats also extend themselves with
296
- Enumerable so goodies like +map+, +find_all+, &c. are available. This
297
- enables converting YAML to JSON with a one-liner:
298
-
299
- IMW.open('/path/to/monkeys.yaml').find_all { |monkey| monkey['genus'] == 'Hylobates' }.dump('/path/to/monkeys.json')
300
-
301
- === Parsing More General Data Formats
302
-
303
- Some data formats are structured but do not map readily to Hashes,
304
- Arrays, and Strings (XML, HTML, &c.) while other data formats lack
305
- structure or have a peculiar structure (flat files in arbitrary
306
- syntax).
307
-
308
- In both these cases the data needs to be parsed before it's usable.
309
- For the XML and HTML type data formats, IMW uses Hpricot and the
310
- IMW::Parsers::HtmlParser for parsing. For flat files, IMW provides
311
- the IMW::Parsers::LineParser and the IMW::Parsers::RegexpParser.
312
-
313
- HTML files, on the other hand, are more complex and typically have to
314
- be parsed before being converted to plain Ruby objects:
315
-
316
- # Grab a tiny link from the bottom of Google's homepage
317
- doc = IMW.open('http://www.google.com') # IMW::Files::Html
318
- doc.parse('p a') # 'Privacy'
319
-
320
- More complex parsers can also be built
321
-
322
- # Grab each row from an HTML table
323
- doc = IMW.open('/path/to/data.html')
324
- doc.parse :employees => ["tr", { :name => "td.name", :address => "td.address" } ]
325
- #=> [{:name => "John Chimpo", :address => "123 Fake St."}, {...}, ... ]
326
-
327
- See IMW::Parsers::HtmlParser for details on parsing HTML (and similar)
328
- files. Examine the other parsers in IMW::Parsers for details on
329
- parsing other data formats.
330
-
331
- = The IMW Workflow
332
-
333
- The workflow of IMW can be roughly summarized as follows:
334
-
335
- rip::
336
-
337
- Data is obtained from a source. IMW allows you to download data
338
- from the web, obtain it by querying databases, or use other services
339
- like rsync, ftp, &c. to pull it in from another computer.
340
-
341
- parse::
342
-
343
- Data is parsed into Ruby objects and stored.
344
-
345
- fix::
346
-
347
- All the parsed data is combined, reconciled, and further processed
348
- into a final form.
349
-
350
- package::
351
-
352
- The data is archived and compressed as necessary and moved to an
353
- outbox, staging server, S3 bucket, &c.
354
-
355
- Not all datasets
356
-
357
-
358
- = Datasets
359
-
360
- == Tasks & Dependencies
361
-
362
- == Directory Structure
363
-
364
- == Records
365
-
366
- = IMW on the Command Line
367
-
368
- == Repositories
369
-
370
- == Running Tasks
371
-
data/bin/imw DELETED
@@ -1,5 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $:.unshift File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
3
- require 'imw/runner'
4
- exit IMW::Runner.new(*ARGV.dup).run!
5
-
@@ -1,29 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'json/ext'
4
- require 'configliere'
5
-
6
- Settings.use :commandline, :define
7
- Settings.define :json_keys, :description => "A comma separated list of keys, in the order to be read from source."
8
- # Settings.resolve!
9
-
10
- module TSVtoJSON
11
-
12
- # def initialize
13
- # keys unless Settings.keys.nil?
14
- # end
15
-
16
- def keys
17
- @keys ||= Settings.json_keys.split(",")
18
- end
19
-
20
- def into_json record, exclude=[]
21
- json_hash = Hash.new
22
- keys.each_with_index do |key, index|
23
- next if exclude.include?(key)
24
- json_hash[key] = record[index]
25
- end
26
- return JSON.generate(json_hash)
27
- end
28
-
29
- end
@@ -1,26 +0,0 @@
1
- #-*- mode: ruby -*-
2
- #
3
- # h2. etc/imwrc -- default site-wide imw configuration file
4
- #
5
- # == About
6
- #
7
- # This file contains the site-wide configuration settings for this
8
- # installation of the Infinite Monkeywrench. Settings here override
9
- # the defaults in <tt>lib/imw/utils/config.rb</tt> (see the
10
- # documentation for that file for more detail on the variables that
11
- # can be configured here) but will in turn be overwritten by settings
12
- # in the <tt>~/.imwrc</tt> file in each user's directory (though the
13
- # location of this file can be customized).
14
- #
15
- # At the present moment, all settings are stored as plain Ruby files
16
- # (though they may lack the <tt>.rb</tt> extension). As the IMW
17
- # develops, these will be replaced by YAML files which will be parsed
18
- # by <tt>lib/imw/utils/config.rb</tt>.
19
- #
20
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
21
- # Copyright:: Copyright (c) 2008 infochimps.org
22
- # License:: GPL 3.0
23
- # Website:: http://infinitemonkeywrench.org/
24
-
25
- module IMW
26
- end
@@ -1,12 +0,0 @@
1
- require 'imw'
2
- dataset = IMW::Dataset.new :handle => 'test'
3
-
4
- dataset.rip do
5
- IMW.open("http://path/to/somre/resource.html").cp(dataset.path_to(:ripd), 'original_data.html')
6
- end
7
-
8
- dataset.parse do
9
- #...
10
- end
11
-
12
-
@@ -1,10 +0,0 @@
1
- ---
2
-
3
- metadata:
4
- /path/to/my/data.tsv:
5
- - name: foobar
6
- datatype: foobar
7
- doc: foobar
8
-
9
-
10
-