imw 0.2.18 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. data/Gemfile +7 -26
  2. data/Gemfile.lock +13 -38
  3. data/{LICENSE → LICENSE.txt} +1 -1
  4. data/README.textile +35 -0
  5. data/Rakefile +45 -22
  6. data/VERSION +1 -1
  7. data/examples/foo.rb +19 -0
  8. data/examples/html_selector.rb +22 -0
  9. data/examples/nes_game_list.csv +625 -0
  10. data/examples/nes_gamespot.csv +1371 -0
  11. data/examples/nes_nintendo.csv +624 -0
  12. data/examples/nes_unlicensed.csv +89 -0
  13. data/examples/nes_wikipedia.csv +710 -0
  14. data/examples/nibbler_test.rb +24 -0
  15. data/examples/script.rb +19 -0
  16. data/lib/imw.rb +28 -140
  17. data/lib/imw/error.rb +9 -0
  18. data/lib/imw/recordizer.rb +8 -0
  19. data/lib/imw/recordizer/html_selector_recordizer.rb +86 -0
  20. data/lib/imw/recordizer/string_slice_recordizer.rb +39 -0
  21. data/lib/imw/resource.rb +3 -119
  22. data/lib/imw/serializer.rb +7 -0
  23. data/lib/imw/serializer/json_serializer.rb +17 -0
  24. data/lib/imw/uri.rb +41 -0
  25. data/spec/resource_spec.rb +78 -0
  26. data/spec/uri_spec.rb +55 -0
  27. metadata +81 -232
  28. data/README.rdoc +0 -371
  29. data/bin/imw +0 -5
  30. data/bin/tsv_to_json.rb +0 -29
  31. data/etc/imwrc.rb +0 -26
  32. data/examples/dataset.rb +0 -12
  33. data/examples/metadata.yml +0 -10
  34. data/lib/imw/archives.rb +0 -120
  35. data/lib/imw/archives/rar.rb +0 -19
  36. data/lib/imw/archives/tar.rb +0 -19
  37. data/lib/imw/archives/tarbz2.rb +0 -73
  38. data/lib/imw/archives/targz.rb +0 -73
  39. data/lib/imw/archives/zip.rb +0 -51
  40. data/lib/imw/boot.rb +0 -87
  41. data/lib/imw/compressed_files.rb +0 -94
  42. data/lib/imw/compressed_files/bz2.rb +0 -16
  43. data/lib/imw/compressed_files/compressible.rb +0 -75
  44. data/lib/imw/compressed_files/gz.rb +0 -16
  45. data/lib/imw/dataset.rb +0 -125
  46. data/lib/imw/dataset/paths.rb +0 -29
  47. data/lib/imw/dataset/workflow.rb +0 -195
  48. data/lib/imw/formats.rb +0 -33
  49. data/lib/imw/formats/delimited.rb +0 -170
  50. data/lib/imw/formats/excel.rb +0 -100
  51. data/lib/imw/formats/json.rb +0 -41
  52. data/lib/imw/formats/pdf.rb +0 -71
  53. data/lib/imw/formats/sgml.rb +0 -69
  54. data/lib/imw/formats/yaml.rb +0 -41
  55. data/lib/imw/metadata.rb +0 -83
  56. data/lib/imw/metadata/contains_metadata.rb +0 -54
  57. data/lib/imw/metadata/dsl.rb +0 -111
  58. data/lib/imw/metadata/field.rb +0 -37
  59. data/lib/imw/metadata/has_metadata.rb +0 -98
  60. data/lib/imw/metadata/has_summary.rb +0 -57
  61. data/lib/imw/metadata/schema.rb +0 -17
  62. data/lib/imw/parsers.rb +0 -8
  63. data/lib/imw/parsers/flat.rb +0 -44
  64. data/lib/imw/parsers/html_parser.rb +0 -387
  65. data/lib/imw/parsers/html_parser/matchers.rb +0 -289
  66. data/lib/imw/parsers/line_parser.rb +0 -87
  67. data/lib/imw/parsers/regexp_parser.rb +0 -72
  68. data/lib/imw/repository.rb +0 -12
  69. data/lib/imw/runner.rb +0 -118
  70. data/lib/imw/schemes.rb +0 -23
  71. data/lib/imw/schemes/ftp.rb +0 -142
  72. data/lib/imw/schemes/hdfs.rb +0 -251
  73. data/lib/imw/schemes/http.rb +0 -165
  74. data/lib/imw/schemes/local.rb +0 -409
  75. data/lib/imw/schemes/remote.rb +0 -119
  76. data/lib/imw/schemes/s3.rb +0 -143
  77. data/lib/imw/schemes/sql.rb +0 -129
  78. data/lib/imw/tools.rb +0 -12
  79. data/lib/imw/tools/aggregator.rb +0 -148
  80. data/lib/imw/tools/archiver.rb +0 -220
  81. data/lib/imw/tools/downloader.rb +0 -63
  82. data/lib/imw/tools/extension_analyzer.rb +0 -114
  83. data/lib/imw/tools/summarizer.rb +0 -83
  84. data/lib/imw/tools/transferer.rb +0 -167
  85. data/lib/imw/utils.rb +0 -74
  86. data/lib/imw/utils/dynamically_extendable.rb +0 -137
  87. data/lib/imw/utils/error.rb +0 -59
  88. data/lib/imw/utils/extensions/hpricot.rb +0 -34
  89. data/lib/imw/utils/has_uri.rb +0 -131
  90. data/lib/imw/utils/log.rb +0 -92
  91. data/lib/imw/utils/misc.rb +0 -57
  92. data/lib/imw/utils/paths.rb +0 -146
  93. data/lib/imw/utils/uri.rb +0 -59
  94. data/lib/imw/utils/uuid.rb +0 -33
  95. data/lib/imw/utils/validate.rb +0 -38
  96. data/lib/imw/utils/version.rb +0 -11
  97. data/spec/data/formats/delimited/sample.csv +0 -131
  98. data/spec/data/formats/delimited/sample.tsv +0 -131
  99. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +0 -11
  100. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -16
  101. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +0 -11
  102. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -22
  103. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -22
  104. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -12
  105. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -13
  106. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -22
  107. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -22
  108. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +0 -10
  109. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +0 -15
  110. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +0 -10
  111. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +0 -21
  112. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +0 -21
  113. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +0 -11
  114. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +0 -12
  115. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +0 -21
  116. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +0 -21
  117. data/spec/data/formats/excel/sample.xls +0 -0
  118. data/spec/data/formats/json/sample.json +0 -1
  119. data/spec/data/formats/none/sample +0 -650
  120. data/spec/data/formats/sgml/sample.xml +0 -617
  121. data/spec/data/formats/text/sample.txt +0 -650
  122. data/spec/data/formats/yaml/sample.yaml +0 -410
  123. data/spec/data/schema-tabular.yaml +0 -11
  124. data/spec/imw/archives/rar_spec.rb +0 -16
  125. data/spec/imw/archives/tar_spec.rb +0 -16
  126. data/spec/imw/archives/tarbz2_spec.rb +0 -24
  127. data/spec/imw/archives/targz_spec.rb +0 -21
  128. data/spec/imw/archives/zip_spec.rb +0 -16
  129. data/spec/imw/archives_spec.rb +0 -77
  130. data/spec/imw/compressed_files/bz2_spec.rb +0 -15
  131. data/spec/imw/compressed_files/compressible_spec.rb +0 -36
  132. data/spec/imw/compressed_files/gz_spec.rb +0 -15
  133. data/spec/imw/compressed_files_spec.rb +0 -47
  134. data/spec/imw/dataset/paths_spec.rb +0 -32
  135. data/spec/imw/dataset/workflow_spec.rb +0 -41
  136. data/spec/imw/formats/delimited_spec.rb +0 -44
  137. data/spec/imw/formats/excel_spec.rb +0 -55
  138. data/spec/imw/formats/json_spec.rb +0 -18
  139. data/spec/imw/formats/sgml_spec.rb +0 -24
  140. data/spec/imw/formats/yaml_spec.rb +0 -19
  141. data/spec/imw/metadata/contains_metadata_spec.rb +0 -56
  142. data/spec/imw/metadata/field_spec.rb +0 -25
  143. data/spec/imw/metadata/has_metadata_spec.rb +0 -58
  144. data/spec/imw/metadata/has_summary_spec.rb +0 -32
  145. data/spec/imw/metadata/schema_spec.rb +0 -24
  146. data/spec/imw/metadata_spec.rb +0 -86
  147. data/spec/imw/parsers/line_parser_spec.rb +0 -96
  148. data/spec/imw/parsers/regexp_parser_spec.rb +0 -42
  149. data/spec/imw/resource_spec.rb +0 -32
  150. data/spec/imw/schemes/hdfs_spec.rb +0 -67
  151. data/spec/imw/schemes/http_spec.rb +0 -19
  152. data/spec/imw/schemes/local_spec.rb +0 -165
  153. data/spec/imw/schemes/remote_spec.rb +0 -38
  154. data/spec/imw/schemes/s3_spec.rb +0 -31
  155. data/spec/imw/schemes/sql_spec.rb +0 -3
  156. data/spec/imw/tools/aggregator_spec.rb +0 -71
  157. data/spec/imw/tools/archiver_spec.rb +0 -120
  158. data/spec/imw/tools/extension_analyzer_spec.rb +0 -153
  159. data/spec/imw/tools/summarizer_spec.rb +0 -8
  160. data/spec/imw/tools/transferer_spec.rb +0 -195
  161. data/spec/imw/utils/dynamically_extendable_spec.rb +0 -69
  162. data/spec/imw/utils/has_uri_spec.rb +0 -61
  163. data/spec/imw/utils/paths_spec.rb +0 -10
  164. data/spec/imw/utils/shared_paths_spec.rb +0 -29
  165. data/spec/imw_spec.rb +0 -14
  166. data/spec/rcov.opts +0 -1
  167. data/spec/spec_helper.rb +0 -31
  168. data/spec/support/custom_matchers.rb +0 -28
  169. data/spec/support/file_contents_matcher.rb +0 -30
  170. data/spec/support/paths_matcher.rb +0 -66
  171. data/spec/support/random.rb +0 -213
  172. data/spec/support/without_regard_to_order_matcher.rb +0 -41
@@ -1,371 +0,0 @@
1
- = What is the Infinite Monkeywrench?
2
-
3
- The Infinite Monkeywrench (IMW) is a Ruby framework to simplify the
4
- tasks of acquiring, extracting, transforming, loading, and packaging
5
- data. It has the following goals:
6
-
7
- * Minimize programmer time even at the expense of increasing run
8
- time.
9
-
10
- * Take data through a full transformation from raw source to packaged
11
- purity in as few lines of code as possible.
12
-
13
- * Treat data records as objects as much as possible.
14
-
15
- * Reuse, rather than reimplement, better code that already exists in other
16
- libraries (FasterCSV, I'm talkin' to you).
17
-
18
- * Make what's common easy without making what's uncommon impossible.
19
-
20
- * Work with messy data as well as clean data.
21
-
22
- * Let you incorporate your own tools wherever you choose to.
23
-
24
- The Infinite Monkeywrench is a powerful tool but it is not always the
25
- right tool. IMW is **not** designed for
26
-
27
- * Scraping vast amounts of data (use Wuclan[http://github.com/infochimps/wuclan] and Monkeyshines[http://github.com/infochimps/monkeyshines])
28
-
29
- * Really, really big datasets (use Wukong[http://github.com/infochimps/wukong] and Hadoop[http://hadoop.apache.org])
30
-
31
- * Data mining or statistical analysis
32
-
33
- * Visualization
34
-
35
- = Installation
36
-
37
- IMW is hosted on Gemcutter[http://gemcutter.org] so it's easy to install.
38
-
39
- You'll have to add <tt>http://gemcutter.org</tt> to your gem sources
40
- if it isn't there already:
41
-
42
- $ gem sources -a http://gemcutter.org
43
-
44
- and then install IMW
45
-
46
- $ sudo gem install imw
47
-
48
- In all the examples that follow it is assumed that you've installed
49
- IMW and required it in a script via
50
-
51
- require 'rubygems'
52
- require 'imw'
53
-
54
- = Resources
55
-
56
- IMW is centered around processing resources. A resource can be
57
- _anything_ with a URI and you create one using IMW.open.
58
-
59
- csv = IMW.open('/path/to/my_data.csv')
60
- html = IMW.open('http://www.example.com/history/march_2007')
61
-
62
- IMW dynamically extends a resource with modules appropriate to it when
63
- you open it. In the above case, +csv+ would be automatically extended
64
- by the IMW::Formats::Csv module, among others:
65
-
66
- csv.modules
67
- => [IMW::Schemes::Local::Base, IMW::Schemes::Local::LocalFile, IMW::CompressedFiles::Compressible, IMW::Formats::Csv]
68
-
69
- while +html+ will use a different set
70
-
71
- html.modules
72
- => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Html]
73
-
74
- Consult the documentation for the modules a resource uses to learn
75
- what it can do.
76
-
77
- == Including/Excluding Resource Modules
78
-
79
- You can exercise finer control of the resource modules IMW will extend
80
- a given resource with by passing the <tt>:as</tt> and <tt>:without</tt> options.
81
-
82
- IMW.open('http://www.infochimps.com/some_raw_data', :without => [IMW::Formats::Html]).resource_modules
83
- => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP]
84
-
85
- IMW.open('http://www.infochimps.com', :as => [IMW::Formats::Json]).resource_modules
86
- => [IMW::Schemes::Remote::Base, IMW::Schemes::Remote::RemoteFile, IMW::Schemes::HTTP, IMW::Formats::Json]
87
-
88
- You can also pass <tt>:no_modules</tt> to not use any resource
89
- modules.
90
-
91
- == Handlers and Custom Resource Modules
92
-
93
- IMW chooses which resource modules to extend an IMW::Resource by
94
- iterating through an array of handlers, passing the resource to the
95
- handler, and letting the handler's response (true/false) determine
96
- whether or not to extend the resource with the module accompanying the
97
- handler.
98
-
99
- You can hook into this process by defining your own handlers. To
100
- define a handler which should extend with +MyModule+ any resource with
101
- a URI ending with <tt>.xxx</tt>
102
-
103
- IMW::Resource.register_handler MyModule, /\.xxx$/
104
-
105
- You can also use a Proc instead of a Regexp for more control. If the
106
- result of the Proc called with a resource evaluates to true
107
- then the resource will be extended by +MyModule+.
108
-
109
- IMW::Resource.register_handler MyModule, Proc.new { |resource| resource.is_local? && resource.path =~ /\.xxx$/ }
110
-
111
- = Manipulating Paths
112
-
113
- IMW holds a registry of paths that you can define on the fly or store
114
- in a configuration file. Defining paths once in the registry and then
115
- referring to them forever after by name helps keep your code flexible
116
- as well as portable.
117
-
118
- IMW.add_path(:dropbox, "/var/www/public")
119
- IMW.path_to(:dropbox)
120
- => "/var/www/public"
121
-
122
- You can combine named references together dynamically.
123
-
124
- IMW.add_path(:raw, :dropbox, "raw")
125
- IMW.path_to(:raw)
126
- => "/var/www/public/raw"
127
- IMW.path_to(:raw, "my/dataset")
128
- => "/var/www/public/raw/my/dataset"
129
-
130
- Altering one path will update others
131
-
132
- IMW.add_path(:dropbox, "/data") # redefines :raw
133
- IMW.path_to(:raw, "my/dataset")
134
- => "/data/raw/my/dataset" # not /var/www/public/raw/my/dataset
135
-
136
- = Files & Directories
137
-
138
- Use IMW.open to open files. The object returned by IMW.open obeys the
139
- usual semantics of a File object but it has new methods to manipulate
140
- and parse the file.
141
-
142
- f1 = IMW.open("/path/to/file")
143
- f1.read() # does what you think
144
-
145
- # class methods from File are available
146
- f1.size
147
- f1.writeable?
148
-
149
- # use a bang or a 'w' to write
150
- writable_file = IMW.open!('/some/path') # similar to open('/some/path', 'w')
151
-
152
- # as well as methods to manipulate the file on the filesystem
153
- f2 = f1.cp("/new/path/to/file") # also try cp_to_dir
154
- f1.exist? # true
155
- f3 = f1.mv("/yet/another/path") # also try mv_to_dir
156
- f1.exist? # false
157
-
158
- IMW also knows about directories
159
-
160
- d = IMW.open('/tmp')
161
- d.directory? # true
162
- d['*'] # Dir['/tmp/*']
163
- d.mv('/parent/dir')
164
-
165
- == Remote Files
166
-
167
- Many operations defined for files are also defined for arbitrary URIs
168
- through the <tt>open-uri</tt> library.
169
-
170
- Files can readily be opened, read, and downloaded from the Internet
171
-
172
- site = IMW.open('http://infochimps.org') #=> Recognized as an HTML document
173
- site.read() # does what you think
174
- site.cp('/some/local/path')
175
- site.exist? # will work in many cases
176
-
177
- (writing to remote sources isn't enabled yet).
178
-
179
- == Archives & Compressed Files
180
-
181
- IMW works with a variety of archiving and compression programs to make
182
- packaging/unpackaging data easy.
183
-
184
- bz2 = IMW.open('/path/to/big_file.bz2')
185
- zip = IMW.open('/path/to/archive.zip')
186
- targz = IMW.open('/path/to/archive.tar.gz')
187
-
188
- IMW recognizes file properties by extension
189
-
190
- bz2.is_archive? # false
191
- bz2.is_compressed? # true
192
- zip.is_archive? # true
193
- zip.is_compressed? # false
194
- targz.is_archive? # true
195
- targz.is_compressed? # true
196
-
197
- # decompress or compress files
198
- big_file = bz2.decompress! # skip the ! to preserve the original
199
- new_bz2 = big_file.compress!
200
-
201
- # extract and package archives
202
- zip.extract # files show up in working directory
203
- tarbz2.extract # no need to decompress first
204
- new_tarbz2 = IMW.open!('/new/archive.tar').create(['/path1', '/path/2']).compress!
205
-
206
- == Parsing and Emitting Data
207
-
208
- IMW encourages you to work with native Ruby data structures as much as
209
- possible by providing methods to parse common data formats directly
210
- into Arrays, Hashes and Strings.
211
-
212
- Some data formats (CSV, JSON, YAML) have a structure which trivially
213
- maps to Arrays, Hashes, and Strings and so these formats can
214
- immediately be parsed.
215
-
216
- Other formats (XML, HTML, flat files, &c.) use data structures which
217
- do not map as readily to Arrays, Hashes, and Strings and so these will
218
- have to be parsed first.
219
-
220
- === Ruby-like Data Formats
221
-
222
- These include delimited formats such as CSV and TSV as well as
223
- "restricted tree-like" formats like JSON and YAML.
224
-
225
- For the case of delimited data, consider the following CSV file:
226
-
227
- ID,Name,Genus,Species
228
- 001,Gray-bellied Night Monkey,Aotus,lemurinus
229
- 002,Panamanian Night Monkey,Aotus,zonalis
230
- 003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
231
- 004,Gray-handed Night Monkey,Aotus,griseimembra
232
- 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
233
- 006,Brumback's Night Monkey,Aotus,brumbacki
234
- 007,Three-striped Night Monkey,Aotus,trivirgatus
235
- 008,Spix's Night Monkey,Aotus,vociferans
236
- 009,Malaysian Lar Gibbon,Hylobates,lar lar
237
- 010,Carpenter's Lar Gibbon,Hylobates,lar carpenteri
238
-
239
- It trivially maps to an Array of Arrays:
240
-
241
- data = IMW.open('/path/to/monkeys.csv').load
242
- puts data.class
243
- => Array
244
- puts data.first.class
245
- => Array
246
- data.each { |row| puts row.inspect }
247
- => ["ID", "Name", "Genus", "Species"]
248
- ["001", "Gray-bellied Night Monkey", "Aotus", "lemurinus"]
249
- ["002", "Panamanian Night Monkey", "Aotus", "zonalis"]
250
- ...
251
- ["010", "Carpenter's Lar Gibbon", "Hylobates", "lar carpenteri"]
252
-
253
- Conversely, any array of arrays trivially maps to a delimited file.
254
- Here we write out all rows where the genus is _Hylobates_ to a TSV
255
- file:
256
-
257
- hylobates = data.find_all { |row| row[2] == 'Hylobates' }
258
- hylobates.dump('/path/to/monkeys.tsv')
259
-
260
- IMW automatically formats the output as TSV and writes it to the
261
- specified path.
262
-
263
- Similarly, restricted tree-like formats like JSON and YAML, which map
264
- cleanly onto Hashes, Arrays, and Strings, can also be automatically
265
- parsed and emitted by IMW.
266
-
267
- Consider a YAML version of the above CSV data:
268
-
269
- - id: 001
270
- name: Gray-bellied Night Monkey
271
- genus: Aotus
272
- species: lemurinus
273
- - id: 002
274
- name: Panamanian Night Monkey
275
- genus: Aotus
276
- species: zonalis
277
- - id: 003
278
- name: Hernández-Camacho's Night Monkey
279
- genus: Aotus
280
- species: jorgehernandezi
281
- ...
282
- - id: 010
283
- name: Carpenter's Lar Gibbon
284
- genus: Hylobates
285
- species: lar carpenteri
286
-
287
- This trivially maps to an Array of Hashes and so we can perform the
288
- exact same filtration for YAML and JSON as we did for CSV and TSV (in
289
- a one-liner!):
290
-
291
- data = IMW.open('/path/to/monkeys.yaml').load
292
- hylobates = data.find_all { |monkey| monkey['genus'] == 'Hylobates' }
293
- hylobates.dump('/path/to/monkeys.json')
294
-
295
- Resources in these Ruby-like data formats also extend themselves with
296
- Enumerable so goodies like +map+, +find_all+, &c. are available. This
297
- enables converting YAML to JSON with a one-liner:
298
-
299
- IMW.open('/path/to/monkeys.yaml').find_all { |monkey| monkey['genus'] == 'Hylobates' }.dump('/path/to/monkeys.json')
300
-
301
- === Parsing More General Data Formats
302
-
303
- Some data formats are structured but do not map readily to Hashes,
304
- Arrays, and Strings (XML, HTML, &c.) while other data formats lack
305
- structure or have a peculiar structure (flat files in arbitrary
306
- syntax).
307
-
308
- In both these cases the data needs to be parsed before it's usable.
309
- For the XML and HTML type data formats, IMW uses Hpricot and the
310
- IMW::Parsers::HtmlParser for parsing. For flat files, IMW provides
311
- the IMW::Parsers::LineParser and the IMW::Parsers::RegexpParser.
312
-
313
- HTML files, on the other hand, are more complex and typically have to
314
- be parsed before being converted to plain Ruby objects:
315
-
316
- # Grab a tiny link from the bottom of Google's homepage
317
- doc = IMW.open('http://www.google.com') # IMW::Files::Html
318
- doc.parse('p a') # 'Privacy'
319
-
320
- More complex parsers can also be built
321
-
322
- # Grab each row from an HTML table
323
- doc = IMW.open('/path/to/data.html')
324
- doc.parse :employees => ["tr", { :name => "td.name", :address => "td.address" } ]
325
- #=> [{:name => "John Chimpo", :address => "123 Fake St."}, {...}, ... ]
326
-
327
- see IMW::Parsers::HtmlParser for details on parsing HTML (and similar)
328
- files. Examine the other parsers in IMW::Parsers for details on
329
- parsing other data formats.
330
-
331
- = The IMW Workflow
332
-
333
- The workflow of IMW can be roughly summarized as follows:
334
-
335
- rip::
336
-
337
- Data is obtained from a source. IMW allows you to download data
338
- from the web, obtain it by querying databases, or use other services
339
- like rsync, ftp, &c. to pull it in from another computer.
340
-
341
- parse::
342
-
343
- Data is parsed into Ruby objects and stored.
344
-
345
- fix::
346
-
347
- All the parsed data is combined, reconciled, and further processed
348
- into a final form.
349
-
350
- package::
351
-
352
- The data is archived and compressed as necessary and moved to an
353
- outbox, staging server, S3 bucket, &c.
354
-
355
- Not all datasets
356
-
357
-
358
- = Datasets
359
-
360
- == Tasks & Dependencies
361
-
362
- == Directory Structure
363
-
364
- == Records
365
-
366
- = IMW on the Command Line
367
-
368
- == Repositories
369
-
370
- == Running Tasks
371
-
data/bin/imw DELETED
@@ -1,5 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $:.unshift File.expand_path(File.join(File.dirname(__FILE__), '..', 'lib'))
3
- require 'imw/runner'
4
- exit IMW::Runner.new(*ARGV.dup).run!
5
-
@@ -1,29 +0,0 @@
1
- #!/usr/bin/env ruby
2
- require 'rubygems'
3
- require 'json/ext'
4
- require 'configliere'
5
-
6
- Settings.use :commandline, :define
7
- Settings.define :json_keys, :description => "A comma separated list of keys, in the order to be read from source."
8
- # Settings.resolve!
9
-
10
- module TSVtoJSON
11
-
12
- # def initialize
13
- # keys unless Settings.keys.nil?
14
- # end
15
-
16
- def keys
17
- @keys ||= Settings.json_keys.split(",")
18
- end
19
-
20
- def into_json record, exclude=[]
21
- json_hash = Hash.new
22
- keys.each_with_index do |key, index|
23
- next if exclude.include?(key)
24
- json_hash[key] = record[index]
25
- end
26
- return JSON.generate(json_hash)
27
- end
28
-
29
- end
@@ -1,26 +0,0 @@
1
- #-*- mode: ruby -*-
2
- #
3
- # h2. etc/imwrc -- default site-wide imw configuration file
4
- #
5
- # == About
6
- #
7
- # This file contains the site-wide configuration settings for this
8
- # installation of the Infinite Monkeywrench. Settings here override
9
- # the defaults in <tt>lib/imw/utils/config.rb</tt> (see the
10
- # documentation for that file for more detail on the variables that
11
- # can be configured here) but will in turn be overwritten by settings
12
- # in the <tt>~/.imwrc</tt> file in each user's directory (though the
13
- # location of this file can be customized).
14
- #
15
- # At the present moment, all settings are stored as plain Ruby files
16
- # (though they may lack the <tt>.rb</tt> extension). As the IMW
17
- # develops, these will be replaced by YAML files which will be parsed
18
- # by <tt>lib/imw/utils/config.rb</tt>.
19
- #
20
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
21
- # Copyright:: Copyright (c) 2008 infochimps.org
22
- # License:: GPL 3.0
23
- # Website:: http://infinitemonkeywrench.org/
24
-
25
- module IMW
26
- end
@@ -1,12 +0,0 @@
1
- require 'imw'
2
- dataset = IMW::Dataset.new :handle => 'test'
3
-
4
- dataset.rip do
5
- IMW.open("http://path/to/somre/resource.html").cp(dataset.path_to(:ripd), 'original_data.html')
6
- end
7
-
8
- dataset.parse do
9
- #...
10
- end
11
-
12
-
@@ -1,10 +0,0 @@
1
- ---
2
-
3
- metadata:
4
- /path/to/my/data.tsv:
5
- - name: foobar
6
- datatype: foobar
7
- doc: foobar
8
-
9
-
10
-