imw 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
@@ -191,192 +191,197 @@
|
|
191
191
|
|
192
192
|
require 'imw/parsers/html_parser/matchers'
|
193
193
|
|
194
|
-
|
194
|
+
module IMW
|
195
|
+
module Parsers
|
196
|
+
class HtmlParser
|
195
197
|
|
196
|
-
|
198
|
+
include IMW::Parsers::HtmlMatchers
|
197
199
|
|
198
|
-
|
200
|
+
attr_accessor :parse_tree
|
199
201
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
202
|
+
#
|
203
|
+
# Parse Tree
|
204
|
+
#
|
205
|
+
def initialize arg_spec=nil
|
206
|
+
spec = arg_spec || self.class.parser_spec
|
207
|
+
self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(spec)
|
208
|
+
end
|
207
209
|
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
210
|
+
#
|
211
|
+
# See IMW::HtmlParser for syntax
|
212
|
+
#
|
213
|
+
#
|
214
|
+
def self.parser_spec
|
215
|
+
raise "Override this to create your own parser spec"
|
216
|
+
end
|
215
217
|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
218
|
+
#
|
219
|
+
# Walk
|
220
|
+
#
|
221
|
+
def parse doc
|
222
|
+
self.parse_tree.match(doc)
|
223
|
+
end
|
222
224
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
225
|
+
# one("hpricot_path") first match to hpricot_path
|
226
|
+
# one("hpricot_path", /spec/) applies spec to first match to hpricot_path
|
227
|
+
#
|
228
|
+
def self.one selector, matcher
|
229
|
+
MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
|
230
|
+
end
|
231
|
+
# match the +attr+ attribute of the first element given by +selector+
|
232
|
+
def self.attr selector, attr, matcher=nil
|
233
|
+
MatchAttribute.new(selector, attr, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
|
234
|
+
end
|
235
|
+
# shorthand for +attr(foo, 'href')+
|
236
|
+
def self.href selector, matcher=nil
|
237
|
+
self.attr(selector, 'href', matcher)
|
238
|
+
end
|
239
|
+
# shorthand for +attr(foo, 'src')+
|
240
|
+
def self.src selector, matcher=nil
|
241
|
+
self.attr(selector, 'src', matcher)
|
242
|
+
end
|
241
243
|
|
242
|
-
|
243
|
-
|
244
|
-
|
244
|
+
def self.proc selector, proc, matcher=nil
|
245
|
+
MatchProc.new(selector, proc, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
|
246
|
+
end
|
245
247
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
248
|
+
# strip ","s (!! thus disrespecting locale !!!)
|
249
|
+
# and convert to int
|
250
|
+
def self.to_num selector, matcher=nil
|
251
|
+
proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
|
252
|
+
end
|
253
|
+
def self.to_json selector, matcher=nil
|
254
|
+
proc selector, lambda{|v| v.to_json if v }, matcher
|
255
|
+
end
|
254
256
|
|
255
|
-
|
256
|
-
|
257
|
-
|
257
|
+
def self.strip selector, matcher=nil
|
258
|
+
proc selector, lambda{|v| v.strip }, matcher
|
259
|
+
end
|
258
260
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
261
|
+
def self.re_group selector, re
|
262
|
+
MatchRegexp.new(selector, re)
|
263
|
+
end
|
264
|
+
def self.re selector, re
|
265
|
+
MatchRegexp.new(selector, re, nil, :capture => 1)
|
266
|
+
end
|
267
|
+
def self.re_all selector, re, matcher=nil
|
268
|
+
MatchRegexpRepeatedly.new(selector, re)
|
269
|
+
end
|
268
270
|
|
269
|
-
|
270
|
-
|
271
|
-
|
271
|
+
# def self.plain_text selector, matcher=nil
|
272
|
+
# proc selector, lambda{|el| el.inner_text if el }, matcher
|
273
|
+
# end
|
272
274
|
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
275
|
+
# attr_accessor :mapping
|
276
|
+
#
|
277
|
+
# #
|
278
|
+
# # Feed me a hash and I'll semantify HTML
|
279
|
+
# #
|
280
|
+
# # The hash should magically adhere to the too-complicated,
|
281
|
+
# # ever evolving goatrope that works for the below
|
282
|
+
# #
|
283
|
+
# #
|
284
|
+
# def initialize mapping
|
285
|
+
# self.mapping = mapping
|
286
|
+
# end
|
287
|
+
#
|
288
|
+
# #
|
289
|
+
# # take a document subtree,
|
290
|
+
# # and a mapping of hpricot paths to that subtree's data mapping
|
291
|
+
# # recursively extract that datamapping
|
292
|
+
# #
|
293
|
+
# def extract_tree hdoc, content, sub_mapping
|
294
|
+
# data = { }
|
295
|
+
# sub_mapping.each do |selector, target|
|
296
|
+
# data[selector] = []
|
297
|
+
# sub_contents = content/selector
|
298
|
+
# sub_contents.each do |sub_content|
|
299
|
+
# sub_data = {}
|
300
|
+
# extract_node hdoc, sub_content, sub_data, selector, target
|
301
|
+
# data[selector] << sub_data
|
302
|
+
# end
|
303
|
+
# end
|
304
|
+
# data
|
305
|
+
# # end
|
306
|
+
# # if selector.is_a?(String)
|
307
|
+
# # conts = (content)
|
308
|
+
# # else
|
309
|
+
# # conts = [content]
|
310
|
+
# # end
|
311
|
+
# # conts[0..0].each do |content|
|
312
|
+
# # extract_node hdoc, content, data, selector, target
|
313
|
+
# # end
|
314
|
+
# # end
|
315
|
+
# data
|
316
|
+
# end
|
317
|
+
#
|
318
|
+
# #
|
319
|
+
# # insert the extracted element into the data mapping
|
320
|
+
# #
|
321
|
+
# def extract_node hdoc, content, data, selector, target
|
322
|
+
# classification = classify_node(selector, target)
|
323
|
+
# result = \
|
324
|
+
# case classification
|
325
|
+
# when :subtree
|
326
|
+
# target.each do |sub_selector, sub_target|
|
327
|
+
# extract_node hdoc, content, data, sub_selector, sub_target
|
328
|
+
# end
|
329
|
+
#
|
330
|
+
# when :sub_attribute
|
331
|
+
# k, v = selector.to_a[0]
|
332
|
+
# subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
|
333
|
+
# val = subcontent.attributes[v.to_s] if subcontent
|
334
|
+
# data[target] = val unless val.blank?
|
335
|
+
#
|
336
|
+
# when :attribute then
|
337
|
+
# val = content.attributes[selector.to_s]
|
338
|
+
# data[target] = val unless val.blank?
|
339
|
+
#
|
340
|
+
# when :flatten_list
|
341
|
+
# subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
|
342
|
+
# data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
|
343
|
+
#
|
344
|
+
# when :inner_html
|
345
|
+
# subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
|
346
|
+
# data[target] = subcontent.inner_html.strip if subcontent
|
347
|
+
#
|
348
|
+
# else
|
349
|
+
# raise "classify_node shouldn't ever return #{classification}"
|
350
|
+
# end
|
351
|
+
# # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
|
352
|
+
# # puts '' if classification == :subtree
|
353
|
+
# end
|
354
|
+
#
|
355
|
+
# def classify_node selector, target
|
356
|
+
# case
|
357
|
+
# when target.is_a?(Hash) then :subtree
|
358
|
+
# when selector.is_a?(Hash) && (selector.length == 1) then
|
359
|
+
# k, v = selector.to_a[0]
|
360
|
+
# case v
|
361
|
+
# when Symbol then :sub_attribute
|
362
|
+
# end
|
363
|
+
# when selector.is_a?(Symbol) then :attribute
|
364
|
+
# when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
|
365
|
+
# when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
|
366
|
+
# else
|
367
|
+
# raise "Can't classify mapping: " + [selector, target].join(" - ")
|
368
|
+
# end
|
369
|
+
# end
|
370
|
+
#
|
371
|
+
# # use #mapping to parse file
|
372
|
+
# def parse link
|
373
|
+
# begin hdoc = Hpricot(link.contents)
|
374
|
+
# rescue; warn "can't hpricot #{link.to_s}" ; return false; end
|
375
|
+
# raw_taggings = extract_tree hdoc, hdoc, self.mapping
|
376
|
+
# end
|
377
|
+
#
|
378
|
+
# # use #mapping to parse file
|
379
|
+
# def parse_file filename
|
380
|
+
# begin hdoc = Hpricot(File.open(filename))
|
381
|
+
# rescue; warn "can't hpricot #{filename}" ; return false; end
|
382
|
+
# raw_taggings = extract_tree hdoc, hdoc, self.mapping
|
383
|
+
# end
|
384
|
+
end
|
385
|
+
end
|
382
386
|
end
|
387
|
+
|
data/lib/imw/parsers.rb
CHANGED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'imw/utils'
|
2
|
+
|
3
|
+
module IMW
|
4
|
+
|
5
|
+
# A Repository is a collection of datasets.
|
6
|
+
class Repository < Hash
|
7
|
+
|
8
|
+
# FIXME This should read some configuration settings somewhere and
|
9
|
+
# generate a pool specific to each IMW user.
|
10
|
+
def self.default
|
11
|
+
new
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
15
|
+
|
16
|
+
# The default repository managed by IMW.
|
17
|
+
REPOSITORY = Repository.default
|
18
|
+
|
19
|
+
# Add a dataset to the IMW::REPOSITORY. If the dataset has a
|
20
|
+
# +handle+ then it will be used as the key in this repository;
|
21
|
+
# otherwise the dataset's class will be used.
|
22
|
+
def self.add dataset
|
23
|
+
REPOSITORY[dataset.handle] = dataset
|
24
|
+
end
|
25
|
+
|
26
|
+
# Remove a dataset from the IMW::REPOSITORY. Can pass in either a
|
27
|
+
# string handle or an instance of the dataset.
|
28
|
+
def self.delete handle
|
29
|
+
handle = handle.handle if handle.respond_to?(:handle)
|
30
|
+
REPOSITORY.delete(handle)
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
|
data/lib/imw/runner.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'imw'
|
2
|
+
require 'optparse'
|
3
|
+
|
4
|
+
module IMW
|
5
|
+
|
6
|
+
RunnerError = Class.new(IMW::Error)
|
7
|
+
|
8
|
+
class Runner
|
9
|
+
|
10
|
+
DEFAULT_OPTIONS = {
|
11
|
+
:requires => [],
|
12
|
+
:selectors => [],
|
13
|
+
:dry_run => false
|
14
|
+
}
|
15
|
+
|
16
|
+
attr_reader :args, :options
|
17
|
+
|
18
|
+
def initialize *args
|
19
|
+
@args = args
|
20
|
+
@options = DEFAULT_OPTIONS.dup
|
21
|
+
parser.parse!(args) # will trim options from args
|
22
|
+
end
|
23
|
+
|
24
|
+
def parser
|
25
|
+
OptionParser.new do |opts|
|
26
|
+
opts.banner = "usage: imw [OPTIONS] TASK"
|
27
|
+
opts.separator <<EOF
|
28
|
+
|
29
|
+
Run TASK for all datasets in the repository. IMW will read any
|
30
|
+
*.imw files in the current directory by default.
|
31
|
+
|
32
|
+
Options include
|
33
|
+
|
34
|
+
EOF
|
35
|
+
|
36
|
+
opts.on('-l', '--list', "List datasets in repository") do
|
37
|
+
options[:list] = true
|
38
|
+
end
|
39
|
+
|
40
|
+
opts.on('-s', '--selector SELECTOR', "Filter datasets by regexp SELECTOR. Can be given more than once.") do |selector|
|
41
|
+
options[:selectors] << selector
|
42
|
+
end
|
43
|
+
|
44
|
+
opts.on('-r', '--require PATH', "Require PATH. Can be given more than once.") do |path|
|
45
|
+
options[:requires] << path
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def require_files
|
52
|
+
Dir['*.imw'].each { |path| load File.expand_path(path) }
|
53
|
+
options[:requires].each do |path|
|
54
|
+
IMW.open(path) do |requireable|
|
55
|
+
if requireable.directory?
|
56
|
+
requireable["**/*.rb"].each { |file| require file }
|
57
|
+
requireable["**/*.imw"].each { |file| load file }
|
58
|
+
else
|
59
|
+
require requireable.path
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def task
|
66
|
+
args.first
|
67
|
+
end
|
68
|
+
|
69
|
+
def handles
|
70
|
+
matched_handles = Set.new
|
71
|
+
if options[:selectors].blank?
|
72
|
+
matched_handles += IMW::REPOSITORY.keys
|
73
|
+
else
|
74
|
+
keys = IMW::REPOSITORY.keys
|
75
|
+
unless keys.empty?
|
76
|
+
options[:selectors].each do |selector|
|
77
|
+
matched_handles += keys.find_all { |key| key =~ Regexp.new(selector) }
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
matched_handles.to_a.sort
|
82
|
+
end
|
83
|
+
|
84
|
+
def datasets
|
85
|
+
handles.map { |handle| IMW::REPOSITORY[handle] }
|
86
|
+
end
|
87
|
+
|
88
|
+
def list!
|
89
|
+
puts handles
|
90
|
+
exit
|
91
|
+
end
|
92
|
+
|
93
|
+
def run_task!
|
94
|
+
datasets.each do |dataset|
|
95
|
+
dataset[task].invoke
|
96
|
+
end
|
97
|
+
exit
|
98
|
+
end
|
99
|
+
|
100
|
+
def run!
|
101
|
+
require_files
|
102
|
+
case
|
103
|
+
when options[:list]
|
104
|
+
list!
|
105
|
+
when task.blank?
|
106
|
+
puts parser
|
107
|
+
exit 1
|
108
|
+
else
|
109
|
+
run_task!
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
@@ -1,19 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/utils/extensions/core.rb -- extensions to the Ruby core
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Some useful extensions to basic Ruby classes. This file is required
|
7
|
-
# by <tt>imw/utils</tt> so any files required here are automatically
|
8
|
-
# required when loading IMW.
|
9
|
-
#
|
10
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
-
# License:: GPL 3.0
|
13
|
-
# Website:: http://infinitemonkeywrench.org/
|
14
|
-
#
|
15
|
-
# puts "#{File.basename(__FILE__)}: Your monkeywrench does a complicated series of core-burning exercises and emerges with ripped, powerful-looking abs."
|
16
|
-
|
17
1
|
require 'imw/utils/extensions/string'
|
18
2
|
require 'imw/utils/extensions/array'
|
19
3
|
require 'imw/utils/extensions/hash'
|
data/lib/imw/utils/paths.rb
CHANGED
@@ -1,20 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# IMW uses lots of different directories to keep information on data
|
7
|
-
# and datasets separate. This module interfaces with the
|
8
|
-
# configuration files to establish the paths to these IMW directories
|
9
|
-
# and provides functions and mixins for IMW objects to use to access
|
10
|
-
# these paths.
|
11
|
-
#
|
12
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
13
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
14
|
-
# License:: GPL 3.0
|
15
|
-
# Website:: http://infinitemonkeywrench.org/
|
16
|
-
#
|
17
|
-
|
18
1
|
module IMW
|
19
2
|
|
20
3
|
# Implements methods designed to work with an object's
|
@@ -67,17 +50,6 @@ module IMW
|
|
67
50
|
end
|
68
51
|
end
|
69
52
|
|
70
|
-
class Dataset
|
71
|
-
attr_reader :paths
|
72
|
-
include IMW::Paths
|
73
|
-
|
74
|
-
private
|
75
|
-
def set_paths
|
76
|
-
@paths = {}
|
77
|
-
add_path :self, File.dirname(eval('__FILE__'))
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
53
|
def self.path_to *pathsegs
|
82
54
|
begin
|
83
55
|
path = Pathname.new IMW.path_to_helper(*pathsegs)
|
data/lib/imw.rb
CHANGED
@@ -1,42 +1,31 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw.rb -- main imw file
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# This file is the entry-point to the IMW library. It loads a minimal
|
7
|
-
# setup. Optional components can be loaded by calling the function
|
8
|
-
# <tt>IMW.imw_components</tt>.
|
9
|
-
#
|
10
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
-
# License:: GPL 3.0
|
13
|
-
# Website:: http://infinitemonkeywrench.org/
|
14
|
-
#
|
15
|
-
# puts "#{File.basename(__FILE__)}: Behold, the weighty, the munificent, the Infinite Monkeywrench! Approach it with care: it has overwhelmed mightier monkeys than ye."
|
16
|
-
|
17
1
|
require 'rubygems'
|
18
|
-
require 'YAML' unless defined?('YAML') # some stupid collision with datamapper makes it double include
|
19
2
|
require 'imw/boot'
|
20
3
|
require 'imw/utils'
|
21
4
|
require 'imw/dataset'
|
5
|
+
require 'imw/repository'
|
22
6
|
require 'imw/files'
|
23
7
|
require 'imw/parsers'
|
24
8
|
require 'imw/packagers'
|
25
9
|
|
26
|
-
# The Infinite Monkeywrench (IMW) is a Ruby library for
|
27
|
-
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
32
|
-
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
10
|
+
# The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
|
11
|
+
# extracting, parsing, munging, and packaging datasets. It allows you
|
12
|
+
# to handle different data formats transparently as well as organize
|
13
|
+
# transformations of data as a network of dependencies (a la Make or
|
14
|
+
# Rake).
|
15
|
+
#
|
16
|
+
# On first reading of IMW examine the classes within the IMW::Files
|
17
|
+
# module, all transparently instantiated when using IMW.open (instead
|
18
|
+
# of File.open). These classes do a lot of work to ensure that all
|
19
|
+
# objects returned by IMW.open share methods (write, read, load, dump,
|
20
|
+
# parse, compress, extract, &c.) while continuing to use existing
|
21
|
+
# implementations of these concepts.
|
22
|
+
#
|
23
|
+
# Another entrance point is the <tt>IMW::Dataset</tt> class. It
|
24
|
+
# leverages Rake to craft workflows for transforming datasets. IMW
|
25
|
+
# encourages you to organize your data transformations in a step-wise
|
26
|
+
# process, managed with dependencies.
|
27
|
+
#
|
28
|
+
# Utilities to help with one step in particular (ripping, parsing,
|
29
|
+
# packaging, &c.) are in their own directories.
|
41
30
|
module IMW
|
42
31
|
end
|