imw 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +194 -31
- data/VERSION +1 -1
- data/bin/imw +5 -0
- data/lib/imw/boot.rb +0 -15
- data/lib/imw/dataset/paths.rb +38 -0
- data/lib/imw/dataset/task.rb +21 -18
- data/lib/imw/dataset/workflow.rb +126 -65
- data/lib/imw/dataset.rb +56 -82
- data/lib/imw/files/basicfile.rb +3 -3
- data/lib/imw/files/compressed_files_and_archives.rb +23 -37
- data/lib/imw/files/csv.rb +2 -1
- data/lib/imw/files/directory.rb +62 -0
- data/lib/imw/files/excel.rb +84 -0
- data/lib/imw/files/sgml.rb +4 -23
- data/lib/imw/files.rb +62 -47
- data/lib/imw/packagers/archiver.rb +19 -1
- data/lib/imw/packagers/s3_mover.rb +8 -0
- data/lib/imw/parsers/html_parser/matchers.rb +251 -268
- data/lib/imw/parsers/html_parser.rb +181 -176
- data/lib/imw/parsers.rb +1 -1
- data/lib/imw/repository.rb +35 -0
- data/lib/imw/runner.rb +114 -0
- data/lib/imw/utils/extensions/core.rb +0 -16
- data/lib/imw/utils/paths.rb +0 -28
- data/lib/imw.rb +21 -32
- metadata +11 -19
- data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
- data/lib/imw/dataset/datamapper.rb +0 -66
- data/lib/imw/dataset/loaddump.rb +0 -50
- data/lib/imw/dataset/old/file_collection.rb +0 -88
- data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
- data/lib/imw/dataset/scaffold.rb +0 -132
- data/lib/imw/dataset/scraped_uri.rb +0 -305
- data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
- data/lib/imw/dataset/scrub/scrub.rb +0 -147
- data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
- data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
- data/lib/imw/dataset/scrub/slug.rb +0 -101
- data/lib/imw/dataset/stats/counter.rb +0 -23
- data/lib/imw/dataset/stats.rb +0 -73
@@ -191,192 +191,197 @@
|
|
191
191
|
|
192
192
|
require 'imw/parsers/html_parser/matchers'
|
193
193
|
|
194
|
-
|
194
|
+
module IMW
|
195
|
+
module Parsers
|
196
|
+
class HtmlParser
|
195
197
|
|
196
|
-
|
198
|
+
include IMW::Parsers::HtmlMatchers
|
197
199
|
|
198
|
-
|
200
|
+
attr_accessor :parse_tree
|
199
201
|
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
202
|
+
#
|
203
|
+
# Parse Tree
|
204
|
+
#
|
205
|
+
def initialize arg_spec=nil
|
206
|
+
spec = arg_spec || self.class.parser_spec
|
207
|
+
self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(spec)
|
208
|
+
end
|
207
209
|
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
210
|
+
#
|
211
|
+
# See IMW::HtmlParser for syntax
|
212
|
+
#
|
213
|
+
#
|
214
|
+
def self.parser_spec
|
215
|
+
raise "Override this to create your own parser spec"
|
216
|
+
end
|
215
217
|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
218
|
+
#
|
219
|
+
# Walk
|
220
|
+
#
|
221
|
+
def parse doc
|
222
|
+
self.parse_tree.match(doc)
|
223
|
+
end
|
222
224
|
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
225
|
+
# one("hpricot_path") first match to hpricot_path
|
226
|
+
# one("hpricot_path", /spec/) applies spec to first match to hpricot_path
|
227
|
+
#
|
228
|
+
def self.one selector, matcher
|
229
|
+
MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
|
230
|
+
end
|
231
|
+
# match the +attr+ attribute of the first element given by +selector+
|
232
|
+
def self.attr selector, attr, matcher=nil
|
233
|
+
MatchAttribute.new(selector, attr, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
|
234
|
+
end
|
235
|
+
# shorthand for +attr(foo, 'href')+
|
236
|
+
def self.href selector, matcher=nil
|
237
|
+
self.attr(selector, 'href', matcher)
|
238
|
+
end
|
239
|
+
# shorthand for +attr(foo, 'src')+
|
240
|
+
def self.src selector, matcher=nil
|
241
|
+
self.attr(selector, 'src', matcher)
|
242
|
+
end
|
241
243
|
|
242
|
-
|
243
|
-
|
244
|
-
|
244
|
+
def self.proc selector, proc, matcher=nil
|
245
|
+
MatchProc.new(selector, proc, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
|
246
|
+
end
|
245
247
|
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
248
|
+
# strip ","s (!! thus disrespecting locale !!!)
|
249
|
+
# and convert to int
|
250
|
+
def self.to_num selector, matcher=nil
|
251
|
+
proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
|
252
|
+
end
|
253
|
+
def self.to_json selector, matcher=nil
|
254
|
+
proc selector, lambda{|v| v.to_json if v }, matcher
|
255
|
+
end
|
254
256
|
|
255
|
-
|
256
|
-
|
257
|
-
|
257
|
+
def self.strip selector, matcher=nil
|
258
|
+
proc selector, lambda{|v| v.strip }, matcher
|
259
|
+
end
|
258
260
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
261
|
+
def self.re_group selector, re
|
262
|
+
MatchRegexp.new(selector, re)
|
263
|
+
end
|
264
|
+
def self.re selector, re
|
265
|
+
MatchRegexp.new(selector, re, nil, :capture => 1)
|
266
|
+
end
|
267
|
+
def self.re_all selector, re, matcher=nil
|
268
|
+
MatchRegexpRepeatedly.new(selector, re)
|
269
|
+
end
|
268
270
|
|
269
|
-
|
270
|
-
|
271
|
-
|
271
|
+
# def self.plain_text selector, matcher=nil
|
272
|
+
# proc selector, lambda{|el| el.inner_text if el }, matcher
|
273
|
+
# end
|
272
274
|
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
275
|
+
# attr_accessor :mapping
|
276
|
+
#
|
277
|
+
# #
|
278
|
+
# # Feed me a hash and I'll semantify HTML
|
279
|
+
# #
|
280
|
+
# # The hash should magically adhere to the too-complicated,
|
281
|
+
# # ever evolving goatrope that works for the below
|
282
|
+
# #
|
283
|
+
# #
|
284
|
+
# def initialize mapping
|
285
|
+
# self.mapping = mapping
|
286
|
+
# end
|
287
|
+
#
|
288
|
+
# #
|
289
|
+
# # take a document subtree,
|
290
|
+
# # and a mapping of hpricot paths to that subtree's data mapping
|
291
|
+
# # recursively extract that datamapping
|
292
|
+
# #
|
293
|
+
# def extract_tree hdoc, content, sub_mapping
|
294
|
+
# data = { }
|
295
|
+
# sub_mapping.each do |selector, target|
|
296
|
+
# data[selector] = []
|
297
|
+
# sub_contents = content/selector
|
298
|
+
# sub_contents.each do |sub_content|
|
299
|
+
# sub_data = {}
|
300
|
+
# extract_node hdoc, sub_content, sub_data, selector, target
|
301
|
+
# data[selector] << sub_data
|
302
|
+
# end
|
303
|
+
# end
|
304
|
+
# data
|
305
|
+
# # end
|
306
|
+
# # if selector.is_a?(String)
|
307
|
+
# # conts = (content)
|
308
|
+
# # else
|
309
|
+
# # conts = [content]
|
310
|
+
# # end
|
311
|
+
# # conts[0..0].each do |content|
|
312
|
+
# # extract_node hdoc, content, data, selector, target
|
313
|
+
# # end
|
314
|
+
# # end
|
315
|
+
# data
|
316
|
+
# end
|
317
|
+
#
|
318
|
+
# #
|
319
|
+
# # insert the extracted element into the data mapping
|
320
|
+
# #
|
321
|
+
# def extract_node hdoc, content, data, selector, target
|
322
|
+
# classification = classify_node(selector, target)
|
323
|
+
# result = \
|
324
|
+
# case classification
|
325
|
+
# when :subtree
|
326
|
+
# target.each do |sub_selector, sub_target|
|
327
|
+
# extract_node hdoc, content, data, sub_selector, sub_target
|
328
|
+
# end
|
329
|
+
#
|
330
|
+
# when :sub_attribute
|
331
|
+
# k, v = selector.to_a[0]
|
332
|
+
# subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
|
333
|
+
# val = subcontent.attributes[v.to_s] if subcontent
|
334
|
+
# data[target] = val unless val.blank?
|
335
|
+
#
|
336
|
+
# when :attribute then
|
337
|
+
# val = content.attributes[selector.to_s]
|
338
|
+
# data[target] = val unless val.blank?
|
339
|
+
#
|
340
|
+
# when :flatten_list
|
341
|
+
# subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
|
342
|
+
# data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
|
343
|
+
#
|
344
|
+
# when :inner_html
|
345
|
+
# subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
|
346
|
+
# data[target] = subcontent.inner_html.strip if subcontent
|
347
|
+
#
|
348
|
+
# else
|
349
|
+
# raise "classify_node shouldn't ever return #{classification}"
|
350
|
+
# end
|
351
|
+
# # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
|
352
|
+
# # puts '' if classification == :subtree
|
353
|
+
# end
|
354
|
+
#
|
355
|
+
# def classify_node selector, target
|
356
|
+
# case
|
357
|
+
# when target.is_a?(Hash) then :subtree
|
358
|
+
# when selector.is_a?(Hash) && (selector.length == 1) then
|
359
|
+
# k, v = selector.to_a[0]
|
360
|
+
# case v
|
361
|
+
# when Symbol then :sub_attribute
|
362
|
+
# end
|
363
|
+
# when selector.is_a?(Symbol) then :attribute
|
364
|
+
# when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
|
365
|
+
# when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
|
366
|
+
# else
|
367
|
+
# raise "Can't classify mapping: " + [selector, target].join(" - ")
|
368
|
+
# end
|
369
|
+
# end
|
370
|
+
#
|
371
|
+
# # use #mapping to parse file
|
372
|
+
# def parse link
|
373
|
+
# begin hdoc = Hpricot(link.contents)
|
374
|
+
# rescue; warn "can't hpricot #{link.to_s}" ; return false; end
|
375
|
+
# raw_taggings = extract_tree hdoc, hdoc, self.mapping
|
376
|
+
# end
|
377
|
+
#
|
378
|
+
# # use #mapping to parse file
|
379
|
+
# def parse_file filename
|
380
|
+
# begin hdoc = Hpricot(File.open(filename))
|
381
|
+
# rescue; warn "can't hpricot #{filename}" ; return false; end
|
382
|
+
# raw_taggings = extract_tree hdoc, hdoc, self.mapping
|
383
|
+
# end
|
384
|
+
end
|
385
|
+
end
|
382
386
|
end
|
387
|
+
|
data/lib/imw/parsers.rb
CHANGED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'imw/utils'
|
2
|
+
|
3
|
+
module IMW
|
4
|
+
|
5
|
+
# A Repository is a collection of datasets.
|
6
|
+
class Repository < Hash
|
7
|
+
|
8
|
+
# FIXME This should read some configuration settings somewhere and
|
9
|
+
# generate a pool specific to each IMW user.
|
10
|
+
def self.default
|
11
|
+
new
|
12
|
+
end
|
13
|
+
|
14
|
+
end
|
15
|
+
|
16
|
+
# The default repository managed by IMW.
|
17
|
+
REPOSITORY = Repository.default
|
18
|
+
|
19
|
+
# Add a dataset to the IMW::REPOSITORY. If the dataset has a
|
20
|
+
# +handle+ then it will be used as the key in this repository;
|
21
|
+
# otherwise the dataset's class will be used.
|
22
|
+
def self.add dataset
|
23
|
+
REPOSITORY[dataset.handle] = dataset
|
24
|
+
end
|
25
|
+
|
26
|
+
# Remove a dataset from the IMW::REPOSITORY. Can pass in either a
|
27
|
+
# string handle or an instance of the dataset.
|
28
|
+
def self.delete handle
|
29
|
+
handle = handle.handle if handle.respond_to?(:handle)
|
30
|
+
REPOSITORY.delete(handle)
|
31
|
+
end
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
|
data/lib/imw/runner.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
require 'imw'
|
2
|
+
require 'optparse'
|
3
|
+
|
4
|
+
module IMW
|
5
|
+
|
6
|
+
RunnerError = Class.new(IMW::Error)
|
7
|
+
|
8
|
+
class Runner
|
9
|
+
|
10
|
+
DEFAULT_OPTIONS = {
|
11
|
+
:requires => [],
|
12
|
+
:selectors => [],
|
13
|
+
:dry_run => false
|
14
|
+
}
|
15
|
+
|
16
|
+
attr_reader :args, :options
|
17
|
+
|
18
|
+
def initialize *args
|
19
|
+
@args = args
|
20
|
+
@options = DEFAULT_OPTIONS.dup
|
21
|
+
parser.parse!(args) # will trim options from args
|
22
|
+
end
|
23
|
+
|
24
|
+
def parser
|
25
|
+
OptionParser.new do |opts|
|
26
|
+
opts.banner = "usage: imw [OPTIONS] TASK"
|
27
|
+
opts.separator <<EOF
|
28
|
+
|
29
|
+
Run TASK for all datasets in the repository. IMW will read any
|
30
|
+
*.imw files in the current directory by default.
|
31
|
+
|
32
|
+
Options include
|
33
|
+
|
34
|
+
EOF
|
35
|
+
|
36
|
+
opts.on('-l', '--list', "List datasets in repository") do
|
37
|
+
options[:list] = true
|
38
|
+
end
|
39
|
+
|
40
|
+
opts.on('-s', '--selector SELECTOR', "Filter datasets by regexp SELECTOR. Can be given more than once.") do |selector|
|
41
|
+
options[:selectors] << selector
|
42
|
+
end
|
43
|
+
|
44
|
+
opts.on('-r', '--require PATH', "Require PATH. Can be given more than once.") do |path|
|
45
|
+
options[:requires] << path
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def require_files
|
52
|
+
Dir['*.imw'].each { |path| load File.expand_path(path) }
|
53
|
+
options[:requires].each do |path|
|
54
|
+
IMW.open(path) do |requireable|
|
55
|
+
if requireable.directory?
|
56
|
+
requireable["**/*.rb"].each { |file| require file }
|
57
|
+
requireable["**/*.imw"].each { |file| load file }
|
58
|
+
else
|
59
|
+
require requireable.path
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def task
|
66
|
+
args.first
|
67
|
+
end
|
68
|
+
|
69
|
+
def handles
|
70
|
+
matched_handles = Set.new
|
71
|
+
if options[:selectors].blank?
|
72
|
+
matched_handles += IMW::REPOSITORY.keys
|
73
|
+
else
|
74
|
+
keys = IMW::REPOSITORY.keys
|
75
|
+
unless keys.empty?
|
76
|
+
options[:selectors].each do |selector|
|
77
|
+
matched_handles += keys.find_all { |key| key =~ Regexp.new(selector) }
|
78
|
+
end
|
79
|
+
end
|
80
|
+
end
|
81
|
+
matched_handles.to_a.sort
|
82
|
+
end
|
83
|
+
|
84
|
+
def datasets
|
85
|
+
handles.map { |handle| IMW::REPOSITORY[handle] }
|
86
|
+
end
|
87
|
+
|
88
|
+
def list!
|
89
|
+
puts handles
|
90
|
+
exit
|
91
|
+
end
|
92
|
+
|
93
|
+
def run_task!
|
94
|
+
datasets.each do |dataset|
|
95
|
+
dataset[task].invoke
|
96
|
+
end
|
97
|
+
exit
|
98
|
+
end
|
99
|
+
|
100
|
+
def run!
|
101
|
+
require_files
|
102
|
+
case
|
103
|
+
when options[:list]
|
104
|
+
list!
|
105
|
+
when task.blank?
|
106
|
+
puts parser
|
107
|
+
exit 1
|
108
|
+
else
|
109
|
+
run_task!
|
110
|
+
end
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
@@ -1,19 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/utils/extensions/core.rb -- extensions to the Ruby core
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# Some useful extensions to basic Ruby classes. This file is required
|
7
|
-
# by <tt>imw/utils</tt> so any files required here are automatically
|
8
|
-
# required when loading IMW.
|
9
|
-
#
|
10
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
-
# License:: GPL 3.0
|
13
|
-
# Website:: http://infinitemonkeywrench.org/
|
14
|
-
#
|
15
|
-
# puts "#{File.basename(__FILE__)}: Your monkeywrench does a complicated series of core-burning exercises and emerges with ripped, powerful-looking abs."
|
16
|
-
|
17
1
|
require 'imw/utils/extensions/string'
|
18
2
|
require 'imw/utils/extensions/array'
|
19
3
|
require 'imw/utils/extensions/hash'
|
data/lib/imw/utils/paths.rb
CHANGED
@@ -1,20 +1,3 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# IMW uses lots of different directories to keep information on data
|
7
|
-
# and datasets separate. This module interfaces with the
|
8
|
-
# configuration files to establish the paths to these IMW directories
|
9
|
-
# and provides functions and mixins for IMW objects to use to access
|
10
|
-
# these paths.
|
11
|
-
#
|
12
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
13
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
14
|
-
# License:: GPL 3.0
|
15
|
-
# Website:: http://infinitemonkeywrench.org/
|
16
|
-
#
|
17
|
-
|
18
1
|
module IMW
|
19
2
|
|
20
3
|
# Implements methods designed to work with an object's
|
@@ -67,17 +50,6 @@ module IMW
|
|
67
50
|
end
|
68
51
|
end
|
69
52
|
|
70
|
-
class Dataset
|
71
|
-
attr_reader :paths
|
72
|
-
include IMW::Paths
|
73
|
-
|
74
|
-
private
|
75
|
-
def set_paths
|
76
|
-
@paths = {}
|
77
|
-
add_path :self, File.dirname(eval('__FILE__'))
|
78
|
-
end
|
79
|
-
end
|
80
|
-
|
81
53
|
def self.path_to *pathsegs
|
82
54
|
begin
|
83
55
|
path = Pathname.new IMW.path_to_helper(*pathsegs)
|
data/lib/imw.rb
CHANGED
@@ -1,42 +1,31 @@
|
|
1
|
-
#
|
2
|
-
# h2. lib/imw.rb -- main imw file
|
3
|
-
#
|
4
|
-
# == About
|
5
|
-
#
|
6
|
-
# This file is the entry-point to the IMW library. It loads a minimal
|
7
|
-
# setup. Optional components can be loaded by calling the function
|
8
|
-
# <tt>IMW.imw_components</tt>.
|
9
|
-
#
|
10
|
-
# Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
|
11
|
-
# Copyright:: Copyright (c) 2008 infochimps.org
|
12
|
-
# License:: GPL 3.0
|
13
|
-
# Website:: http://infinitemonkeywrench.org/
|
14
|
-
#
|
15
|
-
# puts "#{File.basename(__FILE__)}: Behold, the weighty, the munificent, the Infinite Monkeywrench! Approach it with care: it has overwhelmed mightier monkeys than ye."
|
16
|
-
|
17
1
|
require 'rubygems'
|
18
|
-
require 'YAML' unless defined?('YAML') # some stupid collision with datamapper makes it double include
|
19
2
|
require 'imw/boot'
|
20
3
|
require 'imw/utils'
|
21
4
|
require 'imw/dataset'
|
5
|
+
require 'imw/repository'
|
22
6
|
require 'imw/files'
|
23
7
|
require 'imw/parsers'
|
24
8
|
require 'imw/packagers'
|
25
9
|
|
26
|
-
# The Infinite Monkeywrench (IMW) is a Ruby library for
|
27
|
-
#
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
#
|
32
|
-
#
|
33
|
-
#
|
34
|
-
#
|
35
|
-
#
|
36
|
-
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
#
|
10
|
+
# The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
|
11
|
+
# extracting, parsing, munging, and packaging datasets. It allows you
|
12
|
+
# to handle different data formats transparently as well as organize
|
13
|
+
# transformations of data as a network of dependencies (a la Make or
|
14
|
+
# Rake).
|
15
|
+
#
|
16
|
+
# On first reading of IMW examine the classes within the IMW::Files
|
17
|
+
# module, all transparently instantiated when using IMW.open (instead
|
18
|
+
# of File.open). These classes do a lot of work to ensure that all
|
19
|
+
# objects returned by IMW.open share methods (write, read, load, dump,
|
20
|
+
# parse, compress, extract, &c.) while continuing to use existing
|
21
|
+
# implementations of these concepts.
|
22
|
+
#
|
23
|
+
# Another entrance point is the <tt>IMW::Dataset</tt> class. It
|
24
|
+
# leverages Rake to craft workflows for transforming datasets. IMW
|
25
|
+
# encourages you to organize your data transformations in a step-wise
|
26
|
+
# process, managed with dependencies.
|
27
|
+
#
|
28
|
+
# Utilities to help with one step in particular (ripping, parsing,
|
29
|
+
# packaging, &c.) are in their own directories.
|
41
30
|
module IMW
|
42
31
|
end
|