imw 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
@@ -191,192 +191,197 @@
191
191
 
192
192
  require 'imw/parsers/html_parser/matchers'
193
193
 
194
- class IMW::HTMLParser
194
+ module IMW
195
+ module Parsers
196
+ class HtmlParser
195
197
 
196
- include IMW::HTMLParserMatcher
198
+ include IMW::Parsers::HtmlMatchers
197
199
 
198
- attr_accessor :parse_tree
200
+ attr_accessor :parse_tree
199
201
 
200
- #
201
- # Parse Tree
202
- #
203
- def initialize arg_spec=nil
204
- spec = arg_spec || self.class.parser_spec
205
- self.parse_tree = IMW::HTMLParserMatcher.build_parse_tree(spec)
206
- end
202
+ #
203
+ # Parse Tree
204
+ #
205
+ def initialize arg_spec=nil
206
+ spec = arg_spec || self.class.parser_spec
207
+ self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(spec)
208
+ end
207
209
 
208
- #
209
- # See IMW::HTMLParser for syntax
210
- #
211
- #
212
- def self.parser_spec
213
- raise "Override this to create your own parser spec"
214
- end
210
+ #
211
+ # See IMW::Parsers::HtmlParser for syntax
212
+ #
213
+ #
214
+ def self.parser_spec
215
+ raise "Override this to create your own parser spec"
216
+ end
215
217
 
216
- #
217
- # Walk
218
- #
219
- def parse doc
220
- self.parse_tree.match(doc)
221
- end
218
+ #
219
+ # Walk
220
+ #
221
+ def parse doc
222
+ self.parse_tree.match(doc)
223
+ end
222
224
 
223
- # one("hpricot_path") first match to hpricot_path
224
- # one("hpricot_path", /spec/) applies spec to first match to hpricot_path
225
- #
226
- def self.one selector, matcher
227
- MatchFirstElement.new(selector, IMW::HTMLParserMatcher.build_parse_tree(matcher))
228
- end
229
- # match the +attr+ attribute of the first element given by +selector+
230
- def self.attr selector, attr, matcher=nil
231
- MatchAttribute.new(selector, attr, IMW::HTMLParserMatcher.build_parse_tree(matcher))
232
- end
233
- # shorthand for +attr(foo, 'href')+
234
- def self.href selector, matcher=nil
235
- self.attr(selector, 'href', matcher)
236
- end
237
- # shorthand for +attr(foo, 'src')+
238
- def self.src selector, matcher=nil
239
- self.attr(selector, 'src', matcher)
240
- end
225
+ # one("hpricot_path") first match to hpricot_path
226
+ # one("hpricot_path", /spec/) applies spec to first match to hpricot_path
227
+ #
228
+ def self.one selector, matcher
229
+ MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
230
+ end
231
+ # match the +attr+ attribute of the first element given by +selector+
232
+ def self.attr selector, attr, matcher=nil
233
+ MatchAttribute.new(selector, attr, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
234
+ end
235
+ # shorthand for +attr(foo, 'href')+
236
+ def self.href selector, matcher=nil
237
+ self.attr(selector, 'href', matcher)
238
+ end
239
+ # shorthand for +attr(foo, 'src')+
240
+ def self.src selector, matcher=nil
241
+ self.attr(selector, 'src', matcher)
242
+ end
241
243
 
242
- def self.proc selector, proc, matcher=nil
243
- MatchProc.new(selector, proc, IMW::HTMLParserMatcher.build_parse_tree(matcher))
244
- end
244
+ def self.proc selector, proc, matcher=nil
245
+ MatchProc.new(selector, proc, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
246
+ end
245
247
 
246
- # strip ","s (!! thus disrespecting locale !!!)
247
- # and convert to int
248
- def self.to_num selector, matcher=nil
249
- proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
250
- end
251
- def self.to_json selector, matcher=nil
252
- proc selector, lambda{|v| v.to_json if v }, matcher
253
- end
248
+ # strip ","s (!! thus disrespecting locale !!!)
249
+ # and convert to int
250
+ def self.to_num selector, matcher=nil
251
+ proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
252
+ end
253
+ def self.to_json selector, matcher=nil
254
+ proc selector, lambda{|v| v.to_json if v }, matcher
255
+ end
254
256
 
255
- def self.strip selector, matcher=nil
256
- proc selector, lambda{|v| v.strip }, matcher
257
- end
257
+ def self.strip selector, matcher=nil
258
+ proc selector, lambda{|v| v.strip }, matcher
259
+ end
258
260
 
259
- def self.re_group selector, re
260
- MatchRegexp.new(selector, re)
261
- end
262
- def self.re selector, re
263
- MatchRegexp.new(selector, re, nil, :capture => 1)
264
- end
265
- def self.re_all selector, re, matcher=nil
266
- MatchRegexpRepeatedly.new(selector, re)
267
- end
261
+ def self.re_group selector, re
262
+ MatchRegexp.new(selector, re)
263
+ end
264
+ def self.re selector, re
265
+ MatchRegexp.new(selector, re, nil, :capture => 1)
266
+ end
267
+ def self.re_all selector, re, matcher=nil
268
+ MatchRegexpRepeatedly.new(selector, re)
269
+ end
268
270
 
269
- # def self.plain_text selector, matcher=nil
270
- # proc selector, lambda{|el| el.inner_text if el }, matcher
271
- # end
271
+ # def self.plain_text selector, matcher=nil
272
+ # proc selector, lambda{|el| el.inner_text if el }, matcher
273
+ # end
272
274
 
273
- # attr_accessor :mapping
274
- #
275
- # #
276
- # # Feed me a hash and I'll semantify HTML
277
- # #
278
- # # The hash should magically adhere to the too-complicated,
279
- # # ever evolving goatrope that works for the below
280
- # #
281
- # #
282
- # def initialize mapping
283
- # self.mapping = mapping
284
- # end
285
- #
286
- # #
287
- # # take a document subtree,
288
- # # and a mapping of hpricot paths to that subtree's data mapping
289
- # # recursively extract that datamapping
290
- # #
291
- # def extract_tree hdoc, content, sub_mapping
292
- # data = { }
293
- # sub_mapping.each do |selector, target|
294
- # data[selector] = []
295
- # sub_contents = content/selector
296
- # sub_contents.each do |sub_content|
297
- # sub_data = {}
298
- # extract_node hdoc, sub_content, sub_data, selector, target
299
- # data[selector] << sub_data
300
- # end
301
- # end
302
- # data
303
- # # end
304
- # # if selector.is_a?(String)
305
- # # conts = (content)
306
- # # else
307
- # # conts = [content]
308
- # # end
309
- # # conts[0..0].each do |content|
310
- # # extract_node hdoc, content, data, selector, target
311
- # # end
312
- # # end
313
- # data
314
- # end
315
- #
316
- # #
317
- # # insert the extracted element into the data mapping
318
- # #
319
- # def extract_node hdoc, content, data, selector, target
320
- # classification = classify_node(selector, target)
321
- # result = \
322
- # case classification
323
- # when :subtree
324
- # target.each do |sub_selector, sub_target|
325
- # extract_node hdoc, content, data, sub_selector, sub_target
326
- # end
327
- #
328
- # when :sub_attribute
329
- # k, v = selector.to_a[0]
330
- # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
331
- # val = subcontent.attributes[v.to_s] if subcontent
332
- # data[target] = val unless val.blank?
333
- #
334
- # when :attribute then
335
- # val = content.attributes[selector.to_s]
336
- # data[target] = val unless val.blank?
337
- #
338
- # when :flatten_list
339
- # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
340
- # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
341
- #
342
- # when :inner_html
343
- # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
344
- # data[target] = subcontent.inner_html.strip if subcontent
345
- #
346
- # else
347
- # raise "classify_node shouldn't ever return #{classification}"
348
- # end
349
- # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
350
- # # puts '' if classification == :subtree
351
- # end
352
- #
353
- # def classify_node selector, target
354
- # case
355
- # when target.is_a?(Hash) then :subtree
356
- # when selector.is_a?(Hash) && (selector.length == 1) then
357
- # k, v = selector.to_a[0]
358
- # case v
359
- # when Symbol then :sub_attribute
360
- # end
361
- # when selector.is_a?(Symbol) then :attribute
362
- # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
363
- # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
364
- # else
365
- # raise "Can't classify mapping: " + [selector, target].join(" - ")
366
- # end
367
- # end
368
- #
369
- # # use #mapping to parse file
370
- # def parse link
371
- # begin hdoc = Hpricot(link.contents)
372
- # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
373
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
374
- # end
375
- #
376
- # # use #mapping to parse file
377
- # def parse_file filename
378
- # begin hdoc = Hpricot(File.open(filename))
379
- # rescue; warn "can't hpricot #{filename}" ; return false; end
380
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
381
- # end
275
+ # attr_accessor :mapping
276
+ #
277
+ # #
278
+ # # Feed me a hash and I'll semantify HTML
279
+ # #
280
+ # # The hash should magically adhere to the too-complicated,
281
+ # # ever evolving goatrope that works for the below
282
+ # #
283
+ # #
284
+ # def initialize mapping
285
+ # self.mapping = mapping
286
+ # end
287
+ #
288
+ # #
289
+ # # take a document subtree,
290
+ # # and a mapping of hpricot paths to that subtree's data mapping
291
+ # # recursively extract that datamapping
292
+ # #
293
+ # def extract_tree hdoc, content, sub_mapping
294
+ # data = { }
295
+ # sub_mapping.each do |selector, target|
296
+ # data[selector] = []
297
+ # sub_contents = content/selector
298
+ # sub_contents.each do |sub_content|
299
+ # sub_data = {}
300
+ # extract_node hdoc, sub_content, sub_data, selector, target
301
+ # data[selector] << sub_data
302
+ # end
303
+ # end
304
+ # data
305
+ # # end
306
+ # # if selector.is_a?(String)
307
+ # # conts = (content)
308
+ # # else
309
+ # # conts = [content]
310
+ # # end
311
+ # # conts[0..0].each do |content|
312
+ # # extract_node hdoc, content, data, selector, target
313
+ # # end
314
+ # # end
315
+ # data
316
+ # end
317
+ #
318
+ # #
319
+ # # insert the extracted element into the data mapping
320
+ # #
321
+ # def extract_node hdoc, content, data, selector, target
322
+ # classification = classify_node(selector, target)
323
+ # result = \
324
+ # case classification
325
+ # when :subtree
326
+ # target.each do |sub_selector, sub_target|
327
+ # extract_node hdoc, content, data, sub_selector, sub_target
328
+ # end
329
+ #
330
+ # when :sub_attribute
331
+ # k, v = selector.to_a[0]
332
+ # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
333
+ # val = subcontent.attributes[v.to_s] if subcontent
334
+ # data[target] = val unless val.blank?
335
+ #
336
+ # when :attribute then
337
+ # val = content.attributes[selector.to_s]
338
+ # data[target] = val unless val.blank?
339
+ #
340
+ # when :flatten_list
341
+ # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
342
+ # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
343
+ #
344
+ # when :inner_html
345
+ # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
346
+ # data[target] = subcontent.inner_html.strip if subcontent
347
+ #
348
+ # else
349
+ # raise "classify_node shouldn't ever return #{classification}"
350
+ # end
351
+ # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
352
+ # # puts '' if classification == :subtree
353
+ # end
354
+ #
355
+ # def classify_node selector, target
356
+ # case
357
+ # when target.is_a?(Hash) then :subtree
358
+ # when selector.is_a?(Hash) && (selector.length == 1) then
359
+ # k, v = selector.to_a[0]
360
+ # case v
361
+ # when Symbol then :sub_attribute
362
+ # end
363
+ # when selector.is_a?(Symbol) then :attribute
364
+ # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
365
+ # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
366
+ # else
367
+ # raise "Can't classify mapping: " + [selector, target].join(" - ")
368
+ # end
369
+ # end
370
+ #
371
+ # # use #mapping to parse file
372
+ # def parse link
373
+ # begin hdoc = Hpricot(link.contents)
374
+ # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
375
+ # raw_taggings = extract_tree hdoc, hdoc, self.mapping
376
+ # end
377
+ #
378
+ # # use #mapping to parse file
379
+ # def parse_file filename
380
+ # begin hdoc = Hpricot(File.open(filename))
381
+ # rescue; warn "can't hpricot #{filename}" ; return false; end
382
+ # raw_taggings = extract_tree hdoc, hdoc, self.mapping
383
+ # end
384
+ end
385
+ end
382
386
  end
387
+
data/lib/imw/parsers.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  module IMW
2
2
  module Parsers
3
- autoload :HTML, 'imw/parsers/html_parser'
3
+ autoload :HtmlParser, 'imw/parsers/html_parser'
4
4
  autoload :LineParser, 'imw/parsers/line_parser'
5
5
  autoload :RegexpParser, 'imw/parsers/regexp_parser'
6
6
  end
@@ -0,0 +1,35 @@
1
+ require 'imw/utils'
2
+
3
+ module IMW
4
+
5
+ # A Repository is a collection of datasets.
6
+ class Repository < Hash
7
+
8
+ # FIXME This should read some configuration settings somewhere and
9
+ # generate a pool specific to each IMW user.
10
+ def self.default
11
+ new
12
+ end
13
+
14
+ end
15
+
16
+ # The default repository managed by IMW.
17
+ REPOSITORY = Repository.default
18
+
19
+ # Add a dataset to the IMW::REPOSITORY. If the dataset has a
20
+ # +handle+ then it will be used as the key in this repository;
21
+ # otherwise the dataset's class will be used.
22
+ def self.add dataset
23
+ REPOSITORY[dataset.handle] = dataset
24
+ end
25
+
26
+ # Remove a dataset from the IMW::REPOSITORY. Can pass in either a
27
+ # string handle or an instance of the dataset.
28
+ def self.delete handle
29
+ handle = handle.handle if handle.respond_to?(:handle)
30
+ REPOSITORY.delete(handle)
31
+ end
32
+
33
+ end
34
+
35
+
data/lib/imw/runner.rb ADDED
@@ -0,0 +1,114 @@
1
+ require 'imw'
2
+ require 'optparse'
3
+
4
+ module IMW
5
+
6
+ RunnerError = Class.new(IMW::Error)
7
+
8
+ class Runner
9
+
10
+ DEFAULT_OPTIONS = {
11
+ :requires => [],
12
+ :selectors => [],
13
+ :dry_run => false
14
+ }
15
+
16
+ attr_reader :args, :options
17
+
18
+ def initialize *args
19
+ @args = args
20
+ @options = DEFAULT_OPTIONS.dup
21
+ parser.parse!(args) # will trim options from args
22
+ end
23
+
24
+ def parser
25
+ OptionParser.new do |opts|
26
+ opts.banner = "usage: imw [OPTIONS] TASK"
27
+ opts.separator <<EOF
28
+
29
+ Run TASK for all datasets in the repository. IMW will read any
30
+ *.imw files in the current directory by default.
31
+
32
+ Options include
33
+
34
+ EOF
35
+
36
+ opts.on('-l', '--list', "List datasets in repository") do
37
+ options[:list] = true
38
+ end
39
+
40
+ opts.on('-s', '--selector SELECTOR', "Filter datasets by regexp SELECTOR. Can be given more than once.") do |selector|
41
+ options[:selectors] << selector
42
+ end
43
+
44
+ opts.on('-r', '--require PATH', "Require PATH. Can be given more than once.") do |path|
45
+ options[:requires] << path
46
+ end
47
+
48
+ end
49
+ end
50
+
51
+ def require_files
52
+ Dir['*.imw'].each { |path| load File.expand_path(path) }
53
+ options[:requires].each do |path|
54
+ IMW.open(path) do |requireable|
55
+ if requireable.directory?
56
+ requireable["**/*.rb"].each { |file| require file }
57
+ requireable["**/*.imw"].each { |file| load file }
58
+ else
59
+ require requireable.path
60
+ end
61
+ end
62
+ end
63
+ end
64
+
65
+ def task
66
+ args.first
67
+ end
68
+
69
+ def handles
70
+ matched_handles = Set.new
71
+ if options[:selectors].blank?
72
+ matched_handles += IMW::REPOSITORY.keys
73
+ else
74
+ keys = IMW::REPOSITORY.keys
75
+ unless keys.empty?
76
+ options[:selectors].each do |selector|
77
+ matched_handles += keys.find_all { |key| key =~ Regexp.new(selector) }
78
+ end
79
+ end
80
+ end
81
+ matched_handles.to_a.sort
82
+ end
83
+
84
+ def datasets
85
+ handles.map { |handle| IMW::REPOSITORY[handle] }
86
+ end
87
+
88
+ def list!
89
+ puts handles
90
+ exit
91
+ end
92
+
93
+ def run_task!
94
+ datasets.each do |dataset|
95
+ dataset[task].invoke
96
+ end
97
+ exit
98
+ end
99
+
100
+ def run!
101
+ require_files
102
+ case
103
+ when options[:list]
104
+ list!
105
+ when task.blank?
106
+ puts parser
107
+ exit 1
108
+ else
109
+ run_task!
110
+ end
111
+ end
112
+ end
113
+ end
114
+
@@ -1,19 +1,3 @@
1
- #
2
- # h2. lib/imw/utils/extensions/core.rb -- extensions to the Ruby core
3
- #
4
- # == About
5
- #
6
- # Some useful extensions to basic Ruby classes. This file is required
7
- # by <tt>imw/utils</tt> so any files required here are automatically
8
- # required when loading IMW.
9
- #
10
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
- # Copyright:: Copyright (c) 2008 infochimps.org
12
- # License:: GPL 3.0
13
- # Website:: http://infinitemonkeywrench.org/
14
- #
15
- # puts "#{File.basename(__FILE__)}: Your monkeywrench does a complicated series of core-burning exercises and emerges with ripped, powerful-looking abs."
16
-
17
1
  require 'imw/utils/extensions/string'
18
2
  require 'imw/utils/extensions/array'
19
3
  require 'imw/utils/extensions/hash'
@@ -1,20 +1,3 @@
1
- #
2
- # h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
3
- #
4
- # == About
5
- #
6
- # IMW uses lots of different directories to keep information on data
7
- # and datasets separate. This module interfaces with the
8
- # configuration files to establish the paths to these IMW directories
9
- # and provides functions and mixins for IMW objects to use to access
10
- # these paths.
11
- #
12
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
13
- # Copyright:: Copyright (c) 2008 infochimps.org
14
- # License:: GPL 3.0
15
- # Website:: http://infinitemonkeywrench.org/
16
- #
17
-
18
1
  module IMW
19
2
 
20
3
  # Implements methods designed to work with an object's
@@ -67,17 +50,6 @@ module IMW
67
50
  end
68
51
  end
69
52
 
70
- class Dataset
71
- attr_reader :paths
72
- include IMW::Paths
73
-
74
- private
75
- def set_paths
76
- @paths = {}
77
- add_path :self, File.dirname(eval('__FILE__'))
78
- end
79
- end
80
-
81
53
  def self.path_to *pathsegs
82
54
  begin
83
55
  path = Pathname.new IMW.path_to_helper(*pathsegs)
data/lib/imw.rb CHANGED
@@ -1,42 +1,31 @@
1
- #
2
- # h2. lib/imw.rb -- main imw file
3
- #
4
- # == About
5
- #
6
- # This file is the entry-point to the IMW library. It loads a minimal
7
- # setup. Optional components can be loaded by calling the function
8
- # <tt>IMW.imw_components</tt>.
9
- #
10
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
- # Copyright:: Copyright (c) 2008 infochimps.org
12
- # License:: GPL 3.0
13
- # Website:: http://infinitemonkeywrench.org/
14
- #
15
- # puts "#{File.basename(__FILE__)}: Behold, the weighty, the munificent, the Infinite Monkeywrench! Approach it with care: it has overwhelmed mightier monkeys than ye."
16
-
17
1
  require 'rubygems'
18
- require 'YAML' unless defined?('YAML') # some stupid collision with datamapper makes it double include
19
2
  require 'imw/boot'
20
3
  require 'imw/utils'
21
4
  require 'imw/dataset'
5
+ require 'imw/repository'
22
6
  require 'imw/files'
23
7
  require 'imw/parsers'
24
8
  require 'imw/packagers'
25
9
 
26
- # The Infinite Monkeywrench (IMW) is a Ruby library for obtaining,
27
- # parsing, transforming, reconciling, and packaging datasets.
28
- #
29
- # Data is obtained via FIXME
30
- #
31
- # Data is loaded into IMW using <tt>IMW.open</tt> which provides a
32
- # uniform interface across a variety of data formats. The objects
33
- # returned will each have +load+ method which will return data in the
34
- # best form for further processing. If the data is a YAML file, then
35
- # Ruby's +YAML+ library will be used to return primitive Ruby objects,
36
- # if it is a CSV, then the +FasterCSV+ library will be used, &c.
37
- #
38
- # The main interface to handling data is the <tt>IMW::Dataset</tt>
39
- # class. It has methods for summarizing, transforming, and dumping
40
- # data to a variety of formats.
10
+ # The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
11
+ # extracting, parsing, munging, and packaging datasets. It allows you
12
+ # to handle different data formats transparently as well as organize
13
+ # transformations of data as a network of dependencies (a la Make or
14
+ # Rake).
15
+ #
16
+ # On first reading of IMW examine the classes within the IMW::Files
17
+ # module, all transparently instantiated when using IMW.open (instead
18
+ # of File.open). These classes do a lot of work to ensure that all
19
+ # objects returned by IMW.open share methods (write, read, load, dump,
20
+ # parse, compress, extract, &c.) while continuing to use existing
21
+ # implementations of these concepts.
22
+ #
23
+ # Another entrance point is the <tt>IMW::Dataset</tt> class. It
24
+ # leverages Rake to craft workflows for transforming datasets. IMW
25
+ # encourages you to organize your data transformations in a step-wise
26
+ # process, managed with dependencies.
27
+ #
28
+ # Utilities to help with one step in particular (ripping, parsing,
29
+ # packaging, &c.) are in their own directories.
41
30
  module IMW
42
31
  end