imw 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. data/README.rdoc +194 -31
  2. data/VERSION +1 -1
  3. data/bin/imw +5 -0
  4. data/lib/imw/boot.rb +0 -15
  5. data/lib/imw/dataset/paths.rb +38 -0
  6. data/lib/imw/dataset/task.rb +21 -18
  7. data/lib/imw/dataset/workflow.rb +126 -65
  8. data/lib/imw/dataset.rb +56 -82
  9. data/lib/imw/files/basicfile.rb +3 -3
  10. data/lib/imw/files/compressed_files_and_archives.rb +23 -37
  11. data/lib/imw/files/csv.rb +2 -1
  12. data/lib/imw/files/directory.rb +62 -0
  13. data/lib/imw/files/excel.rb +84 -0
  14. data/lib/imw/files/sgml.rb +4 -23
  15. data/lib/imw/files.rb +62 -47
  16. data/lib/imw/packagers/archiver.rb +19 -1
  17. data/lib/imw/packagers/s3_mover.rb +8 -0
  18. data/lib/imw/parsers/html_parser/matchers.rb +251 -268
  19. data/lib/imw/parsers/html_parser.rb +181 -176
  20. data/lib/imw/parsers.rb +1 -1
  21. data/lib/imw/repository.rb +35 -0
  22. data/lib/imw/runner.rb +114 -0
  23. data/lib/imw/utils/extensions/core.rb +0 -16
  24. data/lib/imw/utils/paths.rb +0 -28
  25. data/lib/imw.rb +21 -32
  26. metadata +11 -19
  27. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +0 -37
  28. data/lib/imw/dataset/datamapper.rb +0 -66
  29. data/lib/imw/dataset/loaddump.rb +0 -50
  30. data/lib/imw/dataset/old/file_collection.rb +0 -88
  31. data/lib/imw/dataset/old/file_collection_utils.rb +0 -71
  32. data/lib/imw/dataset/scaffold.rb +0 -132
  33. data/lib/imw/dataset/scraped_uri.rb +0 -305
  34. data/lib/imw/dataset/scrub/old_working_scrubber.rb +0 -87
  35. data/lib/imw/dataset/scrub/scrub.rb +0 -147
  36. data/lib/imw/dataset/scrub/scrub_simple_url.rb +0 -38
  37. data/lib/imw/dataset/scrub/scrub_test.rb +0 -60
  38. data/lib/imw/dataset/scrub/slug.rb +0 -101
  39. data/lib/imw/dataset/stats/counter.rb +0 -23
  40. data/lib/imw/dataset/stats.rb +0 -73
@@ -191,192 +191,197 @@
191
191
 
192
192
  require 'imw/parsers/html_parser/matchers'
193
193
 
194
- class IMW::HTMLParser
194
+ module IMW
195
+ module Parsers
196
+ class HtmlParser
195
197
 
196
- include IMW::HTMLParserMatcher
198
+ include IMW::Parsers::HtmlMatchers
197
199
 
198
- attr_accessor :parse_tree
200
+ attr_accessor :parse_tree
199
201
 
200
- #
201
- # Parse Tree
202
- #
203
- def initialize arg_spec=nil
204
- spec = arg_spec || self.class.parser_spec
205
- self.parse_tree = IMW::HTMLParserMatcher.build_parse_tree(spec)
206
- end
202
+ #
203
+ # Parse Tree
204
+ #
205
+ def initialize arg_spec=nil
206
+ spec = arg_spec || self.class.parser_spec
207
+ self.parse_tree = IMW::Parsers::HtmlMatchers.build_parse_tree(spec)
208
+ end
207
209
 
208
- #
209
- # See IMW::HTMLParser for syntax
210
- #
211
- #
212
- def self.parser_spec
213
- raise "Override this to create your own parser spec"
214
- end
210
+ #
211
+ # See IMW::Parsers::HtmlParser for syntax
212
+ #
213
+ #
214
+ def self.parser_spec
215
+ raise "Override this to create your own parser spec"
216
+ end
215
217
 
216
- #
217
- # Walk
218
- #
219
- def parse doc
220
- self.parse_tree.match(doc)
221
- end
218
+ #
219
+ # Walk
220
+ #
221
+ def parse doc
222
+ self.parse_tree.match(doc)
223
+ end
222
224
 
223
- # one("hpricot_path") first match to hpricot_path
224
- # one("hpricot_path", /spec/) applies spec to first match to hpricot_path
225
- #
226
- def self.one selector, matcher
227
- MatchFirstElement.new(selector, IMW::HTMLParserMatcher.build_parse_tree(matcher))
228
- end
229
- # match the +attr+ attribute of the first element given by +selector+
230
- def self.attr selector, attr, matcher=nil
231
- MatchAttribute.new(selector, attr, IMW::HTMLParserMatcher.build_parse_tree(matcher))
232
- end
233
- # shorthand for +attr(foo, 'href')+
234
- def self.href selector, matcher=nil
235
- self.attr(selector, 'href', matcher)
236
- end
237
- # shorthand for +attr(foo, 'src')+
238
- def self.src selector, matcher=nil
239
- self.attr(selector, 'src', matcher)
240
- end
225
+ # one("hpricot_path") first match to hpricot_path
226
+ # one("hpricot_path", /spec/) applies spec to first match to hpricot_path
227
+ #
228
+ def self.one selector, matcher
229
+ MatchFirstElement.new(selector, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
230
+ end
231
+ # match the +attr+ attribute of the first element given by +selector+
232
+ def self.attr selector, attr, matcher=nil
233
+ MatchAttribute.new(selector, attr, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
234
+ end
235
+ # shorthand for +attr(foo, 'href')+
236
+ def self.href selector, matcher=nil
237
+ self.attr(selector, 'href', matcher)
238
+ end
239
+ # shorthand for +attr(foo, 'src')+
240
+ def self.src selector, matcher=nil
241
+ self.attr(selector, 'src', matcher)
242
+ end
241
243
 
242
- def self.proc selector, proc, matcher=nil
243
- MatchProc.new(selector, proc, IMW::HTMLParserMatcher.build_parse_tree(matcher))
244
- end
244
+ def self.proc selector, proc, matcher=nil
245
+ MatchProc.new(selector, proc, IMW::Parsers::HtmlMatchers.build_parse_tree(matcher))
246
+ end
245
247
 
246
- # strip ","s (!! thus disrespecting locale !!!)
247
- # and convert to int
248
- def self.to_num selector, matcher=nil
249
- proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
250
- end
251
- def self.to_json selector, matcher=nil
252
- proc selector, lambda{|v| v.to_json if v }, matcher
253
- end
248
+ # strip ","s (!! thus disrespecting locale !!!)
249
+ # and convert to int
250
+ def self.to_num selector, matcher=nil
251
+ proc selector, lambda{|num| num.to_s.gsub(/,/,'').to_i if num }, matcher
252
+ end
253
+ def self.to_json selector, matcher=nil
254
+ proc selector, lambda{|v| v.to_json if v }, matcher
255
+ end
254
256
 
255
- def self.strip selector, matcher=nil
256
- proc selector, lambda{|v| v.strip }, matcher
257
- end
257
+ def self.strip selector, matcher=nil
258
+ proc selector, lambda{|v| v.strip }, matcher
259
+ end
258
260
 
259
- def self.re_group selector, re
260
- MatchRegexp.new(selector, re)
261
- end
262
- def self.re selector, re
263
- MatchRegexp.new(selector, re, nil, :capture => 1)
264
- end
265
- def self.re_all selector, re, matcher=nil
266
- MatchRegexpRepeatedly.new(selector, re)
267
- end
261
+ def self.re_group selector, re
262
+ MatchRegexp.new(selector, re)
263
+ end
264
+ def self.re selector, re
265
+ MatchRegexp.new(selector, re, nil, :capture => 1)
266
+ end
267
+ def self.re_all selector, re, matcher=nil
268
+ MatchRegexpRepeatedly.new(selector, re)
269
+ end
268
270
 
269
- # def self.plain_text selector, matcher=nil
270
- # proc selector, lambda{|el| el.inner_text if el }, matcher
271
- # end
271
+ # def self.plain_text selector, matcher=nil
272
+ # proc selector, lambda{|el| el.inner_text if el }, matcher
273
+ # end
272
274
 
273
- # attr_accessor :mapping
274
- #
275
- # #
276
- # # Feed me a hash and I'll semantify HTML
277
- # #
278
- # # The hash should magically adhere to the too-complicated,
279
- # # ever evolving goatrope that works for the below
280
- # #
281
- # #
282
- # def initialize mapping
283
- # self.mapping = mapping
284
- # end
285
- #
286
- # #
287
- # # take a document subtree,
288
- # # and a mapping of hpricot paths to that subtree's data mapping
289
- # # recursively extract that datamapping
290
- # #
291
- # def extract_tree hdoc, content, sub_mapping
292
- # data = { }
293
- # sub_mapping.each do |selector, target|
294
- # data[selector] = []
295
- # sub_contents = content/selector
296
- # sub_contents.each do |sub_content|
297
- # sub_data = {}
298
- # extract_node hdoc, sub_content, sub_data, selector, target
299
- # data[selector] << sub_data
300
- # end
301
- # end
302
- # data
303
- # # end
304
- # # if selector.is_a?(String)
305
- # # conts = (content)
306
- # # else
307
- # # conts = [content]
308
- # # end
309
- # # conts[0..0].each do |content|
310
- # # extract_node hdoc, content, data, selector, target
311
- # # end
312
- # # end
313
- # data
314
- # end
315
- #
316
- # #
317
- # # insert the extracted element into the data mapping
318
- # #
319
- # def extract_node hdoc, content, data, selector, target
320
- # classification = classify_node(selector, target)
321
- # result = \
322
- # case classification
323
- # when :subtree
324
- # target.each do |sub_selector, sub_target|
325
- # extract_node hdoc, content, data, sub_selector, sub_target
326
- # end
327
- #
328
- # when :sub_attribute
329
- # k, v = selector.to_a[0]
330
- # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
331
- # val = subcontent.attributes[v.to_s] if subcontent
332
- # data[target] = val unless val.blank?
333
- #
334
- # when :attribute then
335
- # val = content.attributes[selector.to_s]
336
- # data[target] = val unless val.blank?
337
- #
338
- # when :flatten_list
339
- # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
340
- # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
341
- #
342
- # when :inner_html
343
- # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
344
- # data[target] = subcontent.inner_html.strip if subcontent
345
- #
346
- # else
347
- # raise "classify_node shouldn't ever return #{classification}"
348
- # end
349
- # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
350
- # # puts '' if classification == :subtree
351
- # end
352
- #
353
- # def classify_node selector, target
354
- # case
355
- # when target.is_a?(Hash) then :subtree
356
- # when selector.is_a?(Hash) && (selector.length == 1) then
357
- # k, v = selector.to_a[0]
358
- # case v
359
- # when Symbol then :sub_attribute
360
- # end
361
- # when selector.is_a?(Symbol) then :attribute
362
- # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
363
- # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
364
- # else
365
- # raise "Can't classify mapping: " + [selector, target].join(" - ")
366
- # end
367
- # end
368
- #
369
- # # use #mapping to parse file
370
- # def parse link
371
- # begin hdoc = Hpricot(link.contents)
372
- # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
373
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
374
- # end
375
- #
376
- # # use #mapping to parse file
377
- # def parse_file filename
378
- # begin hdoc = Hpricot(File.open(filename))
379
- # rescue; warn "can't hpricot #{filename}" ; return false; end
380
- # raw_taggings = extract_tree hdoc, hdoc, self.mapping
381
- # end
275
+ # attr_accessor :mapping
276
+ #
277
+ # #
278
+ # # Feed me a hash and I'll semantify HTML
279
+ # #
280
+ # # The hash should magically adhere to the too-complicated,
281
+ # # ever evolving goatrope that works for the below
282
+ # #
283
+ # #
284
+ # def initialize mapping
285
+ # self.mapping = mapping
286
+ # end
287
+ #
288
+ # #
289
+ # # take a document subtree,
290
+ # # and a mapping of hpricot paths to that subtree's data mapping
291
+ # # recursively extract that datamapping
292
+ # #
293
+ # def extract_tree hdoc, content, sub_mapping
294
+ # data = { }
295
+ # sub_mapping.each do |selector, target|
296
+ # data[selector] = []
297
+ # sub_contents = content/selector
298
+ # sub_contents.each do |sub_content|
299
+ # sub_data = {}
300
+ # extract_node hdoc, sub_content, sub_data, selector, target
301
+ # data[selector] << sub_data
302
+ # end
303
+ # end
304
+ # data
305
+ # # end
306
+ # # if selector.is_a?(String)
307
+ # # conts = (content)
308
+ # # else
309
+ # # conts = [content]
310
+ # # end
311
+ # # conts[0..0].each do |content|
312
+ # # extract_node hdoc, content, data, selector, target
313
+ # # end
314
+ # # end
315
+ # data
316
+ # end
317
+ #
318
+ # #
319
+ # # insert the extracted element into the data mapping
320
+ # #
321
+ # def extract_node hdoc, content, data, selector, target
322
+ # classification = classify_node(selector, target)
323
+ # result = \
324
+ # case classification
325
+ # when :subtree
326
+ # target.each do |sub_selector, sub_target|
327
+ # extract_node hdoc, content, data, sub_selector, sub_target
328
+ # end
329
+ #
330
+ # when :sub_attribute
331
+ # k, v = selector.to_a[0]
332
+ # subcontent = (k[0..0] == '/') ? (hdoc.at(k)) : (content.at(k))
333
+ # val = subcontent.attributes[v.to_s] if subcontent
334
+ # data[target] = val unless val.blank?
335
+ #
336
+ # when :attribute then
337
+ # val = content.attributes[selector.to_s]
338
+ # data[target] = val unless val.blank?
339
+ #
340
+ # when :flatten_list
341
+ # subcontents = (selector[0..0] == '/') ? (hdoc/selector) : (content/selector)
342
+ # data[target.first] = subcontents.map{|subcontent| subcontent.inner_html }
343
+ #
344
+ # when :inner_html
345
+ # subcontent = (selector[0..0] == '/') ? (hdoc.at(selector)) : (content.at(selector))
346
+ # data[target] = subcontent.inner_html.strip if subcontent
347
+ #
348
+ # else
349
+ # raise "classify_node shouldn't ever return #{classification}"
350
+ # end
351
+ # # puts "%-19s %-19s %-31s %s" % [target.inspect[0..18], classification.inspect[0..18], selector.inspect[0..30], result.inspect[0..90]] if (classification == :sub_attribute)
352
+ # # puts '' if classification == :subtree
353
+ # end
354
+ #
355
+ # def classify_node selector, target
356
+ # case
357
+ # when target.is_a?(Hash) then :subtree
358
+ # when selector.is_a?(Hash) && (selector.length == 1) then
359
+ # k, v = selector.to_a[0]
360
+ # case v
361
+ # when Symbol then :sub_attribute
362
+ # end
363
+ # when selector.is_a?(Symbol) then :attribute
364
+ # when selector.is_a?(String) && target.is_a?(Array) then :flatten_list
365
+ # when selector.is_a?(String) && target.is_a?(Symbol) then :inner_html
366
+ # else
367
+ # raise "Can't classify mapping: " + [selector, target].join(" - ")
368
+ # end
369
+ # end
370
+ #
371
+ # # use #mapping to parse file
372
+ # def parse link
373
+ # begin hdoc = Hpricot(link.contents)
374
+ # rescue; warn "can't hpricot #{link.to_s}" ; return false; end
375
+ # raw_taggings = extract_tree hdoc, hdoc, self.mapping
376
+ # end
377
+ #
378
+ # # use #mapping to parse file
379
+ # def parse_file filename
380
+ # begin hdoc = Hpricot(File.open(filename))
381
+ # rescue; warn "can't hpricot #{filename}" ; return false; end
382
+ # raw_taggings = extract_tree hdoc, hdoc, self.mapping
383
+ # end
384
+ end
385
+ end
382
386
  end
387
+
data/lib/imw/parsers.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  module IMW
2
2
  module Parsers
3
- autoload :HTML, 'imw/parsers/html_parser'
3
+ autoload :HtmlParser, 'imw/parsers/html_parser'
4
4
  autoload :LineParser, 'imw/parsers/line_parser'
5
5
  autoload :RegexpParser, 'imw/parsers/regexp_parser'
6
6
  end
@@ -0,0 +1,35 @@
1
+ require 'imw/utils'
2
+
3
+ module IMW
4
+
5
+ # A Repository is a collection of datasets.
6
+ class Repository < Hash
7
+
8
+ # FIXME This should read some configuration settings somewhere and
9
+ # generate a pool specific to each IMW user.
10
+ def self.default
11
+ new
12
+ end
13
+
14
+ end
15
+
16
+ # The default repository managed by IMW.
17
+ REPOSITORY = Repository.default
18
+
19
+ # Add a dataset to the IMW::REPOSITORY. If the dataset has a
20
+ # +handle+ then it will be used as the key in this repository;
21
+ # otherwise the dataset's class will be used.
22
+ def self.add dataset
23
+ REPOSITORY[dataset.handle] = dataset
24
+ end
25
+
26
+ # Remove a dataset from the IMW::REPOSITORY. Can pass in either a
27
+ # string handle or an instance of the dataset.
28
+ def self.delete handle
29
+ handle = handle.handle if handle.respond_to?(:handle)
30
+ REPOSITORY.delete(handle)
31
+ end
32
+
33
+ end
34
+
35
+
data/lib/imw/runner.rb ADDED
@@ -0,0 +1,114 @@
1
+ require 'imw'
2
+ require 'optparse'
3
+
4
+ module IMW
5
+
6
+ RunnerError = Class.new(IMW::Error)
7
+
8
+ class Runner
9
+
10
+ DEFAULT_OPTIONS = {
11
+ :requires => [],
12
+ :selectors => [],
13
+ :dry_run => false
14
+ }
15
+
16
+ attr_reader :args, :options
17
+
18
+ def initialize *args
19
+ @args = args
20
+ @options = DEFAULT_OPTIONS.dup
21
+ parser.parse!(args) # will trim options from args
22
+ end
23
+
24
+ def parser
25
+ OptionParser.new do |opts|
26
+ opts.banner = "usage: imw [OPTIONS] TASK"
27
+ opts.separator <<EOF
28
+
29
+ Run TASK for all datasets in the repository. IMW will read any
30
+ *.imw files in the current directory by default.
31
+
32
+ Options include
33
+
34
+ EOF
35
+
36
+ opts.on('-l', '--list', "List datasets in repository") do
37
+ options[:list] = true
38
+ end
39
+
40
+ opts.on('-s', '--selector SELECTOR', "Filter datasets by regexp SELECTOR. Can be given more than once.") do |selector|
41
+ options[:selectors] << selector
42
+ end
43
+
44
+ opts.on('-r', '--require PATH', "Require PATH. Can be given more than once.") do |path|
45
+ options[:requires] << path
46
+ end
47
+
48
+ end
49
+ end
50
+
51
+ def require_files
52
+ Dir['*.imw'].each { |path| load File.expand_path(path) }
53
+ options[:requires].each do |path|
54
+ IMW.open(path) do |requireable|
55
+ if requireable.directory?
56
+ requireable["**/*.rb"].each { |file| require file }
57
+ requireable["**/*.imw"].each { |file| load file }
58
+ else
59
+ require requireable.path
60
+ end
61
+ end
62
+ end
63
+ end
64
+
65
+ def task
66
+ args.first
67
+ end
68
+
69
+ def handles
70
+ matched_handles = Set.new
71
+ if options[:selectors].blank?
72
+ matched_handles += IMW::REPOSITORY.keys
73
+ else
74
+ keys = IMW::REPOSITORY.keys
75
+ unless keys.empty?
76
+ options[:selectors].each do |selector|
77
+ matched_handles += keys.find_all { |key| key =~ Regexp.new(selector) }
78
+ end
79
+ end
80
+ end
81
+ matched_handles.to_a.sort
82
+ end
83
+
84
+ def datasets
85
+ handles.map { |handle| IMW::REPOSITORY[handle] }
86
+ end
87
+
88
+ def list!
89
+ puts handles
90
+ exit
91
+ end
92
+
93
+ def run_task!
94
+ datasets.each do |dataset|
95
+ dataset[task].invoke
96
+ end
97
+ exit
98
+ end
99
+
100
+ def run!
101
+ require_files
102
+ case
103
+ when options[:list]
104
+ list!
105
+ when task.blank?
106
+ puts parser
107
+ exit 1
108
+ else
109
+ run_task!
110
+ end
111
+ end
112
+ end
113
+ end
114
+
@@ -1,19 +1,3 @@
1
- #
2
- # h2. lib/imw/utils/extensions/core.rb -- extensions to the Ruby core
3
- #
4
- # == About
5
- #
6
- # Some useful extensions to basic Ruby classes. This file is required
7
- # by <tt>imw/utils</tt> so any files required here are automatically
8
- # required when loading IMW.
9
- #
10
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
- # Copyright:: Copyright (c) 2008 infochimps.org
12
- # License:: GPL 3.0
13
- # Website:: http://infinitemonkeywrench.org/
14
- #
15
- # puts "#{File.basename(__FILE__)}: Your monkeywrench does a complicated series of core-burning exercises and emerges with ripped, powerful-looking abs."
16
-
17
1
  require 'imw/utils/extensions/string'
18
2
  require 'imw/utils/extensions/array'
19
3
  require 'imw/utils/extensions/hash'
@@ -1,20 +1,3 @@
1
- #
2
- # h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
3
- #
4
- # == About
5
- #
6
- # IMW uses lots of different directories to keep information on data
7
- # and datasets separate. This module interfaces with the
8
- # configuration files to establish the paths to these IMW directories
9
- # and provides functions and mixins for IMW objects to use to access
10
- # these paths.
11
- #
12
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
13
- # Copyright:: Copyright (c) 2008 infochimps.org
14
- # License:: GPL 3.0
15
- # Website:: http://infinitemonkeywrench.org/
16
- #
17
-
18
1
  module IMW
19
2
 
20
3
  # Implements methods designed to work with an object's
@@ -67,17 +50,6 @@ module IMW
67
50
  end
68
51
  end
69
52
 
70
- class Dataset
71
- attr_reader :paths
72
- include IMW::Paths
73
-
74
- private
75
- def set_paths
76
- @paths = {}
77
- add_path :self, File.dirname(eval('__FILE__'))
78
- end
79
- end
80
-
81
53
  def self.path_to *pathsegs
82
54
  begin
83
55
  path = Pathname.new IMW.path_to_helper(*pathsegs)
data/lib/imw.rb CHANGED
@@ -1,42 +1,31 @@
1
- #
2
- # h2. lib/imw.rb -- main imw file
3
- #
4
- # == About
5
- #
6
- # This file is the entry-point to the IMW library. It loads a minimal
7
- # setup. Optional components can be loaded by calling the function
8
- # <tt>IMW.imw_components</tt>.
9
- #
10
- # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
11
- # Copyright:: Copyright (c) 2008 infochimps.org
12
- # License:: GPL 3.0
13
- # Website:: http://infinitemonkeywrench.org/
14
- #
15
- # puts "#{File.basename(__FILE__)}: Behold, the weighty, the munificent, the Infinite Monkeywrench! Approach it with care: it has overwhelmed mightier monkeys than ye."
16
-
17
1
  require 'rubygems'
18
- require 'YAML' unless defined?('YAML') # some stupid collision with datamapper makes it double include
19
2
  require 'imw/boot'
20
3
  require 'imw/utils'
21
4
  require 'imw/dataset'
5
+ require 'imw/repository'
22
6
  require 'imw/files'
23
7
  require 'imw/parsers'
24
8
  require 'imw/packagers'
25
9
 
26
- # The Infinite Monkeywrench (IMW) is a Ruby library for obtaining,
27
- # parsing, transforming, reconciling, and packaging datasets.
28
- #
29
- # Data is obtained via FIXME
30
- #
31
- # Data is loaded into IMW using <tt>IMW.open</tt> which provides a
32
- # uniform interface across a variety of data formats. The objects
33
- # returned will each have +load+ method which will return data in the
34
- # best form for further processing. If the data is a YAML file, then
35
- # Ruby's +YAML+ library will be used to return primitive Ruby objects,
36
- # if it is a CSV, then the +FasterCSV+ library will be used, &c.
37
- #
38
- # The main interface to handling data is the <tt>IMW::Dataset</tt>
39
- # class. It has methods for summarizing, transforming, and dumping
40
- # data to a variety of formats.
10
+ # The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
11
+ # extracting, parsing, munging, and packaging datasets. It allows you
12
+ # to handle different data formats transparently as well as organize
13
+ # transformations of data as a network of dependencies (a la Make or
14
+ # Rake).
15
+ #
16
+ # On first reading of IMW examine the classes within the IMW::Files
17
+ # module, all transparently instantiated when using IMW.open (instead
18
+ # of File.open). These classes do a lot of work to ensure that all
19
+ # objects returned by IMW.open share methods (write, read, load, dump,
20
+ # parse, compress, extract, &c.) while continuing to use existing
21
+ # implementations of these concepts.
22
+ #
23
+ # Another entry point is the <tt>IMW::Dataset</tt> class. It
24
+ # leverages Rake to craft workflows for transforming datasets. IMW
25
+ # encourages you to organize your data transformations in a step-wise
26
+ # process, managed with dependencies.
27
+ #
28
+ # Utilities to help with one step in particular (ripping, parsing,
29
+ # packaging, &c.) are in their own directories.
41
30
  module IMW
42
31
  end