imw 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,67 @@
1
+ require 'logger'
2
+
3
module IMW
  # Destination handed to Logger.new, and the timestamp format applied to the
  # logger. Either constant may be defined before this file is loaded to
  # override the default given here.
  LOG_FILE_DESTINATION = STDERR unless defined?(LOG_FILE_DESTINATION)
  LOG_TIMEFORMAT = "%Y%m%d-%H:%M:%S " unless defined?(LOG_TIMEFORMAT)

  class << self; attr_accessor :log end

  #
  # Create a Logger and point it at LOG_FILE_DESTINATION
  #
  # LOG_FILE_DESTINATION is STDERR by default; redefine it in your
  # ~/.imwrc, or set IMW.log yourself, if that's not cool.
  #
  # An already-assigned IMW.log is kept; its timestamp format and level are
  # (re)set to LOG_TIMEFORMAT and Logger::INFO.
  #
  def self.instantiate_logger!
    IMW.log ||= Logger.new(LOG_FILE_DESTINATION)
    # Use the constant, not a repeated literal, so that an overridden
    # LOG_TIMEFORMAT actually takes effect.
    IMW.log.datetime_format = LOG_TIMEFORMAT
    IMW.log.level = Logger::INFO
  end

  #
  # Log +events+ (joined with newlines) as a single message.
  #
  # A trailing options hash may carry <tt>:level</tt> (defaults to
  # Logger::INFO).  Relies on Array#extract_options! and Hash#reverse_merge!,
  # which are not defined in this file.
  #
  def announce *events
    options = events.extract_options!
    options.reverse_merge! :level => Logger::INFO
    IMW.log.add options[:level], events.join("\n")
  end

  #
  # Announce each of +events+ on its own line, framed above and below by a
  # row of 75 asterisks.  Takes the same trailing options hash as +announce+.
  #
  def banner *events
    options = events.extract_options!
    options.reverse_merge! :level => Logger::INFO
    rule = "*" * 75
    [rule, events, rule].flatten.each { |ev| announce(ev, options) }
  end

  PROGRESS_TRACKERS = {}
  #
  # When the slowly-changing tracked variable +val+ changes value,
  # announce its new value.  Always announces on first call.
  #
  # Ex:
  #   track_progress :indexing_names, name[0..0] # announce at each initial letter
  #   track_progress :files, (i % 1000)          # announce at each 1,000 iterations
  #
  def track_progress tracker, val
    # Nothing to say when this tracker has already reported this exact value.
    return if IMW::PROGRESS_TRACKERS.include?(tracker) &&
              IMW::PROGRESS_TRACKERS[tracker] == val
    announce "#{tracker.to_s.gsub(/_/,' ')}: #{val}"
    IMW::PROGRESS_TRACKERS[tracker] = val
  end

  PROGRESS_COUNTERS = {}
  #
  # Log repetitions in a given context
  #
  # Counts calls per +tracker+ and reports the count rounded down to a
  # multiple of +every+ (default 1000) through +track_progress+, so a line is
  # announced on the first call and then each time another +every+ calls
  # have accumulated.
  #
  def track_count tracker, every=1000
    PROGRESS_COUNTERS[tracker] ||= 0
    PROGRESS_COUNTERS[tracker]  += 1
    chunk = every * (PROGRESS_COUNTERS[tracker]/every).to_i
    track_progress "count_of_#{tracker}", chunk
  end
end

#
# Make the default logger
#
IMW.instantiate_logger!
@@ -0,0 +1,63 @@
1
module IMW
  # The current time, in UTC, rendered with IMW::STRFTIME_FORMAT.
  def self.current_utc_time_string
    utc_now = Time.now.utc
    utc_now.strftime(IMW::STRFTIME_FORMAT)
  end


  # A small numeric counter.  Read it with +value+, bump it with +add+:
  #
  #   counter = IMW::Counter.new
  #   counter.value #=> 0
  #   counter.add 1
  #   counter.value #=> 1
  #
  # +next!+ behaves like C's <tt>value++</tt>: it hands back the current
  # +value+ and only _then_ increments it.
  #
  #   counter = IMW::Counter.new
  #   counter.value #=> 0
  #   counter.next! #=> 0
  #   counter.value #=> 1
  #
  # A counter can be put back to its starting point:
  #
  #   counter.reset!
  #   counter.value #=> 0
  class Counter

    attr_accessor :value, :starting_value, :increment

    # Build a Counter beginning at +starting_value+ (0 unless given) that
    # moves by +increment+ (1 unless given).
    def initialize starting_value=0,increment=1
      @starting_value, @increment = starting_value, increment
      @value = starting_value
    end

    # Advance the counter by +amount+, or by <tt>@increment</tt> when no
    # amount is supplied.  Returns the new value.
    def add amount=nil
      step = amount || @increment
      @value = @value + step
    end
    alias add! add

    # Advance the counter by <tt>@increment</tt>, returning the value it
    # held _before_ the increment.
    def next!
      @value.tap { @value += @increment }
    end

    # Set the counter to +value+, or back to <tt>@starting_value</tt> when no
    # value is supplied.
    def reset! value=nil
      target = value || @starting_value
      @value = target
    end
  end
end
62
+
63
+ # puts "#{File.basename(__FILE__)}: Your Monkeywrench seems suddenly more utilisable." # at bottom
@@ -0,0 +1,115 @@
1
+ #
2
+ # h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
3
+ #
4
+ # == About
5
+ #
6
+ # IMW uses lots of different directories to keep information on data
7
+ # and datasets separate. This module interfaces with the
8
+ # configuration files to establish the paths to these IMW directories
9
+ # and provides functions and mixins for IMW objects to use to access
10
+ # these paths.
11
+ #
12
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
13
+ # Copyright:: Copyright (c) 2008 infochimps.org
14
+ # License:: GPL 3.0
15
+ # Website:: http://infinitemonkeywrench.org/
16
+ #
17
+
18
require 'pathname' # Pathname is used by both path_to implementations below

module IMW

  # Implements methods designed to work with an object's
  # <tt>@paths</tt> attribute, adding and deleting symbolic
  # references to paths and expanding calls to +path_to+ from that
  # attribute or (when a miss) from <tt>IMW::PATHS</tt>.
  #
  # An including class should therefore define a hash attribute
  # <tt>@paths</tt> mapping symbols to path segments.  When
  # <tt>@paths</tt> has not been set, lookups fall straight through to
  # <tt>IMW::PATHS</tt> and +add_path+ creates the hash on demand.
  module Paths

    # Expands a shorthand workflow path specification to an
    # actual file path.
    #
    #   add_path :mlb_08, 'gd2.mlb.com/components/game/mlb/year_2008'
    #   path_to :ripd, :mlb_08, 'month_06', 'day_08', 'miniscoreboard.xml'
    #   => (...)/data/ripd/gd2.mlb.com/components/game/mlb/year_2008/month_06/day_08/miniscoreboard.xml
    #
    # Absolute results are passed through File.expand_path; relative results
    # are returned as joined.  Any failure during expansion is re-raised as
    # a RuntimeError whose message names the requested segments.
    def path_to *pathsegs
      begin
        path = Pathname.new path_to_helper(*pathsegs)
        path.absolute? ? File.expand_path(path) : path.to_s
      # StandardError, not Exception: interrupts, exits and out-of-memory
      # conditions must not be converted into a "can't find path" error.
      # NOTE(review): assumes IMW::PathError descends from StandardError -- confirm.
      rescue StandardError => e
        raise("Can't find path to '#{pathsegs}': #{e}")
      end
    end

    # Adds a symbolic path for expansion by +path_to+.
    def add_path sym, *pathsegs
      @paths ||= {}
      @paths[sym] = pathsegs.flatten
    end

    # Removes a symbolic path for expansion by +path_to+.  Returns the
    # removed segments, or nil when +sym+ was not registered.
    def remove_path sym
      @paths.delete sym if @paths && @paths.include?(sym)
    end

    private
    # +path_to_helper+ handles the recursive calls for +path_to+: symbols
    # are looked up first in <tt>@paths</tt>, then in <tt>IMW::PATHS</tt>;
    # everything else is used literally.  Raises IMW::PathError for a symbol
    # found in neither.
    def path_to_helper *pathsegs # :nodoc:
      own_paths = @paths || {}
      expanded = pathsegs.flatten.compact.map do |pathseg|
        case
        when !pathseg.is_a?(Symbol)         then pathseg
        when own_paths.include?(pathseg)    then path_to(own_paths[pathseg])
        when IMW::PATHS.include?(pathseg)   then path_to(IMW::PATHS[pathseg])
        else raise IMW::PathError.new("No path expansion set for #{pathseg.inspect}")
        end
      end
      File.join(*expanded)
    end
    public
  end

  class Dataset
    attr_reader :paths
    include IMW::Paths

    private
    # Initializes the per-dataset path table with a single <tt>:self</tt>
    # entry.
    # NOTE(review): <tt>eval('__FILE__')</tt> is evaluated in eval's own
    # context, so this may not yield the directory of the file that defines
    # the dataset -- confirm the intended behaviour.
    def set_paths
      @paths = {}
      add_path :self, File.dirname(eval('__FILE__'))
    end
  end

  # Module-level counterpart of Paths#path_to, expanding symbols from
  # <tt>IMW::PATHS</tt> only.  Failures are re-raised as RuntimeError.
  def self.path_to *pathsegs
    begin
      path = Pathname.new IMW.path_to_helper(*pathsegs)
      path.absolute? ? File.expand_path(path) : path.to_s
    # See Paths#path_to for why this is StandardError rather than Exception.
    rescue StandardError => e
      raise("Can't find path to '#{pathsegs}': #{e}")
    end
  end

  # +path_to_helper+ handles the recursive calls for +path_to+.
  #
  # This stays publicly callable: it is invoked above with an explicit
  # receiver, and a bare +private+ keyword does not apply to
  # <tt>def self.</tt> methods anyway.
  def self.path_to_helper *pathsegs # :nodoc:
    expanded = pathsegs.flatten.compact.map do |pathseg|
      case
      when !pathseg.is_a?(Symbol)         then pathseg
      when IMW::PATHS.include?(pathseg)   then path_to(IMW::PATHS[pathseg])
      else raise IMW::PathError.new("No path expansion set for #{pathseg.inspect}")
      end
    end
    File.join(*expanded)
  end

  # Adds a symbolic path for expansion by +path_to+.
  def self.add_path sym, *pathsegs
    IMW::PATHS[sym] = pathsegs.flatten
  end

  # Removes a symbolic path for expansion by +path_to+.
  def self.remove_path sym
    IMW::PATHS.delete sym if IMW::PATHS.include? sym
  end
end
114
+
115
+ # puts "#{File.basename(__FILE__)}: Your monkeywrench glows alternately dim then bright as you wander, suggesting to you which paths to take."
@@ -0,0 +1,59 @@
1
+ require 'imw/utils'
2
+ require 'imw/utils/uuid'
3
+ require 'addressable/uri'
4
module Addressable
  #
  # Adds validity predicates plus the #revhost and #url_uuid calls
  #
  class URI
    # Character-class fragments (meant to be spliced inside <tt>[...]</tt>),
    # stored as Regexp objects.  Always splice them via <tt>#source</tt>:
    # interpolating the Regexp itself inserts its <tt>(?-mix:...)</tt>
    # wrapper into the class, where <tt>?-m</tt> is read as a character range
    # and lets through <tt>[ ] \ ^ `</tt> among others.
    SAFE_CHARS     = %r{a-zA-Z0-9\-\._!\(\)\*\'}
    PATH_CHARS     = %r{#{SAFE_CHARS.source}\$&\+,:=@\/;}
    RESERVED_CHARS = %r{\$&\+,:=@\/;\?\%}
    UNSAFE_CHARS   = %r{\\ \"\#<>\[\]\^\`\|\~\{\}}
    HOST_HEAD      = '(?:[a-z0-9\-]+\.)+'
    HOST_TLD       = '(?:[a-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'

    # True when +host+ is one or more dot-separated labels followed by a
    # TLD from HOST_TLD (case-insensitive).
    def host_valid?
      !!(host =~ %r{\A#{HOST_HEAD}#{HOST_TLD}\z}i)
    end

    # True when +path+ consists solely of PATH_CHARS characters and
    # <tt>%</tt> (an empty path is valid).
    def path_valid?
      !!(path =~ %r{\A[#{PATH_CHARS.source}%]*\z})
    end

    # True when the scheme is http or absent, the port is 80 or absent, and
    # neither a user nor a password is present.
    def simple_connection_part?
      ( ['http', nil].include?(scheme) &&
        [80,     nil].include?(port)   &&
        (self.to_hash.values_at(:password, :user).join.blank?) )
    end

    #
    # Does this look like a simple URL: a valid host, a valid path, and
    # nothing unusual in the connection part?
    #
    def simple?
      host_valid? && path_valid? && simple_connection_part?
    end

    #
    # +revhost+
    # the dot-reversed host:
    #   foo.company.com => com.company.foo
    #
    # A host without any dot (including a nil host) is returned unchanged.
    #
    def revhost
      return host unless host =~ /\./
      host.split('.').reverse.join('.')
    end

    #
    # +url_uuid+ -- RFC-4122 ver.5 (SHA-1, namespaced) uuid computed from
    # the normalized URI within UUID_URL_NAMESPACE
    #
    # See
    #   http://www.faqs.org/rfcs/rfc4122.html
    #
    def url_uuid
      UUID.sha1_create(UUID_URL_NAMESPACE, self.normalize.to_s)
    end
  end
end
55
+
56
class << Addressable::URI
  # Provide encode_segment / unencode_segment as class-level aliases of
  # encode_component / unencode_component when they are not already there.
  #
  # The check uses method_defined? because <tt>defined?(encode_segment)</tt>
  # in this position asks whether the singleton-class object itself responds
  # to the name, which never reflects Addressable::URI's class methods.
  alias_method :encode_segment,   :encode_component   unless method_defined?(:encode_segment)
  alias_method :unencode_segment, :unencode_component unless method_defined?(:unencode_segment)
end
@@ -0,0 +1,33 @@
1
+ require 'uuidtools'
2
+
3
class UUID

  #
  # A string suitable for using as a path name: <tt>urn_uuid/</tt> followed
  # by +to_s+ with every ':' and '-' turned into '/'.
  #
  # Ex.
  #    3c0dce44-80a8-11dd-a897-001ff35a0a8b =>
  #    urn_uuid/3c0dce44/80a8/11dd/a897/001ff35a0a8b
  #
  # It's well possible there are more perspicacious choices for points to split
  # the string, but until we hit that limit this'll do.
  #
  def to_path
    'urn_uuid/' + to_s.gsub(/[\:\-]/,'/')
  end

  #
  # Insert the canonical dashes (8-4-4-4-12) into a run of 32 hex digits.
  # Hex digits are matched case-insensitively and returned in the case given.
  #
  # Raises ArgumentError when +str+ does not contain 32 consecutive hex
  # digits (previously this surfaced as a NoMethodError on nil).
  #
  def self.hex_to_str str
    hex   = /([\da-f]{8})([\da-f]{4})([\da-f]{4})([\da-f]{4})([\da-f]{12})/i
    match = str && hex.match(str)
    raise ArgumentError, "not a 32-digit hex UUID: #{str.inspect}" unless match
    match.captures.join '-'
  end


  # Parse a UUID given as 32 undashed hex digits, by dashing it with
  # +hex_to_str+ and handing the result to +parse+.
  def self.parse_hex str
    parse(UUID.hex_to_str(str))
  end

  # Overrides UUIDTools -- force 32 hex digits (leading zeros)
  def hexdigest
    "%032x" % self.to_i
  end

end
@@ -0,0 +1,38 @@
1
+
2
+
3
# Return true if <tt>email</tt> is a valid email address
#
# The address must have exactly one '@', a non-empty local part made only of
# RFC 5322 "atext" characters and dots (no leading, trailing or doubled
# dot), and a domain accepted by +is_domain?+.  Quoted local parts are not
# supported.
#
# Raises ArgumentError unless <tt>email</tt> is a String.
def is_email?(email)
  raise ArgumentError, "'email' must be a string" unless email.is_a?(String)
  return false if email.empty?

  parts = email.split('@')
  return false if parts.size != 2

  local, domain = parts
  # Anchored, and requiring at least one character: the unanchored `*` form
  # matched every string and so never rejected anything.
  return false unless local =~ /\A[a-zA-Z0-9!\#\$%&'*+\/=?^_`{|}~.\-]+\z/ # allowed characters
  return false if local[0,1]  == '.'     # starts with .
  return false if local[-1,1] == '.'     # end with .
  return false if local.include?('..')   # can't repeat .

  is_domain?(domain) ? true : false
end
22
+
23
+ # Return true if <tt>domain</tt> is a valid domain name
24
+ def is_domain?(domain)
25
+ raise ArgumentError, "'domain' must be a string" if domain.class != String
26
+ return false if domain.empty?
27
+
28
+ return false if domain.size > 255 # max number of characters in a domain
29
+ return false if not domain =~ /^[a-zA-Z0-9.\-]+$/ # allowed characters
30
+ parts = domain.split('.')
31
+ return false if parts.size > 127 # max number of subdomains
32
+ parts.all? {|part| return false if part.size > 63} # max number of characters in a subdomain
33
+
34
+ return true
35
+ end
36
+
37
+
38
+ # puts "#{File.basename(__FILE__)}: As you shape your body to the confines of your container you feel a tremendous sense of validation." # at bottom
@@ -0,0 +1,12 @@
1
+ # copied from activewarehouse-etl gem
2
module IMWVersion #:nodoc:
  # Version components and their dotted string form.  Skipped entirely when a
  # VERSION constant is already visible from here.
  # NOTE(review): these read 0.0.0 -- confirm this is meant to track the
  # gem's released version.
  unless defined?(VERSION)
    module VERSION #:nodoc:
      MAJOR, MINOR, TINY = 0, 0, 0

      STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
    end
  end
end
@@ -0,0 +1,113 @@
1
+
2
class ActiveRecord::Base
  class << self
  end

  # Earlier, disabled attempt at an association-aware merge; kept for
  # reference.
  #
  # def merge!(hsh)
  #   hsh = hsh.dup
  #   # puts hsh.to_yaml
  #   # has_many datasets, notes, fields, contributors
  #   self.class.reflect_on_all_associations.each do |ass|
  #     # ["@macro", "@class_name", "@name", "@primary_key_name", "@options",
  #     #   "@klass",
  #     #   "@through_reflection",
  #     #   "@active_record",
  #     puts [ass.name, ass.macro, ass.primary_key_name].to_yaml
  #     if ass.macro == :has_many
  #       els = hsh.delete(ass.name.to_s) || []
  #       puts "!!!!!!!!!!!!!!!!!!!!!!!!!!", els, '!!'
  #       els.each do |el|
  #         puts el
  #         self[ass.name] = ass.klass.new().merge!(el)
  #       end
  #     end
  #     hsh.each do |key,val|
  #       self[key] = val
  #     end
  #     p self
  #     p self.datasets if self.respond_to? 'datasets'
  #   end
  # end

  # Copy every key/value pair of +hsh+ onto this record through
  # <tt>self[key] = value</tt>, save it with <tt>save!</tt>, and return the
  # record.  Prints the incoming hash (as JSON) to stdout first.
  def undump(hsh)
    puts "unumping from #{hsh.to_json}"
    hsh.each do |attribute, value|
      self[attribute] = value
    end
    save!
    self
  end
end
37
+
38
class Pool < ActiveRecord::Base
  # Rebuild a Pool from a dumped hash.
  #
  # The nested lists stored under "datasets", "fields", "contributors" and
  # "pool_notes" are removed from +hsh+; each entry is turned into a saved
  # record of the matching class and the resulting list is assigned to this
  # record under the same key.  What is left of +hsh+ is then handled by the
  # inherited +undump+.  Returns self.
  def undump(hsh)
    nested_classes = { :datasets     => Dataset,     :fields     => Field,
                       :contributors => Contributor, :pool_notes => PoolNote }
    nested_classes.each do |field, klass|
      key  = field.to_s
      vals = hsh.delete(key) || []
      puts "Undumping #{vals} info #{field}"
      self[key] = vals.map do |val|
        record = klass.new.undump(val)
        record.save!
        record
      end
    end
    super
    self
  end
end
50
+
51
class Dataset < ActiveRecord::Base
  # Rebuild a Dataset from a dumped hash.
  #
  # The nested lists stored under "datasets", "fields", "contributors" and
  # "dataset_notes" are removed from +hsh+; each entry is turned into a saved
  # record of the matching class and the resulting list is assigned to this
  # record under the same key.  What is left of +hsh+ is then handled by the
  # inherited +undump+, after which the record is printed as YAML.
  # Returns self.
  def undump(hsh)
    nested_classes = { :datasets     => Dataset,     :fields        => Field,
                       :contributors => Contributor, :dataset_notes => DatasetNote }
    nested_classes.each do |field, klass|
      key  = field.to_s
      vals = hsh.delete(key) || []
      puts "Undumping #{vals} info #{field}"
      self[key] = vals.map do |val|
        record = klass.new.undump(val)
        record.save!
        record
      end
    end
    super
    puts "Got Dataset #{self.to_yaml}"
    self
  end
end
64
+
65
class IMW < OpenStruct

  #
  # Intended to take an Infochimps Stupid Schema stream and construct the
  # corresponding objects.  Not implemented yet: the method body is empty
  # and it returns nil.
  #
  # The rules the loader is meant to follow:
  # * the schema has the structure
  #     # this has to be first.
  #     - infochimps_schema:
  #         schema_version: 0.2           # in case stuff changes
  #     # then any number of imw objects:
  #     - pool: (...)
  #         fields: [era, innings_pitched, ...]
  #     - dataset: (...)
  #         fields:
  #           - name:         Earned Run Average
  #             handle:       era
  #             concept:      baseball-era
  #             units:        earned_runs / (9*innings_pitched)
  #     - contributor: (...)
  #     - field: (...)
  #
  # * Objects are referred to by __handle__, *NOT* __id__.  If an ID is
  #   included, and an object exists with a non-matching ID or handle,
  #   an error will be raised.
  #
  # * The schema files should be maintainable by hand, so the loader tries
  #   to be smart about inline-defined objects: you can either refer (via
  #   handle) to a field defined elsewhere, or define the field in whole and
  #   trust that the Right Thing will happen.  This raises the problem of
  #   collisions: when a bulk object update arrives, whom do we believe --
  #   bulk loader or database?  In the absence of versioning, the object is
  #   looked up by its handle.  If there's an existing object, any new
  #   information (fields with values in new that are blank in old) is added
  #   to it.  If the object is defined at the top level, it wins; if the
  #   object is defined as a sub field it loses.
  #
  # * Every interesting object (Pool, Dataset, Contributor, Field) has a desc:
  #   attribute (for Pool and Dataset it's virtual but never mind) to describe
  #   __itself__.  Additionally, every interesting relationship has its own
  #   desc: field.
  #

  def self.undump(schema)
    # compact then merge -- kill off blank
    nil
  end
end