imw 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/.gitignore +15 -0
  2. data/CHANGELOG +0 -0
  3. data/LICENSE +674 -0
  4. data/README.rdoc +101 -0
  5. data/Rakefile +20 -0
  6. data/VERSION +1 -0
  7. data/etc/imwrc.rb +76 -0
  8. data/lib/imw.rb +42 -0
  9. data/lib/imw/boot.rb +58 -0
  10. data/lib/imw/dataset.rb +233 -0
  11. data/lib/imw/dataset/datamapper.rb +66 -0
  12. data/lib/imw/dataset/datamapper/time_and_user_stamps.rb +37 -0
  13. data/lib/imw/dataset/loaddump.rb +50 -0
  14. data/lib/imw/dataset/old/file_collection.rb +88 -0
  15. data/lib/imw/dataset/old/file_collection_utils.rb +71 -0
  16. data/lib/imw/dataset/scaffold.rb +132 -0
  17. data/lib/imw/dataset/scraped_uri.rb +305 -0
  18. data/lib/imw/dataset/scrub/old_working_scrubber.rb +87 -0
  19. data/lib/imw/dataset/scrub/scrub.rb +147 -0
  20. data/lib/imw/dataset/scrub/scrub_simple_url.rb +38 -0
  21. data/lib/imw/dataset/scrub/scrub_test.rb +60 -0
  22. data/lib/imw/dataset/scrub/slug.rb +101 -0
  23. data/lib/imw/dataset/stats.rb +73 -0
  24. data/lib/imw/dataset/stats/counter.rb +23 -0
  25. data/lib/imw/dataset/task.rb +38 -0
  26. data/lib/imw/dataset/workflow.rb +81 -0
  27. data/lib/imw/files.rb +110 -0
  28. data/lib/imw/files/archive.rb +113 -0
  29. data/lib/imw/files/basicfile.rb +122 -0
  30. data/lib/imw/files/binary.rb +28 -0
  31. data/lib/imw/files/compressed_file.rb +93 -0
  32. data/lib/imw/files/compressed_files_and_archives.rb +348 -0
  33. data/lib/imw/files/compressible.rb +103 -0
  34. data/lib/imw/files/csv.rb +112 -0
  35. data/lib/imw/files/json.rb +41 -0
  36. data/lib/imw/files/sgml.rb +65 -0
  37. data/lib/imw/files/text.rb +68 -0
  38. data/lib/imw/files/yaml.rb +46 -0
  39. data/lib/imw/packagers.rb +8 -0
  40. data/lib/imw/packagers/archiver.rb +108 -0
  41. data/lib/imw/packagers/s3_mover.rb +28 -0
  42. data/lib/imw/parsers.rb +7 -0
  43. data/lib/imw/parsers/html_parser.rb +382 -0
  44. data/lib/imw/parsers/html_parser/matchers.rb +306 -0
  45. data/lib/imw/parsers/line_parser.rb +87 -0
  46. data/lib/imw/parsers/regexp_parser.rb +72 -0
  47. data/lib/imw/utils.rb +24 -0
  48. data/lib/imw/utils/components.rb +61 -0
  49. data/lib/imw/utils/config.rb +46 -0
  50. data/lib/imw/utils/error.rb +54 -0
  51. data/lib/imw/utils/extensions/array.rb +125 -0
  52. data/lib/imw/utils/extensions/class/attribute_accessors.rb +8 -0
  53. data/lib/imw/utils/extensions/core.rb +43 -0
  54. data/lib/imw/utils/extensions/dir.rb +24 -0
  55. data/lib/imw/utils/extensions/file_core.rb +64 -0
  56. data/lib/imw/utils/extensions/hash.rb +218 -0
  57. data/lib/imw/utils/extensions/hpricot.rb +48 -0
  58. data/lib/imw/utils/extensions/string.rb +49 -0
  59. data/lib/imw/utils/extensions/struct.rb +42 -0
  60. data/lib/imw/utils/extensions/symbol.rb +28 -0
  61. data/lib/imw/utils/extensions/typed_struct.rb +22 -0
  62. data/lib/imw/utils/extensions/uri.rb +59 -0
  63. data/lib/imw/utils/log.rb +67 -0
  64. data/lib/imw/utils/misc.rb +63 -0
  65. data/lib/imw/utils/paths.rb +115 -0
  66. data/lib/imw/utils/uri.rb +59 -0
  67. data/lib/imw/utils/uuid.rb +33 -0
  68. data/lib/imw/utils/validate.rb +38 -0
  69. data/lib/imw/utils/version.rb +12 -0
  70. data/lib/imw/utils/view.rb +113 -0
  71. data/lib/imw/utils/view/dump_csv.rb +112 -0
  72. data/lib/imw/utils/view/dump_csv_older.rb +117 -0
  73. data/spec/data/sample.csv +131 -0
  74. data/spec/data/sample.tsv +131 -0
  75. data/spec/data/sample.txt +131 -0
  76. data/spec/data/sample.xml +653 -0
  77. data/spec/data/sample.yaml +652 -0
  78. data/spec/imw/dataset/datamapper/uri_spec.rb +43 -0
  79. data/spec/imw/dataset/datamapper_spec_helper.rb +11 -0
  80. data/spec/imw/files/archive_spec.rb +118 -0
  81. data/spec/imw/files/basicfile_spec.rb +121 -0
  82. data/spec/imw/files/bz2_spec.rb +32 -0
  83. data/spec/imw/files/compressed_file_spec.rb +96 -0
  84. data/spec/imw/files/compressible_spec.rb +100 -0
  85. data/spec/imw/files/file_spec.rb +144 -0
  86. data/spec/imw/files/gz_spec.rb +32 -0
  87. data/spec/imw/files/rar_spec.rb +33 -0
  88. data/spec/imw/files/tar_spec.rb +31 -0
  89. data/spec/imw/files/text_spec.rb +23 -0
  90. data/spec/imw/files/zip_spec.rb +31 -0
  91. data/spec/imw/files_spec.rb +38 -0
  92. data/spec/imw/packagers/archiver_spec.rb +125 -0
  93. data/spec/imw/packagers/s3_mover_spec.rb +7 -0
  94. data/spec/imw/parsers/line_parser_spec.rb +96 -0
  95. data/spec/imw/parsers/regexp_parser_spec.rb +42 -0
  96. data/spec/imw/utils/extensions/file_core_spec.rb +72 -0
  97. data/spec/imw/utils/extensions/find_spec.rb +113 -0
  98. data/spec/imw/utils/paths_spec.rb +38 -0
  99. data/spec/imw/workflow/rip/local_spec.rb +89 -0
  100. data/spec/imw/workflow/rip_spec.rb +27 -0
  101. data/spec/rcov.opts +1 -0
  102. data/spec/spec.opts +4 -0
  103. data/spec/spec_helper.rb +32 -0
  104. data/spec/support/archive_contents_matcher.rb +94 -0
  105. data/spec/support/custom_matchers.rb +21 -0
  106. data/spec/support/directory_contents_matcher.rb +61 -0
  107. data/spec/support/extensions.rb +18 -0
  108. data/spec/support/file_contents_matcher.rb +50 -0
  109. data/spec/support/random.rb +210 -0
  110. data/spec/support/without_regard_to_order_matcher.rb +58 -0
  111. metadata +196 -0
@@ -0,0 +1,67 @@
1
+ require 'logger'
2
+
3
module IMW
  # Destination handed to Logger.new. Defaults to STDERR; a constant of the
  # same name that is already visible when this file loads takes precedence.
  LOG_FILE_DESTINATION = STDERR unless defined?(LOG_FILE_DESTINATION)
  # strftime pattern for the timestamp on each log line; overridable the
  # same way as LOG_FILE_DESTINATION.
  LOG_TIMEFORMAT = "%Y%m%d-%H:%M:%S " unless defined?(LOG_TIMEFORMAT)

  class << self; attr_accessor :log end

  #
  # Create a Logger and point it at LOG_FILE_DESTINATION
  #
  # LOG_FILE_DESTINATION is STDERR by default; redefine it in your
  # ~/.imwrc, or set IMW.log yourself, if that's not cool.
  #
  # A logger already assigned to IMW.log is kept; only its timestamp
  # format (LOG_TIMEFORMAT) and level (INFO) are applied again.
  #
  def self.instantiate_logger!
    IMW.log ||= Logger.new(LOG_FILE_DESTINATION)
    # Use the overridable constant rather than repeating the literal, so a
    # user-supplied LOG_TIMEFORMAT actually takes effect.
    IMW.log.datetime_format = LOG_TIMEFORMAT
    IMW.log.level = Logger::INFO
  end

  #
  # Write +events+ to IMW.log as one message, one event per line.
  #
  # A trailing options hash may carry :level (a Logger severity); it
  # defaults to Logger::INFO.
  #
  # NOTE(review): Array#extract_options! and Hash#reverse_merge! are not
  # defined in this file -- presumably supplied by IMW's core extensions.
  #
  def announce *events
    options = events.extract_options!
    options.reverse_merge! :level => Logger::INFO
    IMW.log.add options[:level], events.join("\n")
  end

  #
  # Announce each of +events+ on its own log line, framed above and below
  # by a row of 75 asterisks. Accepts the same trailing options as
  # +announce+.
  #
  def banner *events
    options = events.extract_options!
    options.reverse_merge! :level => Logger::INFO
    rule = "*" * 75
    [rule, events, rule].flatten.each { |ev| announce(ev, options) }
  end

  PROGRESS_TRACKERS = {}
  #
  # When the slowly-changing tracked variable +val+ changes value,
  # announce its new value. Always announces on first call.
  #
  # Ex:
  #   track_progress :indexing_names, name[0..0] # announce at each initial letter
  #   track_progress :files, (i / 1000)          # announce at each 1,000 iterations
  #
  # Returns +val+ when an announcement was made, nil otherwise.
  #
  def track_progress tracker, val
    seen = IMW::PROGRESS_TRACKERS.include?(tracker)
    return if seen && IMW::PROGRESS_TRACKERS[tracker] == val
    # Underscores in the tracker name become spaces in the log line.
    announce "#{tracker.to_s.gsub(/_/,' ')}: #{val}"
    IMW::PROGRESS_TRACKERS[tracker] = val
  end

  PROGRESS_COUNTERS = {}
  #
  # Log repetitions in a given context
  #
  # Counts calls per +tracker+ and reports, via +track_progress+, the count
  # rounded down to a multiple of +every+ (default 1000). The first call
  # therefore announces 0, and a new line appears each time another
  # +every+ calls have accumulated.
  #
  def track_count tracker, every=1000
    PROGRESS_COUNTERS[tracker] ||= 0
    PROGRESS_COUNTERS[tracker] += 1
    chunk = every * (PROGRESS_COUNTERS[tracker]/every).to_i
    track_progress "count_of_#{tracker}", chunk
  end
end

#
# Make the default logger
#
IMW.instantiate_logger!
@@ -0,0 +1,63 @@
1
module IMW
  # Format the current UTC time with IMW::STRFTIME_FORMAT (a constant
  # defined outside this file) and return the resulting string.
  def self.current_utc_time_string
    utc_now = Time.now.utc
    utc_now.strftime(IMW::STRFTIME_FORMAT)
  end


  # A simple counter: +value+ reads it, +add+ bumps it.
  #
  #   counter = IMW::Counter.new
  #   counter.value  #=> 0
  #   counter.add 1
  #   counter.value  #=> 1
  #
  # +next!+ behaves like C's <tt>value++</tt>: it hands back the value
  # as it was _before_ the increment is applied.
  #
  #   counter = IMW::Counter.new
  #   counter.value  #=> 0
  #   counter.next!  #=> 0
  #   counter.value  #=> 1
  #
  # +reset!+ puts the counter back where it started.
  #
  #   counter.reset!
  #   counter.value  #=> 0
  class Counter

    attr_accessor :value, :starting_value, :increment

    # Build a Counter beginning at +starting_value+ (default 0) that
    # moves by +increment+ (default 1) per step.
    def initialize starting_value=0,increment=1
      @increment      = increment
      @starting_value = starting_value
      @value          = starting_value
    end

    # Add +amount+ to the counter and return the new value. Without an
    # argument (or with nil/false) the step is <tt>@increment</tt>.
    def add amount=nil
      step = amount ? amount : @increment
      @value = @value + step
    end
    alias_method :add!, :add

    # Advance by <tt>@increment</tt>, returning the value held
    # _before_ the advance.
    def next!
      previous = @value
      @value = previous + @increment
      previous
    end

    # Set the counter to +value+, or to <tt>@starting_value</tt> when
    # +value+ is omitted (or nil/false). Returns the new value.
    def reset! value=nil
      @value = if value then value else @starting_value end
    end
  end
end
62
+
63
+ # puts "#{File.basename(__FILE__)}: Your Monkeywrench seems suddenly more utilisable." # at bottom
@@ -0,0 +1,115 @@
1
+ #
2
+ # h2. lib/imw/utils/paths.rb -- defines the path structure of IMW
3
+ #
4
+ # == About
5
+ #
6
+ # IMW uses lots of different directories to keep information on data
7
+ # and datasets separate. This module interfaces with the
8
+ # configuration files to establish the paths to these IMW directories
9
+ # and provides functions and mixins for IMW objects to use to access
10
+ # these paths.
11
+ #
12
+ # Author:: (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
13
+ # Copyright:: Copyright (c) 2008 infochimps.org
14
+ # License:: GPL 3.0
15
+ # Website:: http://infinitemonkeywrench.org/
16
+ #
17
+
18
+ module IMW
19
+
20
+ # Implements methods designed to work with an object's
21
+ # <tt>@paths</tt> attributes, adding and deleting symbolic
22
+ # references to paths and expanding calls to +path_to+ from that
23
+ # attribute or (when a miss) from <tt>IMW::PATHS</tt>.
24
+ #
25
+ # An including class should therefore define an array attribute
26
+ # <tt>@paths</tt>.
27
+ module Paths
28
+
29
+ # Expands a shorthand workflow path specification to an
30
+ # actual file path.
31
+ #
32
+ # add_path :mlb_08, 'gd2.mlb.com/components/game/mlb/year_2008'
33
+ # path_to :ripd, :mlb_08, 'month_06', 'day_08', 'miniscoreboard.xml'
34
+ # => (...)/data/ripd/gd2.mlb.com/components/game/mlb/year_2008/month_06/day_08/miniscoreboard.xml
35
+ def path_to *pathsegs
36
+ begin
37
+ path = Pathname.new path_to_helper(*pathsegs)
38
+ path.absolute? ? File.expand_path(path) : path.to_s
39
+ rescue Exception => e
40
+ raise("Can't find path to '#{pathsegs}': #{e}");
41
+ end
42
+ end
43
+
44
+ private
45
+ def path_to_helper *pathsegs # :nodoc:
46
+ # +path_to_helper+ handles the recursive calls for +path_to+.
47
+ expanded = pathsegs.flatten.compact.map do |pathseg|
48
+ case
49
+ when pathseg.is_a?(Symbol) && @paths.include?(pathseg) then path_to(@paths[pathseg])
50
+ when pathseg.is_a?(Symbol) && IMW::PATHS.include?(pathseg) then path_to(IMW::PATHS[pathseg])
51
+ when pathseg.is_a?(Symbol) then raise IMW::PathError.new("No path expansion set for #{pathseg.inspect}")
52
+ else pathseg
53
+ end
54
+ end
55
+ File.join(*expanded)
56
+ end
57
+ public
58
+
59
+ # Adds a symbolic path for expansion by +path_to+.
60
+ def add_path sym, *pathsegs
61
+ @paths[sym] = pathsegs.flatten
62
+ end
63
+
64
+ # Removes a symbolic path for expansion by +path_to+.
65
+ def remove_path sym
66
+ @paths.delete sym if @paths.include? sym
67
+ end
68
+ end
69
+
70
+ class Dataset
71
+ attr_reader :paths
72
+ include IMW::Paths
73
+
74
+ private
75
+ def set_paths
76
+ @paths = {}
77
+ add_path :self, File.dirname(eval('__FILE__'))
78
+ end
79
+ end
80
+
81
+ def self.path_to *pathsegs
82
+ begin
83
+ path = Pathname.new IMW.path_to_helper(*pathsegs)
84
+ path.absolute? ? File.expand_path(path) : path.to_s
85
+ rescue Exception => e
86
+ raise("Can't find path to '#{pathsegs}': #{e}");
87
+ end
88
+ end
89
+
90
+ private
91
+ def self.path_to_helper *pathsegs # :nodoc:
92
+ # +path_to_helper+ handles the recursive calls for +path_to+.
93
+ expanded = pathsegs.flatten.compact.map do |pathseg|
94
+ case
95
+ when pathseg.is_a?(Symbol) && IMW::PATHS.include?(pathseg) then path_to(IMW::PATHS[pathseg])
96
+ when pathseg.is_a?(Symbol) then raise IMW::PathError.new("No path expansion set for #{pathseg.inspect}")
97
+ else pathseg
98
+ end
99
+ end
100
+ File.join(*expanded)
101
+ end
102
+ public
103
+
104
+ # Adds a symbolic path for expansion by +path_to+.
105
+ def self.add_path sym, *pathsegs
106
+ IMW::PATHS[sym] = pathsegs.flatten
107
+ end
108
+
109
+ # Removes a symbolic path for expansion by +path_to+.
110
+ def self.remove_path sym
111
+ IMW::PATHS.delete sym if IMW::PATHS.include? sym
112
+ end
113
+ end
114
+
115
+ # puts "#{File.basename(__FILE__)}: Your monkeywrench glows alternately dim then bright as you wander, suggesting to you which paths to take."
@@ -0,0 +1,59 @@
1
+ require 'imw/utils'
2
+ require 'imw/utils/uuid'
3
+ require 'addressable/uri'
4
module Addressable
  #
  # Add the validity predicates, #revhost and #url_uuid calls
  #
  class URI
    # Character-class fragments kept as Regexp objects, unchanged, for
    # anything that already refers to them.
    SAFE_CHARS     = %r{a-zA-Z0-9\-\._!\(\)\*\'}
    PATH_CHARS     = %r{#{SAFE_CHARS}\$&\+,:=@\/;}
    RESERVED_CHARS = %r{\$&\+,:=@\/;\?\%}
    UNSAFE_CHARS   = %r{\\ \"\#<>\[\]\^\`\|\~\{\}}
    HOST_HEAD      = '(?:[a-z0-9\-]+\.)+'
    HOST_TLD       = '(?:[a-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|jobs|museum)'

    # Plain-string form of the path character set. Interpolating the
    # Regexp constants into [...] would drag their "(?-mix:" wrapper into
    # the class, and its "?-m" range admits ^ [ ] \ ` and ?.
    PATH_CHAR_CLASS = SAFE_CHARS.source + '\$&\+,:=@\/;'
    # Whole-string matcher for a path of permitted characters or '%'.
    VALID_PATH_RE   = %r{\A[#{PATH_CHAR_CLASS}%]*\z}

    # True when +host+ is one or more dot-terminated labels followed by a
    # two-letter or listed TLD (case-insensitive). False for a nil host.
    def host_valid?
      !!(host =~ %r{\A#{HOST_HEAD}#{HOST_TLD}\z}i)
    end

    # True when +path+ consists solely of safe characters, the path
    # punctuation <tt>$&+,:=@/;</tt> and '%'. An empty path is valid;
    # a nil path is not.
    def path_valid?
      !!(path =~ VALID_PATH_RE)
    end

    # True when the scheme is http or absent, the port is 80 or absent,
    # and no user or password is present.
    #
    # NOTE(review): relies on #to_hash (Addressable) and String#blank?,
    # neither of which is defined in this file.
    def simple_connection_part?
      ( ['http', nil].include?(scheme) &&
        [80,     nil].include?(port)   &&
        (self.to_hash.values_at(:password, :user).join.blank?) )
    end

    #
    # Does this look like a simple URL: a valid host, a valid path, and
    # a plain connection part (see #simple_connection_part?)
    #
    def simple?
      host_valid? && path_valid? && simple_connection_part?
    end

    #
    # +revhost+
    # the dot-reversed host:
    #   foo.company.com => com.company.foo
    #
    # A host without a dot (or a nil host) is returned unchanged.
    #
    def revhost
      return host unless host =~ /\./
      host.split('.').reverse.join('.')
    end

    #
    # +url_uuid+ -- SHA1 name-based uuid (RFC-4122 ver.5) of the
    # normalized form of this URI, within UUID_URL_NAMESPACE.
    #
    # See
    #   http://www.faqs.org/rfcs/rfc4122.html
    #
    # NOTE(review): UUID.sha1_create and UUID_URL_NAMESPACE are not defined
    # in this file -- presumably supplied by uuidtools.
    #
    def url_uuid
      UUID.sha1_create(UUID_URL_NAMESPACE, self.normalize.to_s)
    end
  end
end
55
+
56
class << Addressable::URI
  # Provide the *_segment class-method names as aliases of the
  # *_component ones, but only where the class does not already respond to
  # them. method_defined? asks the singleton class about its own methods;
  # a bare defined?(name) here would test a call on the singleton class
  # object itself and so never detect an existing method.
  alias_method :encode_segment,   :encode_component   unless method_defined?(:encode_segment)
  alias_method :unencode_segment, :unencode_component unless method_defined?(:unencode_segment)
end
@@ -0,0 +1,33 @@
1
+ require 'uuidtools'
2
+
3
class UUID

  #
  # A string suitable for using as a path name: 'urn_uuid/' followed by
  # +to_s+ with every ':' and '-' turned into '/'.
  #
  # Ex. (for a to_s of 3c0dce44-80a8-11dd-a897-001ff35a0a8b)
  #    urn:uuid:3c0dce44-80a8-11dd-a897-001ff35a0a8b =>
  #    urn_uuid/3c0dce44/80a8/11dd/a897/001ff35a0a8b
  #
  # It's well possible there are more perspicacious choices for points to split
  # the string, but until we hit that limit this'll do.
  #
  def to_path
    'urn_uuid/' + to_s.gsub(/[\:\-]/,'/')
  end

  #
  # Insert dashes into a run of 32 lowercase hex digits, giving the
  # 8-4-4-4-12 grouping. The first such run found in +str+ is used.
  #
  # Raises ArgumentError when +str+ contains no such run (previously an
  # incidental NoMethodError on nil).
  #
  def self.hex_to_str str
    groups = /([\da-f]{8})([\da-f]{4})([\da-f]{4})([\da-f]{4})([\da-f]{12})/.match(str)
    raise ArgumentError, "not a 32-digit hex uuid: #{str.inspect}" unless groups
    groups.captures.join '-'
  end


  # Parse a dash-less 32-digit hex string by dashing it and handing it to
  # +parse+ (not defined in this file; presumably from uuidtools).
  def self.parse_hex str
    parse(UUID.hex_to_str(str))
  end

  # Overrides UUIDTools -- force 32 hex digits (leading zeros)
  def hexdigest
    "%032x" % self.to_i
  end

end
@@ -0,0 +1,38 @@
1
+
2
+
3
# Return true if <tt>email</tt> is a valid email address
#
# "Valid" here means: exactly one '@'; a non-empty local part made only of
# letters, digits and <tt>_ ~ = + - .</tt> that neither starts nor ends
# with a dot nor contains two dots in a row; and a domain part accepted by
# +is_domain?+.
#
# Raises ArgumentError when +email+ is not a String.
def is_email?(email)
  raise ArgumentError, "'email' must be a string" unless email.is_a?(String)
  return false if email.empty?

  # A negative limit keeps trailing empty fields, so "a@b.com@" yields
  # three parts and is rejected instead of silently losing the last '@'.
  parts = email.split('@', -1)
  return false if parts.size != 2

  local, domain = parts
  # Anchored to the whole string and requiring at least one character; '-'
  # is escaped so it is a literal rather than forming a range.
  return false unless local =~ /\A[a-zA-Z0-9_~=+.\-]+\z/  # allowed characters
  return false if local[0,1]  == '.'                      # starts with .
  return false if local[-1,1] == '.'                      # end with .
  return false if local.include?('..')                    # can't repeat .

  return false if not is_domain?(domain)

  return true
end
22
+
23
# Return true if <tt>domain</tt> is a valid domain name
#
# Checks performed: non-empty, at most 255 characters, only letters,
# digits, '.' and '-' across the whole string, at most 127 dot-separated
# parts, and no part longer than 63 characters.
#
# Raises ArgumentError when +domain+ is not a String.
def is_domain?(domain)
  raise ArgumentError, "'domain' must be a string" unless domain.is_a?(String)
  return false if domain.empty?

  return false if domain.size > 255                         # max number of characters in a domain
  # \A and \z anchor to the whole string; ^ and $ would accept any input
  # that merely has one acceptable line in it.
  return false unless domain =~ /\A[a-zA-Z0-9.\-]+\z/       # allowed characters
  parts = domain.split('.')
  return false if parts.size > 127                          # max number of subdomains
  return false if parts.any? { |part| part.size > 63 }      # max number of characters in a subdomain

  return true
end
36
+
37
+
38
+ # puts "#{File.basename(__FILE__)}: As you shape your body to the confines of your container you feel a tremendous sense of validation." # at bottom
@@ -0,0 +1,12 @@
1
+ # copied from activewarehouse-etl gem
2
module IMWVersion #:nodoc:
  # Skipped entirely when a constant named VERSION is already visible from
  # this scope.
  unless defined?(VERSION)
    # Version components and their dotted "MAJOR.MINOR.TINY" string.
    # NOTE(review): all three components are zero -- confirm whether this
    # is meant to track the released gem version.
    module VERSION #:nodoc:
      MAJOR, MINOR, TINY = 0, 0, 0

      STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
    end
  end
end
@@ -0,0 +1,113 @@
1
+
2
class ActiveRecord::Base
  # Earlier draft of a reflection-driven merge, kept for reference:
  #
  # def merge!(hsh)
  #   hsh = hsh.dup
  #   # puts hsh.to_yaml
  #   # has_many datasets, notes, fields, contributors
  #   self.class.reflect_on_all_associations.each do |ass|
  #     # ["@macro", "@class_name", "@name", "@primary_key_name", "@options",
  #     # "@klass",
  #     # "@through_reflection",
  #     # "@active_record",
  #     puts [ass.name, ass.macro, ass.primary_key_name].to_yaml
  #     if ass.macro == :has_many
  #       els = hsh.delete(ass.name.to_s) || []
  #       puts "!!!!!!!!!!!!!!!!!!!!!!!!!!", els, '!!'
  #       els.each do |el|
  #         puts el
  #         self[ass.name] = ass.klass.new().merge!(el)
  #       end
  #     end
  #     hsh.each do |key,val|
  #       self[key] = val
  #     end
  #     p self
  #     p self.datasets if self.respond_to? 'datasets'
  #   end
  # end

  # Load every key/value pair of +hsh+ into this record through
  # <tt>self[key] = value</tt>, save it with +save!+ (which raises when the
  # save fails), and return the record.
  #
  # Prints the incoming hash as JSON to standard output first.
  def undump(hsh)
    puts "undumping from #{hsh.to_json}"
    hsh.each{ |k,v| self[k] = v }
    self.save!
    self
  end
end
37
+
38
class Pool < ActiveRecord::Base
  # Rebuild this pool from +hsh+.
  #
  # The 'datasets', 'fields', 'contributors' and 'pool_notes' entries
  # (string keys only) are each treated as a list of hashes: every element
  # is undumped into a new instance of the matching class, and the
  # resulting list is assigned via <tt>self[name]</tt>. Whatever remains in
  # the hash is then loaded by the inherited +undump+, which also saves.
  #
  # Returns the pool. The caller's hash is left untouched.
  def undump(hsh)
    # Work on a shallow copy so the deletes below do not strip keys from
    # the hash the caller passed in.
    hsh = hsh.dup
    { :datasets     => Dataset,     :fields     => Field,
      :contributors => Contributor, :pool_notes => PoolNote }.each do |field, klass|
      vals = hsh.delete(field.to_s) || []
      puts "Undumping #{vals} info #{field}"
      self[field.to_s] = vals.map{|val| f = klass.new().undump(val); f.save!; f}
    end
    # Bare super forwards the current +hsh+, i.e. the trimmed copy.
    super
    self
  end
end
50
+
51
class Dataset < ActiveRecord::Base
  # Rebuild this dataset from +hsh+.
  #
  # The 'datasets', 'fields', 'contributors' and 'dataset_notes' entries
  # (string keys only) are each treated as a list of hashes: every element
  # is undumped into a new instance of the matching class, and the
  # resulting list is assigned via <tt>self[name]</tt>. Whatever remains in
  # the hash is then loaded by the inherited +undump+, which also saves.
  #
  # Prints the finished record as YAML and returns it. The caller's hash is
  # left untouched.
  def undump(hsh)
    # Work on a shallow copy so the deletes below do not strip keys from
    # the hash the caller passed in.
    hsh = hsh.dup
    { :datasets     => Dataset,     :fields        => Field,
      :contributors => Contributor, :dataset_notes => DatasetNote }.each do |field, klass|
      vals = hsh.delete(field.to_s) || []
      puts "Undumping #{vals} info #{field}"
      self[field.to_s] = vals.map{|val| f = klass.new().undump(val); f.save!; f}
    end
    # Bare super forwards the current +hsh+, i.e. the trimmed copy.
    super
    puts "Got Dataset #{self.to_yaml}"
    self
  end
end
64
+
65
class IMW < OpenStruct

  class << self
    #
    # Intended to take an Infochimps Stupid Schema stream and construct
    # the corresponding objects. The body is currently empty, so the call
    # does nothing and returns nil.
    #
    # The rules the loader is meant to follow:
    #
    # * The schema has the structure
    #     # this has to be first.
    #     - infochimps_schema:
    #         schema_version: 0.2      # in case stuff changes
    #     # then any number of imw objects:
    #     - pool: (...)
    #         fields: [era, innings_pitched, ...]
    #     - dataset: (...)
    #         fields:
    #           - name:    Earned Run Average
    #             handle:  era
    #             concept: baseball-era
    #             units:   earned_runs / (9*innings_pitched)
    #     - contributor: (...)
    #     - field: (...)
    #
    # * Objects are referred to by __handle__, *NOT* __id__. If an ID is
    #   included, and an object exists with a non-matching ID or handle,
    #   an error will be raised.
    #
    # * Schema files should be maintainable by hand, so the loader tries
    #   to be smart about inline-defined objects: you can either refer
    #   (via handle) to a field defined elsewhere, or define the field in
    #   whole and trust that the Right Thing will happen. That raises the
    #   problem of collisions -- when a bulk update arrives, whom do we
    #   believe, bulk loader or database? In the absence of versioning the
    #   object is looked up by its handle. If one already exists, any new
    #   information (fields with values in new that are blank in old) is
    #   added to it. An object defined at the top level wins; an object
    #   defined as a sub field loses.
    #
    # * Every interesting object (Pool, Dataset, Contributor, Field) has
    #   a desc: attribute (for Pool and Dataset it's virtual but never
    #   mind) to describe __itself__. Additionally, every interesting
    #   relationship has its own desc: field.
    #
    def undump(schema)
      # compact then merge -- kill off blank
    end
  end
end