imw 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
@@ -27,15 +27,13 @@ module IMW
27
27
  load(&block)
28
28
  end
29
29
 
30
- # Dump the +data+ into this resource. It must be opened for
30
+ # Emit the +data+ into this resource. It must be opened for
31
31
  # writing.
32
32
  #
33
- # @param [Hash, String, Array, Fixnum] data the Ruby object to dump
34
- # @option options [true, false] :persist (false) Don't close the IO object after writing
35
- def dump data, options={}
33
+ # @param [Hash, String, Array, Fixnum] data the Ruby object to emit
34
+ def emit data, options={}
36
35
  require 'json'
37
36
  write(data.to_json)
38
- io.close unless options[:persist]
39
37
  self
40
38
  end
41
39
  end
@@ -0,0 +1,71 @@
1
+ module IMW
2
+ module Formats
3
+
4
+ # Defines methods for parsing and generating PDF.
5
+ #
6
+ # Uses PDF::Reader for parsing and Prawn for generating.
7
+ module Pdf
8
+
9
+ # Return a snippet of text from this PDF.
10
+ #
11
+ # @return [String]
12
+ def snippet
13
+ begin
14
+ require 'pdf/reader'
15
+ snippetizer = Snippetizer.new
16
+ PDF::Reader.file(path, snippetizer)
17
+ snippetizer.snippet
18
+ rescue Snippetizer::SnippetEndError
19
+ snippetizer.snippet
20
+ rescue
21
+ ''
22
+ end
23
+ end
24
+
25
+ # A receiver class used by PDF::Reader which agglomerates text
26
+ # up to 1024 bytes and then bails.
27
+ class Snippetizer
28
+
29
+ # A custom error class that can be thrown while receiving text
30
+ # from PDF::Reader to cut-short walking large PDF documents.
31
+ SnippetEndError = Class.new(IMW::Error)
32
+
33
+ # The snippet being built by this snippetizer.
34
+ attr_accessor :snippet
35
+
36
+ def initialize
37
+ @snippet = ''
38
+ end
39
+
40
+ # Agglomerates text from PDF::Reader up to a fixed size of
41
+ # 1024 bytes.
42
+ #
43
+ # Will convert a single-space line from PDF::Reader into a
44
+ # newline character.
45
+ #
46
+ # FIXME How does the receiver ask PDF::Reader to abort walking
47
+ # the document now that enough text has been returned? Till a
48
+ # more graceful way is found this method simply raises an
49
+ # error, creating a GOTO...
50
+ def show_text *params
51
+ params.each do |string|
52
+ if @snippet.size < 1024
53
+ if string == ' '
54
+ @snippet += "\n"
55
+ else
56
+ @snippet += string[0..1024]
57
+ end
58
+ else
59
+ raise SnippetEndError.new
60
+ end
61
+ end
62
+ end
63
+ alias_method :show_text_with_positioning, :show_text
64
+ alias_method :move_to_next_line_and_show_text, :show_text
65
+ alias_method :set_spacing_next_line_show_text, :show_text
66
+ end
67
+
68
+ end
69
+ end
70
+ end
71
+
@@ -27,15 +27,13 @@ module IMW
27
27
  load(&block)
28
28
  end
29
29
 
30
- # Dump the +data+ into this resource. It must be opened for
30
+ # Emit the +data+ into this resource. It must be opened for
31
31
  # writing.
32
32
  #
33
- # @param [Hash, String, Array, Fixnum] data the Ruby object to dump
34
- # @option options [true, false] :persist (false) Don't close the IO object after writing
35
- def dump data, options={}
33
+ # @param [Hash, String, Array, Fixnum] data the Ruby object to emit
34
+ def emit data, options={}
36
35
  require 'yaml'
37
36
  write(data.to_yaml)
38
- io.close unless options[:persist]
39
37
  self
40
38
  end
41
39
  end
@@ -0,0 +1,66 @@
1
+ module IMW
2
+
3
+ # A collection of classes for describing the metadata associated
4
+ # with a dataset's fields.
5
+ class Metadata < Hash
6
+
7
+ autoload :Field, 'imw/metadata/field'
8
+ autoload :Schema, 'imw/metadata/schema'
9
+ autoload :Schematized, 'imw/metadata/schematized'
10
+ autoload :DSL, 'imw/metadata/dsl'
11
+ autoload :ContainsMetadata, 'imw/metadata/contains_metadata'
12
+
13
+ # The resource this Metadata is anchored to.
14
+ #
15
+ # This attribute is useful for letting relative paths in a
16
+ # schema file refer to a common base URL.
17
+ #
18
+ # @return [IMW::Resource]
19
+ attr_reader :base
20
+
21
+ # Set the resource this Metadata is anchored to.
22
+ #
23
+ # @param [IMW::Resource, String, Addressable::URI] new_base
24
+ def base= new_base
25
+ base_resource = IMW.open(new_base)
26
+ base_resource.should_exist!("Metdata base directory must exist")
27
+ raise IMW::PathError.new("Metadata base must be a directory") unless base_resource.is_directory?
28
+ @base = base_resource
29
+ end
30
+
31
+ def initialize obj=nil, options={}
32
+ super()
33
+ self.base = options[:base] if options[:base]
34
+ obj.each_pair { |resource, schema| self[resource] = Schema.new(schema) } if obj
35
+ end
36
+
37
+ def self.load metadata_resource, options
38
+ resource = IMW.open(metadata_resource)
39
+ new(resource.load, {:base => resource.dirname}.merge(options))
40
+ end
41
+
42
+ def []= resource_spec, schema_spec
43
+ schema = schema_spec.is_a?(Schema) ? schema_spec : Schema.new(schema_spec)
44
+ super(absolute_uri(resource_spec), schema_spec)
45
+ end
46
+
47
+ def [] resource_spec
48
+ super(absolute_uri(resource_spec))
49
+ end
50
+
51
+ def describe? resource_spec
52
+ has_key?(absolute_uri(resource_spec))
53
+ end
54
+
55
+ protected
56
+
57
+ def absolute_uri resource_spec
58
+ if base && resource_spec.to_s !~ %r{(^/|://)} # relative path
59
+ base.join(resource_spec).to_s
60
+ else
61
+ resource_spec.to_s
62
+ end
63
+ end
64
+
65
+ end
66
+ end
@@ -0,0 +1,44 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+ # A module that can be mixed into any class defining a +contents+
5
+ # method which returns an Array of URI strings.
6
+ module ContainsMetadata
7
+
8
+ # The path at which this resource's metadata file lives.
9
+ #
10
+ # Will default to any file beginning with +metadata+ and ending
11
+ # with a +yaml+ or +json+ extension contained in this resource's
12
+ # +contents+.
13
+ #
14
+ # @return [String, nil]
15
+ def metadata_uri
16
+ @metadata_uri ||= contents.detect { |path| path =~ /metadata.*\.(ya?ml|json)$/ }
17
+ end
18
+
19
+ # Explicitly set the path to the metadata for this resource.
20
+ attr_writer :metadata_uri
21
+
22
+ # Does this resource contain metadata for other resources it
23
+ # contains?
24
+ #
25
+ # @return [true, false]
26
+ def metadata?
27
+ (!! metadata_uri)
28
+ end
29
+
30
+ # Return the metadata for this resource.
31
+ #
32
+ # @return [IMW::Metadata, nil]
33
+ def metadata
34
+ @metadata ||= metadata? && IMW::Metadata.load(metadata_uri)
35
+ end
36
+
37
+ # Explicitly set the metadata for this resource.
38
+ attr_writer :metadata
39
+
40
+ end
41
+ end
42
+ end
43
+
44
+
@@ -0,0 +1,111 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+ # A module which defines a DSL that can be used to define metadata
5
+ # for an object.
6
+ module DSL
7
+
8
+ # Open a new resource at the given URI.
9
+ #
10
+ # If this dataset has metadata and it describes the resource
11
+ # then configure the resource to understand its schema.
12
+ #
13
+ # The +schema+ property passed via the options hash will
14
+ # override this.
15
+ #
16
+ # @param [String, Addressable::URI, IMW::Resource] uri
17
+ # @param [Hash] options
18
+ # @return [IMW::Resource]
19
+ # @see IMW.open
20
+ def open uri, options={}, &block
21
+ schema_options = (options[:schema].nil? && metadata && metadata.describe?(uri)) ? {:schema => metadata[uri]} : {}
22
+ IMW.open(uri, options.merge(schema_options), &block)
23
+ end
24
+
25
+ def open! uri, options={}, &block
26
+ self.open(uri, options.merge(:mode => 'w'), &block)
27
+ end
28
+
29
+ # When called without a block return this object's metadata.
30
+ #
31
+ # metadata
32
+ # #=> { '/path/to/file' => [...], '/path/to/other/file' => [...], ... }
33
+ #
34
+ # When called with a block, accumulate schema and fields into
35
+ # this object's metadata
36
+ #
37
+ # metadata do
38
+ #
39
+ # schema "/path/to/file" do
40
+ # # ...
41
+ # end
42
+ #
43
+ # schema "/path/to/other/file" do
44
+ # # ...
45
+ # end
46
+ # end
47
+ #
48
+ # @see [IMW::Metadata::Schema]
49
+ # @see [IMW::Metadata::Field]
50
+ # @return [IMW::Metadata]
51
+ def metadata arg=nil, options={}, &block
52
+ case arg
53
+ when Hash
54
+ @metadata ||= Metadata.new(arg, options)
55
+ when nil
56
+ @metadata ||= Metadata.new nil, options
57
+ else
58
+ @metadata ||= Metadata.load(arg, options)
59
+ end
60
+ @metadata.base = options[:base] if options[:base]
61
+ return @metadata unless block_given?
62
+ yield
63
+ end
64
+
65
+ def schema resource, options={}, &block
66
+ new_field_accumulator!
67
+ yield
68
+ metadata[resource] = Schema.new(last_field_accumulator!)
69
+ end
70
+
71
+ def field name, options={}
72
+ accumulate_field Field.new(options.merge(:name => name))
73
+ end
74
+
75
+ def has_one name, options={}, &block
76
+ new_field_accumulator!
77
+ yield
78
+ accumulate_field Field.new(options.merge(:name => name, :has_one => last_field_accumulator!))
79
+ end
80
+
81
+ def has_many name, options={}, &block
82
+ new_field_accumulator!
83
+ yield
84
+ accumulate_field Field.new(options.merge(:name => name, :has_many => last_field_accumulator!))
85
+ end
86
+
87
+ protected
88
+
89
+ def field_accumulators # :nodoc:
90
+ @field_accumulators ||= []
91
+ end
92
+
93
+ def new_field_accumulator! # :nodoc:
94
+ field_accumulators.push([])
95
+ end
96
+
97
+ def last_field_accumulator! # :nodoc:
98
+ field_accumulators.pop
99
+ end
100
+
101
+ def field_accumulator? # :nodoc:
102
+ ! field_accumulators.empty?
103
+ end
104
+
105
+ def accumulate_field f # :nodoc:
106
+ # raise IMW::SchemaError.new("No record or sub-record to accumulate fields in!") unless field_accumulator?
107
+ field_accumulators.last << f if field_accumulator?
108
+ end
109
+ end
110
+ end
111
+ end
@@ -0,0 +1,65 @@
1
+ module IMW
2
+
3
+ class Metadata
4
+
5
+ # Conceptually, a field is a "slot" for which "records" can have
6
+ # values.
7
+ #
8
+ # An IMW::Metadata::Field is essentially a Hash that has one required
9
+ # property: a name.
10
+ #
11
+ # IMW::Metadata::Field.new('id')
12
+ # #=> { 'name' => 'id' }
13
+ #
14
+ # But you can declare as many other properties as you want (as long
15
+ # as you include a +name+):
16
+ #
17
+ # IMW::Metadata::Field.new 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented."
18
+ # #=> { 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented." }
19
+ #
20
+ # Some properties make a field special:
21
+ #
22
+ # <tt>has_many</tt>::
23
+ # Denotes that this record is in a "has_many" relationship with
24
+ # one or more other records. The corresponding value should be
25
+ # an array
26
+ #
27
+ # <tt>has_one</tt>::
28
+ # Denotes that this record is in a "has_one" relationship with
29
+ # one or more other records. The corresponding value should be
30
+ # an Array in which each key names the joined record and each
31
+ # value is an Array of fields describing the joined record.
32
+ #
33
+ # @see IMW::Metadata::Record for more usage of the
34
+ # <tt>:has_many</tt> and <tt>:has_one</tt> properties.
35
+ class Field < Hash
36
+
37
+ def initialize obj
38
+ super()
39
+ if obj.is_a?(Hash) || obj.is_a?(Field)
40
+ merge!(obj)
41
+ raise IMW::ArgumentError.new("A field must have a name") if obj['name'].blank?
42
+ else
43
+ self['name'] = obj.to_s.strip
44
+ end
45
+ end
46
+
47
+ def hierarchical?
48
+ has_key?('has_many') || has_key?('has_one')
49
+ end
50
+ alias_method :nested?, :hierarchical?
51
+
52
+ def flat?
53
+ ! hierarchical?
54
+ end
55
+
56
+ def titleize
57
+ self['title'] || self['name'].capitalize # FIXME we can do better than this!
58
+ end
59
+
60
+ def associations
61
+ end
62
+
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,227 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+ # A class to describe the schema of a resource.
5
+ #
6
+ # A Schema is built on top of an Array because it is often
7
+ # important to have an ordering for a record's fields.
8
+ #
9
+ # For fields with no such ordering, an Array also works because
10
+ # each of its elements will be a field with a +name+ that can be
11
+ # used to index the corresponding field.
12
+ #
13
+ # A Schema is instantiated with a basic Ruby data structure.
14
+ #
15
+ # == Tabular Data
16
+ #
17
+ # Tabular data formats (CSV, TSV, &c.) contain flat records
18
+ # consisting of repeating rows with the same fields in the same
19
+ # position. A sample of delimited data looks like
20
+ #
21
+ # ID,Name,Genus,Species
22
+ # 001,Gray-bellied Night Monkey,Aotus,lemurinus
23
+ # 002,Panamanian Night Monkey,Aotus,zonalis
24
+ # 003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
25
+ # 004,Gray-handed Night Monkey,Aotus,griseimembra
26
+ # 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
27
+ # ...
28
+ #
29
+ # The schema of these records is summarized as a Ruby data
30
+ # structure in the following way
31
+ #
32
+ # [
33
+ # { :name => :id, :type => :integer },
34
+ # { :name => :name, :type => :string, :title => "Common Name" },
35
+ # { :name => :genus, :type => :string, :title => "Genus" },
36
+ # { :name => :species, :type => :string, :title => "Species" }
37
+ # ]
38
+ #
39
+ # The outer-most Array represents each row and each Hash in the
40
+ # Array represents one of the fields in a row. A Schema
41
+ # initialized with the above Ruby code can be thought of and
42
+ # played with as an Array of Hashes even though it really is a
43
+ # Schema object of Field objects.
44
+ #
45
+ # == Hierarchical Data
46
+ #
47
+ # Hierarchical data formats (JSON, YAML, XML, &c.) can have
48
+ # arbitrarily complex records with fields within fields and so on.
49
+ # A sample of hierarchical XML data looks like
50
+ #
51
+ # <genera>
52
+ # <genus>
53
+ # <name>Mandrillus</name>
54
+ # <species>
55
+ # <species id="113">
56
+ # <name>sphinx</name>
57
+ # <common_name>Mandrill</common_name>
58
+ # </species>
59
+ # <species id="114">
60
+ # <name>leucophaeus</name>
61
+ # <common_name>Drill</common_name>
62
+ # </species>
63
+ # </species>
64
+ # </genus>
65
+ # <genus>
66
+ # <name>Rungwecebus</name>
67
+ # <species>
68
+ # <species id="100">
69
+ # <name>kipunji</name>
70
+ # <common_name>Kipunji</common_name>
71
+ # </species>
72
+ # </species>
73
+ # </genus>
74
+ #
75
+ # These records are described by the following Ruby data structure
76
+ #
77
+ # [
78
+ # { :name => :genera,
79
+ # :has_many => [
80
+ # { :name => 'name', :type => :string, :title => "Genus" },
81
+ # { :name => 'species',
82
+ # :has_many => [
83
+ # { :name => :id, :type => :integer },
84
+ # { :name => :name, :type => :string, :title => "Species" },
85
+ # { :name => :common_name, :type => :string, :title => "Common Name" }
86
+ # ]
87
+ # }
88
+ # ]
89
+ # }
90
+ # ]
91
+ #
92
+ # By IMW convention, the outer-most element of the Schema is still
93
+ # an Array describing a collection of identical records even
94
+ # though XML data must have a single root node, limiting the
95
+ # collection to a single record.
96
+ #
97
+ # The first field of the Schema is named +genera+ and it uses the
98
+ # special field property +has_many+ to denote that the field
99
+ # points to a collection of sub-records.
100
+ #
101
+ # Each of these sub-records has its own sub-schema defined by the
102
+ # Array that the +has_many+ property keys to. In this case, the
103
+ # two fields are +name+ and +species+. +name+ is a simple String
104
+ # value while +species+ itself points at another collection of
105
+ # objects.
106
+ #
107
+ # This second-level nested record (a particular species) is itself
108
+ # composed of the three (flat) fields +id+, +name+, and
109
+ # +common_name+. Note that the Schema doesn't know (or care) that
110
+ # the +id+ field is contained in an XML attribute while the +name+
111
+ # and +common_name+ fields are contained as text within daughter
112
+ # nodes.
113
+ #
114
+ # A different way of structuring the same information, this time
115
+ # expressed in YAML:
116
+ #
117
+ # ---
118
+ # Mandrillus:
119
+ # - :species: sphinx
120
+ # :name: Mandrill
121
+ # :id: "113"
122
+ # - :species: leucophaeus
123
+ # :name: Drill
124
+ # :id: "114"
125
+ # Rungwecebus:
126
+ # - :species: kipunji
127
+ # :name: Kipunji
128
+ # :id: "100"
129
+ #
130
+ # Would lead to a different Schema
131
+ #
132
+ # [
133
+ # { :name => :genus, :title => "Genus",
134
+ # :has_many => [
135
+ # { :name => :id, :type => :integer },
136
+ # { :name => :name, :type => :string, :title => "Common Name" },
137
+ # { :name => :species, :type => :string, :title => "Species" }
138
+ # ]
139
+ # }
140
+ # ]
141
+ #
142
+ # Where the unnecessary outer wrapper field +genera+ has been
143
+ # dispensed with.
144
+ #
145
+ # In addition to "has many" relationships a record can have a
146
+ # "has_one" relationship. The above data might be expressed
147
+ #
148
+ # ---
149
+ # Mandrillus:
150
+ # - species: sphinx
151
+ # name: Mandrill
152
+ # id: "113"
153
+ # discoverer:
154
+ # name: Dr. Monkeypants
155
+ # year: 1838
156
+ # - species: leucophaeus
157
+ # name: Drill
158
+ # id: "114"
159
+ # discoverer:
160
+ # name: Ms. Cecelia Apefingers
161
+ # year: 1921
162
+ #
163
+ # would result in the following Schema:
164
+ #
165
+ # [
166
+ # { :name => :genus, :title => "Genus",
167
+ # :has_many => [
168
+ # { :name => :id, :type => :integer },
169
+ # { :name => :name, :type => :string, :title => "Common Name" },
170
+ # { :name => :species, :type => :string },
171
+ # { :name => :discoverer,
172
+ # :has_one => [
173
+ # { :name => 'name', :type => :string },
174
+ # { :name => 'year', :type => :integer }
175
+ # ]
176
+ # }
177
+ # ]
178
+ # }
179
+ # ]
180
+ #
181
+ # The +discoverer+ field is marked as +has_one+ which means the
182
+ # +name+ and +year+ fields in the corresponding Array will be
183
+ # interpreted as fields in a single attached sub-record.
184
+ #
185
+ # == Compact Schemas
186
+ #
187
+ # The internal hashes in a Schema specification are really Field
188
+ # objects and the initializer will promote Strings and Symbols to
189
+ # Field objects automatically. This means that the above Schema
190
+ # specification could be replaced by
191
+ #
192
+ # [
193
+ # { :name => :genus,
194
+ # :has_many => [
195
+ # :id,
196
+ # :name,
197
+ # :species,
198
+ # { :name => :discoverer,
199
+ # :has_one => [
200
+ # :name,
201
+ # :year
202
+ # ]
203
+ # }
204
+ # ]
205
+ # }
206
+ # ]
207
+ #
208
+ # though there is an accompanying loss of metadata about each
209
+ # field.
210
+ class Schema < Array
211
+
212
+ def initialize input=nil
213
+ super()
214
+ concat(input.map { |field| IMW::Metadata::Field.new(field) }) if input
215
+ end
216
+
217
+ def self.load resource
218
+ new(IMW.open(resource).load)
219
+ end
220
+
221
+ def [] index
222
+ [Integer, Range].include?(index.class) ? super(index) : detect { |field| field[:name].to_s == index.to_s }
223
+ end
224
+
225
+ end
226
+ end
227
+ end