imw 0.2.7 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (93) hide show
  1. data/Gemfile +23 -0
  2. data/Gemfile.lock +47 -0
  3. data/LICENSE +20 -674
  4. data/README.rdoc +3 -4
  5. data/VERSION +1 -1
  6. data/lib/imw.rb +64 -35
  7. data/lib/imw/dataset.rb +12 -2
  8. data/lib/imw/formats.rb +4 -2
  9. data/lib/imw/formats/delimited.rb +96 -36
  10. data/lib/imw/formats/excel.rb +69 -101
  11. data/lib/imw/formats/json.rb +3 -5
  12. data/lib/imw/formats/pdf.rb +71 -0
  13. data/lib/imw/formats/yaml.rb +3 -5
  14. data/lib/imw/metadata.rb +66 -0
  15. data/lib/imw/metadata/contains_metadata.rb +44 -0
  16. data/lib/imw/metadata/dsl.rb +111 -0
  17. data/lib/imw/metadata/field.rb +65 -0
  18. data/lib/imw/metadata/schema.rb +227 -0
  19. data/lib/imw/metadata/schematized.rb +27 -0
  20. data/lib/imw/parsers.rb +1 -0
  21. data/lib/imw/parsers/flat.rb +44 -0
  22. data/lib/imw/resource.rb +36 -224
  23. data/lib/imw/schemes.rb +3 -1
  24. data/lib/imw/schemes/hdfs.rb +12 -1
  25. data/lib/imw/schemes/http.rb +1 -2
  26. data/lib/imw/schemes/local.rb +139 -16
  27. data/lib/imw/schemes/remote.rb +14 -9
  28. data/lib/imw/schemes/s3.rb +12 -0
  29. data/lib/imw/schemes/sql.rb +117 -0
  30. data/lib/imw/tools.rb +5 -3
  31. data/lib/imw/tools/downloader.rb +63 -0
  32. data/lib/imw/tools/summarizer.rb +21 -10
  33. data/lib/imw/utils.rb +10 -0
  34. data/lib/imw/utils/dynamically_extendable.rb +137 -0
  35. data/lib/imw/utils/error.rb +3 -0
  36. data/lib/imw/utils/extensions.rb +0 -4
  37. data/lib/imw/utils/extensions/array.rb +6 -7
  38. data/lib/imw/utils/extensions/hash.rb +3 -5
  39. data/lib/imw/utils/extensions/string.rb +3 -3
  40. data/lib/imw/utils/has_uri.rb +114 -0
  41. data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
  42. data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
  43. data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
  44. data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
  45. data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
  46. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
  47. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
  48. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
  49. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
  50. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
  51. data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
  52. data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
  53. data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
  54. data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
  55. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
  56. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
  57. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
  58. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
  59. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
  60. data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
  61. data/spec/data/formats/excel/sample.xls +0 -0
  62. data/spec/data/formats/json/sample.json +1 -0
  63. data/spec/data/formats/none/sample +650 -0
  64. data/spec/data/formats/sgml/sample.xml +617 -0
  65. data/spec/data/formats/text/sample.txt +650 -0
  66. data/spec/data/formats/yaml/sample.yaml +410 -0
  67. data/spec/data/schema-tabular.yaml +11 -0
  68. data/spec/imw/formats/delimited_spec.rb +34 -2
  69. data/spec/imw/formats/excel_spec.rb +55 -0
  70. data/spec/imw/formats/json_spec.rb +3 -3
  71. data/spec/imw/formats/sgml_spec.rb +4 -4
  72. data/spec/imw/formats/yaml_spec.rb +3 -3
  73. data/spec/imw/metadata/field_spec.rb +26 -0
  74. data/spec/imw/metadata/schema_spec.rb +27 -0
  75. data/spec/imw/metadata_spec.rb +39 -0
  76. data/spec/imw/parsers/line_parser_spec.rb +1 -1
  77. data/spec/imw/resource_spec.rb +0 -100
  78. data/spec/imw/schemes/hdfs_spec.rb +19 -13
  79. data/spec/imw/schemes/local_spec.rb +59 -3
  80. data/spec/imw/schemes/s3_spec.rb +4 -0
  81. data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
  82. data/spec/imw/utils/has_uri_spec.rb +55 -0
  83. data/spec/spec_helper.rb +1 -2
  84. data/spec/support/random.rb +4 -4
  85. metadata +58 -17
  86. data/CHANGELOG +0 -0
  87. data/TODO +0 -18
  88. data/spec/data/sample.json +0 -782
  89. data/spec/data/sample.txt +0 -131
  90. data/spec/data/sample.xml +0 -653
  91. data/spec/data/sample.yaml +0 -651
  92. data/spec/spec.opts +0 -4
  93. data/spec/support/extensions.rb +0 -18
@@ -27,15 +27,13 @@ module IMW
27
27
  load(&block)
28
28
  end
29
29
 
30
- # Dump the +data+ into this resource. It must be opened for
30
+ # Emit the +data+ into this resource. It must be opened for
31
31
  # writing.
32
32
  #
33
- # @param [Hash, String, Array, Fixnum] data the Ruby object to dump
34
- # @option options [true, false] :persist (false) Don't close the IO object after writing
35
- def dump data, options={}
33
+ # @param [Hash, String, Array, Fixnum] data the Ruby object to emit
34
+ def emit data, options={}
36
35
  require 'json'
37
36
  write(data.to_json)
38
- io.close unless options[:persist]
39
37
  self
40
38
  end
41
39
  end
@@ -0,0 +1,71 @@
1
+ module IMW
2
+ module Formats
3
+
4
+ # Defines methods for parsing and generating PDF.
5
+ #
6
+ # Uses PDF::Reader for parsing and Prawn for generating.
7
+ module Pdf
8
+
9
+ # Return a snippet of text from this PDF.
10
+ #
11
+ # @return [String]
12
+ def snippet
13
+ begin
14
+ require 'pdf/reader'
15
+ snippetizer = Snippetizer.new
16
+ PDF::Reader.file(path, snippetizer)
17
+ snippetizer.snippet
18
+ rescue Snippetizer::SnippetEndError
19
+ snippetizer.snippet
20
+ rescue
21
+ ''
22
+ end
23
+ end
24
+
25
+ # A receiver class used by PDF::Reader which agglomerates text
26
+ # up to 1024 bytes and then bails.
27
+ class Snippetizer
28
+
29
+ # A custom error class that can be thrown while receiving text
30
+ # from PDF::Reader to cut-short walking large PDF documents.
31
+ SnippetEndError = Class.new(IMW::Error)
32
+
33
+ # The snippet being built by this snippetizer.
34
+ attr_accessor :snippet
35
+
36
+ def initialize
37
+ @snippet = ''
38
+ end
39
+
40
+ # Agglomerates text from PDF::Reader up to a fixed size of
41
+ # 1024 bytes.
42
+ #
43
+ # Will convert a single-space line from PDF::Reader into a
44
+ # newline character.
45
+ #
46
+ # FIXME How does the receiver ask PDF::Reader to abort walking
47
+ # the document now that enough text has been returned? Till a
48
+ # more graceful way is found this method simply raises an
49
+ # error, creating a GOTO...
50
+ def show_text *params
51
+ params.each do |string|
52
+ if @snippet.size < 1024
53
+ if string == ' '
54
+ @snippet += "\n"
55
+ else
56
+ @snippet += string[0..1024]
57
+ end
58
+ else
59
+ raise SnippetEndError.new
60
+ end
61
+ end
62
+ end
63
+ alias_method :show_text_with_positioning, :show_text
64
+ alias_method :move_to_next_line_and_show_text, :show_text
65
+ alias_method :set_spacing_next_line_show_text, :show_text
66
+ end
67
+
68
+ end
69
+ end
70
+ end
71
+
@@ -27,15 +27,13 @@ module IMW
27
27
  load(&block)
28
28
  end
29
29
 
30
- # Dump the +data+ into this resource. It must be opened for
30
+ # Emit the +data+ into this resource. It must be opened for
31
31
  # writing.
32
32
  #
33
- # @param [Hash, String, Array, Fixnum] data the Ruby object to dump
34
- # @option options [true, false] :persist (false) Don't close the IO object after writing
35
- def dump data, options={}
33
+ # @param [Hash, String, Array, Fixnum] data the Ruby object to emit
34
+ def emit data, options={}
36
35
  require 'yaml'
37
36
  write(data.to_yaml)
38
- io.close unless options[:persist]
39
37
  self
40
38
  end
41
39
  end
@@ -0,0 +1,66 @@
1
+ module IMW
2
+
3
+ # A collection of classes for describing the metadata associated
4
+ # with a dataset's fields.
5
+ class Metadata < Hash
6
+
7
+ autoload :Field, 'imw/metadata/field'
8
+ autoload :Schema, 'imw/metadata/schema'
9
+ autoload :Schematized, 'imw/metadata/schematized'
10
+ autoload :DSL, 'imw/metadata/dsl'
11
+ autoload :ContainsMetadata, 'imw/metadata/contains_metadata'
12
+
13
+ # The resource this Metadata is anchored to.
14
+ #
15
+ # This attribute is useful for letting relative paths in a
16
+ # schema file refer to a common base URL.
17
+ #
18
+ # @return [IMW::Resource]
19
+ attr_reader :base
20
+
21
+ # Set the resource this Metadata is anchored to.
22
+ #
23
+ # @param [IMW::Resource, String, Addressable::URI] new_base
24
+ def base= new_base
25
+ base_resource = IMW.open(new_base)
26
+ base_resource.should_exist!("Metdata base directory must exist")
27
+ raise IMW::PathError.new("Metadata base must be a directory") unless base_resource.is_directory?
28
+ @base = base_resource
29
+ end
30
+
31
+ def initialize obj=nil, options={}
32
+ super()
33
+ self.base = options[:base] if options[:base]
34
+ obj.each_pair { |resource, schema| self[resource] = Schema.new(schema) } if obj
35
+ end
36
+
37
+ def self.load metadata_resource, options
38
+ resource = IMW.open(metadata_resource)
39
+ new(resource.load, {:base => resource.dirname}.merge(options))
40
+ end
41
+
42
+ def []= resource_spec, schema_spec
43
+ schema = schema_spec.is_a?(Schema) ? schema_spec : Schema.new(schema_spec)
44
+ super(absolute_uri(resource_spec), schema_spec)
45
+ end
46
+
47
+ def [] resource_spec
48
+ super(absolute_uri(resource_spec))
49
+ end
50
+
51
+ def describe? resource_spec
52
+ has_key?(absolute_uri(resource_spec))
53
+ end
54
+
55
+ protected
56
+
57
+ def absolute_uri resource_spec
58
+ if base && resource_spec.to_s !~ %r{(^/|://)} # relative path
59
+ base.join(resource_spec).to_s
60
+ else
61
+ resource_spec.to_s
62
+ end
63
+ end
64
+
65
+ end
66
+ end
@@ -0,0 +1,44 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+ # A module that can be mixed into any class defining a +contents+
5
+ # method which returns an Array of URI strings.
6
+ module ContainsMetadata
7
+
8
+ # The path at which this resource's metadata file lives.
9
+ #
10
+ # Will default to any file beginning with +metadata+ and ending
11
+ # with a +yaml+ or +json+ extension contained in this resource's
12
+ # +contents+.
13
+ #
14
+ # @return [String, nil]
15
+ def metadata_uri
16
+ @metadata_uri ||= contents.detect { |path| path =~ /metadata.*\.(ya?ml|json)$/ }
17
+ end
18
+
19
+ # Explicitly set the path to the metadata for this resource.
20
+ attr_writer :metadata_uri
21
+
22
+ # Does this resource contain metadata for other resources it
23
+ # contains?
24
+ #
25
+ # @return [true, false]
26
+ def metadata?
27
+ (!! metadata_uri)
28
+ end
29
+
30
+ # Return the metadata for this resource.
31
+ #
32
+ # @return [IMW::Metadata, nil]
33
+ def metadata
34
+ @metadata ||= metadata? && IMW::Metadata.load(metadata_uri)
35
+ end
36
+
37
+ # Explicitly set the metadata for this resource.
38
+ attr_writer :metadata
39
+
40
+ end
41
+ end
42
+ end
43
+
44
+
@@ -0,0 +1,111 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+ # A module which defines a DSL that can be used to define metadata
5
+ # for an object.
6
+ module DSL
7
+
8
+ # Open a new resource at the given URI.
9
+ #
10
+ # If this dataset has metadata and it describes the resource
11
+ # then configure the resource to understand its schema.
12
+ #
13
+ # The +schema+ property passed via the options hash will
14
+ # override this.
15
+ #
16
+ # @param [String, Addressable::URI, IMW::Resource] uri
17
+ # @param [Hash] options
18
+ # @return [IMW::Resource]
19
+ # @see IMW.open
20
+ def open uri, options={}, &block
21
+ schema_options = (options[:schema].nil? && metadata && metadata.describe?(uri)) ? {:schema => metadata[uri]} : {}
22
+ IMW.open(uri, options.merge(schema_options), &block)
23
+ end
24
+
25
+ def open! uri, options={}, &block
26
+ self.open(uri, options.merge(:mode => 'w'), &block)
27
+ end
28
+
29
+ # When called without a block return this object's metadata.
30
+ #
31
+ # metadata
32
+ # #=> { '/path/to/file' => [...], '/path/to/other/file' => [...], ... }
33
+ #
34
+ # When called with a block, accumulate schema and fields into
35
+ # this object's metadata
36
+ #
37
+ # metadata do
38
+ #
39
+ # schema "/path/to/file" do
40
+ # # ...
41
+ # end
42
+ #
43
+ # schema "/path/to/other/file" do
44
+ # # ...
45
+ # end
46
+ # end
47
+ #
48
+ # @see [IMW::Metadata::Schema]
49
+ # @see [IMW::Metadata::Field]
50
+ # @return [IMW::Metadata]
51
+ def metadata arg=nil, options={}, &block
52
+ case arg
53
+ when Hash
54
+ @metadata ||= Metadata.new(arg, options)
55
+ when nil
56
+ @metadata ||= Metadata.new nil, options
57
+ else
58
+ @metadata ||= Metadata.load(arg, options)
59
+ end
60
+ @metadata.base = options[:base] if options[:base]
61
+ return @metadata unless block_given?
62
+ yield
63
+ end
64
+
65
+ def schema resource, options={}, &block
66
+ new_field_accumulator!
67
+ yield
68
+ metadata[resource] = Schema.new(last_field_accumulator!)
69
+ end
70
+
71
+ def field name, options={}
72
+ accumulate_field Field.new(options.merge(:name => name))
73
+ end
74
+
75
+ def has_one name, options={}, &block
76
+ new_field_accumulator!
77
+ yield
78
+ accumulate_field Field.new(options.merge(:name => name, :has_one => last_field_accumulator!))
79
+ end
80
+
81
+ def has_many name, options={}, &block
82
+ new_field_accumulator!
83
+ yield
84
+ accumulate_field Field.new(options.merge(:name => name, :has_many => last_field_accumulator!))
85
+ end
86
+
87
+ protected
88
+
89
+ def field_accumulators # :nodoc:
90
+ @field_accumulators ||= []
91
+ end
92
+
93
+ def new_field_accumulator! # :nodoc:
94
+ field_accumulators.push([])
95
+ end
96
+
97
+ def last_field_accumulator! # :nodoc:
98
+ field_accumulators.pop
99
+ end
100
+
101
+ def field_accumulator? # :nodoc:
102
+ ! field_accumulators.empty?
103
+ end
104
+
105
+ def accumulate_field f # :nodoc:
106
+ # raise IMW::SchemaError.new("No record or sub-record to accumulate fields in!") unless field_accumulator?
107
+ field_accumulators.last << f if field_accumulator?
108
+ end
109
+ end
110
+ end
111
+ end
@@ -0,0 +1,65 @@
1
+ module IMW
2
+
3
+ class Metadata
4
+
5
+ # Conceptually, a field is a "slot" for which "records" can have
6
+ # values.
7
+ #
8
+ # An IMW::Metadata::Field is essentially a Hash that has one required
9
+ # property: a name.
10
+ #
11
+ # IMW::Metadata::Field.new('id')
12
+ # #=> { 'name' => 'id' }
13
+ #
14
+ # But you can declare as many other properties as you want (as long
15
+ # as you include a +name+):
16
+ #
17
+ # IMW::Metadata::Field.new 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented."
18
+ # #=> { 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented." }
19
+ #
20
+ # Some properties make a field special:
21
+ #
22
+ # <tt>has_many</tt>::
23
+ # Denotes that this record is in a "has_many" relationship with
24
+ # one or more other records. The corresponding value should be
25
+ # an array
26
+ #
27
+ # <tt>has_one</tt>::
28
+ # Denotes that this record is in a "has_one" relationship with
29
+ # one or more other records. The corresponding value should be
30
+ # an Array in which each key names the joined record and each
31
+ # value is an Array of fields describing the joined record.
32
+ #
33
+ # @see IMW::Metadata::Record for more usage of the
34
+ # <tt>:has_many</tt> and <tt>:has_one</tt> properties.
35
+ class Field < Hash
36
+
37
+ def initialize obj
38
+ super()
39
+ if obj.is_a?(Hash) || obj.is_a?(Field)
40
+ merge!(obj)
41
+ raise IMW::ArgumentError.new("A field must have a name") if obj['name'].blank?
42
+ else
43
+ self['name'] = obj.to_s.strip
44
+ end
45
+ end
46
+
47
+ def hierarchical?
48
+ has_key?('has_many') || has_key?('has_one')
49
+ end
50
+ alias_method :nested?, :hierarchical?
51
+
52
+ def flat?
53
+ ! hierarchical?
54
+ end
55
+
56
+ def titleize
57
+ self['title'] || self['name'].capitalize # FIXME we can do better than this!
58
+ end
59
+
60
+ def associations
61
+ end
62
+
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,227 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+ # A class to describe the schema of a resource.
5
+ #
6
+ # A Schema is built on top of an Array because it is often
7
+ # important to have an ordering for a record's fields.
8
+ #
9
+ # For fields with no such ordering, an Array also works because
10
+ # each of its elements will be a field with a +name+ that can be
11
+ # used to index the corresponding field.
12
+ #
13
+ # A Schema is instantiated with a basic Ruby data structure.
14
+ #
15
+ # == Tabular Data
16
+ #
17
+ # Tabular data formats (CSV, TSV, &c.) contain flat records
18
+ # consisting of repeating rows with the same fields in the same
19
+ # position. A sample of delimited data looks like
20
+ #
21
+ # ID,Name,Genus,Species
22
+ # 001,Gray-bellied Night Monkey,Aotus,lemurinus
23
+ # 002,Panamanian Night Monkey,Aotus,zonalis
24
+ # 003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
25
+ # 004,Gray-handed Night Monkey,Aotus,griseimembra
26
+ # 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
27
+ # ...
28
+ #
29
+ # The schema of these records is summarized as a Ruby data
30
+ # structure in the following way
31
+ #
32
+ # [
33
+ # { :name => :id, :type => :integer },
34
+ # { :name => :name, :type => :string, :title => "Common Name" },
35
+ # { :name => :genus, :type => :string, :title => "Genus" },
36
+ # { :name => :species, :type => :string, :title => "Species" }
37
+ # ]
38
+ #
39
+ # The outer-most Array represents each row and each Hash in the
40
+ # Array represents one of the fields in a row. A Schema
41
+ # initialized with the above Ruby code can be thought of and
42
+ # played with as an Array of Hashes even though it really is a
43
+ # Schema object of Field objects.
44
+ #
45
+ # == Hierarchical Data
46
+ #
47
+ # Hierarchical data formats (JSON, YAML, XML, &c.) can have
48
+ # arbitrarily complex records with fields within fields and so on.
49
+ # A sample of hierarchical XML data looks like
50
+ #
51
+ # <genera>
52
+ # <genus>
53
+ # <name>Mandrillus</name>
54
+ # <species>
55
+ # <species id="113">
56
+ # <name>sphinx</name>
57
+ # <common_name>Mandrill</common_name>
58
+ # </species>
59
+ # <species id="114">
60
+ # <name>leucophaeus</name>
61
+ # <common_name>Drill</common_name>
62
+ # </species>
63
+ # </species>
64
+ # </genus>
65
+ # <genus>
66
+ # <name>Rungwecebus</name>
67
+ # <species>
68
+ # <species id="100">
69
+ # <name>kipunji</name>
70
+ # <common_name>Kipunji</common_name>
71
+ # </species>
72
+ # </species>
73
+ # </genus>
74
+ #
75
+ # These records are described by the following Ruby data structure
76
+ #
77
+ # [
78
+ # { :name => :genera,
79
+ # :has_many => [
80
+ # { :name => 'name', :type => :string, :title => "Genus" },
81
+ # { :name => 'species',
82
+ # :has_many => [
83
+ # { :name => :id, :type => :integer },
84
+ # { :name => :name, :type => :string, :title => "Species" },
85
+ # { :name => :common_name, :type => :string, :title => "Common Name" }
86
+ # ]
87
+ # }
88
+ # ]
89
+ # }
90
+ # ]
91
+ #
92
+ # By IMW convention, the outer-most element of the Schema is still
93
+ # an Array describing a collection of identical records even
94
+ # though XML data must have a single root node, limiting the
95
+ # collection to a single record.
96
+ #
97
+ # The first field of the Schema is named +genera+ and it uses the
98
+ # special field property +has_many+ to denote that the field
99
+ # points to a collection of sub-records.
100
+ #
101
+ # Each of these sub-records has its own sub-schema defined by the
102
+ # Array that the +has_many+ property keys to. In this case, the
103
+ # two fields are +name+ and +species+. +name+ is a simple String
104
+ # value while +species+ itself points at another collection of
105
+ # objects.
106
+ #
107
+ # This second-level nested record (a particular species) is itself
108
+ # composed of the three (flat) fields +id+, +name+, and
109
+ # +common_name+. Note that the Schema doesn't know (or care) that
110
+ # the +id+ field is contained in an XML attribute while the +name+
111
+ # and +common_name+ fields are contained as text within daughter
112
+ # nodes.
113
+ #
114
+ # A different way of structuring the same information, this time
115
+ # expressed in YAML:
116
+ #
117
+ # ---
118
+ # Mandrillus:
119
+ # - :species: sphinx
120
+ # :name: Mandrill
121
+ # :id: "113"
122
+ # - :species: leucophaeus
123
+ # :name: Drill
124
+ # :id: "114"
125
+ # Rungwecebus:
126
+ # - :species: kipunji
127
+ # :name: Kipunji
128
+ # :id: "100"
129
+ #
130
+ # Would lead to a different Schema
131
+ #
132
+ # [
133
+ # { :name => :genus, :title => "Genus",
134
+ # :has_many => [
135
+ # { :name => :id, :type => :integer },
136
+ # { :name => :name, :type => :string, :title => "Common Name" },
137
+ # { :name => :species, :type => :string, :title => "Species" }
138
+ # ]
139
+ # }
140
+ # ]
141
+ #
142
+ # Where the unnecessary outer wrapper field +genera+ has been
143
+ # dispensed with.
144
+ #
145
+ # In addition to "has many" relationships a record can have a
146
+ # "has_one" relationship. The above data might be expressed
147
+ #
148
+ # ---
149
+ # Mandrillus:
150
+ # - species: sphinx
151
+ # name: Mandrill
152
+ # id: "113"
153
+ # discoverer:
154
+ # name: Dr. Monkeypants
155
+ # year: 1838
156
+ # - species: leucophaeus
157
+ # name: Drill
158
+ # id: "114"
159
+ # discoverer:
160
+ # name: Ms. Cecelia Apefingers
161
+ # year: 1921
162
+ #
163
+ # would result in the following Schema:
164
+ #
165
+ # [
166
+ # { :name => :genus, :title => "Genus",
167
+ # :has_many => [
168
+ # { :name => :id, :type => :integer },
169
+ # { :name => :name, :type => :string, :title => "Common Name" },
170
+ # { :name => :species, :type => :string },
171
+ # { :name => :discoverer,
172
+ # :has_one => [
173
+ # { :name => 'name', :type => :string },
174
+ # { :name => 'year', :type => :integer }
175
+ # ]
176
+ # }
177
+ # ]
178
+ # }
179
+ # ]
180
+ #
181
+ # The +discoverer+ field is marked as +has_one+ which means the
182
+ # +name+ and +year+ fields in the corresponding Array will be
183
+ # interpreted as fields in a single attached sub-record.
184
+ #
185
+ # == Compact Schemas
186
+ #
187
+ # The internal hashes in a Schema specification are really Field
188
+ # objects and the initializer will promote Strings and Symbols to
189
+ # Field objects automatically. This means that the above Schema
190
+ # specification could be replaced by
191
+ #
192
+ # [
193
+ # { :name => :genus,
194
+ # :has_many => [
195
+ # :id,
196
+ # :name,
197
+ # :species,
198
+ # { :name => :discoverer,
199
+ # :has_one => [
200
+ # :name,
201
+ # :year
202
+ # ]
203
+ # }
204
+ # ]
205
+ # }
206
+ # ]
207
+ #
208
+ # though there is an accompanying loss of metadata about each
209
+ # field.
210
+ class Schema < Array
211
+
212
+ def initialize input=nil
213
+ super()
214
+ concat(input.map { |field| IMW::Metadata::Field.new(field) }) if input
215
+ end
216
+
217
+ def self.load resource
218
+ new(IMW.open(resource).load)
219
+ end
220
+
221
+ def [] index
222
+ [Integer, Range].include?(index.class) ? super(index) : detect { |field| field[:name].to_s == index.to_s }
223
+ end
224
+
225
+ end
226
+ end
227
+ end