imw 0.2.16 → 0.2.17
Sign up to get free protection for your applications and to get access to all the features.
- data/VERSION +1 -1
- data/lib/imw/dataset.rb +1 -1
- data/lib/imw/formats/delimited.rb +9 -14
- data/lib/imw/metadata.rb +38 -21
- data/lib/imw/metadata/contains_metadata.rb +35 -25
- data/lib/imw/metadata/field.rb +0 -28
- data/lib/imw/metadata/has_metadata.rb +93 -0
- data/lib/imw/metadata/has_summary.rb +51 -0
- data/lib/imw/metadata/schema.rb +6 -216
- data/lib/imw/resource.rb +2 -5
- data/lib/imw/schemes/http.rb +1 -1
- data/lib/imw/schemes/local.rb +18 -46
- data/lib/imw/schemes/sql.rb +12 -0
- data/lib/imw/tools/summarizer.rb +12 -13
- data/spec/imw/formats/delimited_spec.rb +3 -12
- data/spec/imw/metadata/contains_metadata_spec.rb +56 -0
- data/spec/imw/metadata/field_spec.rb +4 -5
- data/spec/imw/metadata/has_metadata_spec.rb +58 -0
- data/spec/imw/metadata/has_summary_spec.rb +32 -0
- data/spec/imw/metadata/schema_spec.rb +10 -13
- data/spec/imw/metadata_spec.rb +68 -21
- data/spec/imw/schemes/local_spec.rb +12 -22
- data/spec/imw/schemes/s3_spec.rb +0 -1
- metadata +12 -5
- data/lib/imw/metadata/schematized.rb +0 -27
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.17
|
data/lib/imw/dataset.rb
CHANGED
@@ -11,12 +11,6 @@ module IMW
|
|
11
11
|
# @abstract
|
12
12
|
module Delimited
|
13
13
|
|
14
|
-
# Ensure that this delimited resource is described by a an
|
15
|
-
# ordered collection of flat fields.
|
16
|
-
def validate_schema!
|
17
|
-
raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
|
18
|
-
end
|
19
|
-
|
20
14
|
# Default options to be passed to
|
21
15
|
# FasterCSV[http://fastercsv.rubyforge.org/]; see its
|
22
16
|
# documentation for more information.
|
@@ -24,7 +18,7 @@ module IMW
|
|
24
18
|
# @return [Hash]
|
25
19
|
def delimited_options
|
26
20
|
@delimited_options ||= {
|
27
|
-
:headers =>
|
21
|
+
:headers => fields && fields.map { |field| field['name'] }
|
28
22
|
}.merge(resource_options_compatible_with_faster_csv)
|
29
23
|
end
|
30
24
|
|
@@ -68,7 +62,7 @@ module IMW
|
|
68
62
|
# of this delimited data is a row of headers.
|
69
63
|
#
|
70
64
|
# @return [true, false]
|
71
|
-
def
|
65
|
+
def fields_in_first_line?
|
72
66
|
# grab the header and up to 10 body rows
|
73
67
|
require 'fastercsv'
|
74
68
|
copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
|
@@ -93,15 +87,16 @@ module IMW
|
|
93
87
|
determinant && determinant >= 0.05
|
94
88
|
end
|
95
89
|
|
96
|
-
# If it seems like there are
|
97
|
-
# data then go ahead and use them to define
|
90
|
+
# If it seems like there are fields in the first line of this
|
91
|
+
# data then go ahead and use them to define this resource's
|
92
|
+
# fields.
|
98
93
|
#
|
99
|
-
# Will overwrite
|
100
|
-
def
|
101
|
-
return unless
|
94
|
+
# Will overwrite any fields already present for this resource.
|
95
|
+
def guess_fields!
|
96
|
+
return unless fields_in_first_line?
|
102
97
|
copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
|
103
98
|
names = (copy.shift || []) rescue []
|
104
|
-
self.
|
99
|
+
self.fields = names.map { |n| { 'name' => n } }
|
105
100
|
delimited_options[:headers] = names
|
106
101
|
end
|
107
102
|
|
data/lib/imw/metadata.rb
CHANGED
@@ -4,13 +4,13 @@ module IMW
|
|
4
4
|
# with a dataset's fields.
|
5
5
|
class Metadata < Hash
|
6
6
|
|
7
|
-
autoload :Field,
|
8
|
-
autoload :Schema,
|
9
|
-
autoload :Schematized, 'imw/metadata/schematized'
|
10
|
-
autoload :DSL, 'imw/metadata/dsl'
|
7
|
+
autoload :Field, 'imw/metadata/field'
|
8
|
+
autoload :Schema, 'imw/metadata/schema'
|
11
9
|
autoload :ContainsMetadata, 'imw/metadata/contains_metadata'
|
10
|
+
autoload :HasSummary, 'imw/metadata/has_summary'
|
11
|
+
autoload :HasMetadata, 'imw/metadata/has_metadata'
|
12
12
|
|
13
|
-
# The resource this
|
13
|
+
# The resource this metadata is anchored to.
|
14
14
|
#
|
15
15
|
# This attribute is useful for letting relative paths in a
|
16
16
|
# schema file refer to a common base URL.
|
@@ -18,12 +18,12 @@ module IMW
|
|
18
18
|
# @return [IMW::Resource]
|
19
19
|
attr_reader :base
|
20
20
|
|
21
|
-
# Set the resource this
|
21
|
+
# Set the base resource this metadata is anchored to.
|
22
22
|
#
|
23
23
|
# @param [IMW::Resource, String, Addressable::URI] new_base
|
24
24
|
def base= new_base
|
25
25
|
base_resource = IMW.open(new_base)
|
26
|
-
base_resource.should_exist!("
|
26
|
+
base_resource.should_exist!("Metadata base directory must exist")
|
27
27
|
raise IMW::PathError.new("Metadata base must be a directory") unless base_resource.is_directory?
|
28
28
|
@base = base_resource
|
29
29
|
end
|
@@ -31,34 +31,51 @@ module IMW
|
|
31
31
|
def initialize obj=nil, options={}
|
32
32
|
super()
|
33
33
|
self.base = options[:base] if options[:base]
|
34
|
-
|
34
|
+
if obj
|
35
|
+
obj.each_pair do |resource, metadata|
|
36
|
+
self[resource] = metadata
|
37
|
+
end
|
38
|
+
end
|
35
39
|
end
|
36
40
|
|
37
|
-
def self.load
|
38
|
-
resource = IMW.open(
|
41
|
+
def self.load obj, options={}
|
42
|
+
resource = IMW.open(obj)
|
39
43
|
new(resource.load, {:base => resource.dirname}.merge(options))
|
40
44
|
end
|
41
45
|
|
42
|
-
def []=
|
43
|
-
|
44
|
-
|
46
|
+
def []= resource, metadata
|
47
|
+
super(absolute_uri(resource), metadata)
|
48
|
+
end
|
49
|
+
|
50
|
+
def [] resource
|
51
|
+
super(absolute_uri(resource))
|
52
|
+
end
|
53
|
+
|
54
|
+
def describe? resource
|
55
|
+
self[(absolute_uri(resource))]
|
45
56
|
end
|
57
|
+
alias_method :describes?, :describe?
|
46
58
|
|
47
|
-
def
|
48
|
-
|
59
|
+
def description_for resource
|
60
|
+
return unless describes?(resource)
|
61
|
+
self[resource]['description']
|
49
62
|
end
|
50
63
|
|
51
|
-
def
|
52
|
-
|
64
|
+
def fields_for resource
|
65
|
+
return unless describes?(resource)
|
66
|
+
(self[resource]['fields'] || []).map { |f| Metadata::Field.new(f) }
|
53
67
|
end
|
54
68
|
|
55
69
|
protected
|
56
70
|
|
57
|
-
def absolute_uri
|
58
|
-
|
59
|
-
|
71
|
+
def absolute_uri resource
|
72
|
+
obj = IMW.open(resource)
|
73
|
+
if base && obj.uri.to_s !~ %r{(^/|://)} # relative path
|
74
|
+
s = base.join(obj.uri.to_s).uri.to_s
|
75
|
+
s
|
60
76
|
else
|
61
|
-
|
77
|
+
s = obj.uri.to_s
|
78
|
+
s
|
62
79
|
end
|
63
80
|
end
|
64
81
|
|
@@ -1,44 +1,54 @@
|
|
1
|
-
module IMW
|
1
|
+
module IMW
|
2
2
|
class Metadata
|
3
3
|
|
4
|
-
# A module
|
5
|
-
#
|
4
|
+
# A module for finding metadata describing the sub-resources of a
|
5
|
+
# given resource.
|
6
|
+
#
|
7
|
+
# An including class describing the parent resource must define
|
8
|
+
# the +contents+ method which must return an Array of Strings
|
9
|
+
# contained within the parent. These objects will be matched
|
10
|
+
# against possible metadata URIs and the corresponding
|
11
|
+
# IMW::Metadata class created on the fly.
|
12
|
+
#
|
13
|
+
# In case no such object is found, the class should also define
|
14
|
+
# the +basename+ and +path+ methods which will be used to generate
|
15
|
+
# a default URI where metadata about the parent's resources should
|
16
|
+
# live.
|
6
17
|
module ContainsMetadata
|
7
18
|
|
8
|
-
# The
|
19
|
+
# The URI containing the metadata for this resource and its
|
20
|
+
# contents.
|
9
21
|
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
22
|
+
# Looks for an existing JSON or YAML file containing the strings
|
23
|
+
# "icss" or "metadata" directly contained within this resource.
|
24
|
+
#
|
25
|
+
# If none are found, defaults to a URI named after this
|
26
|
+
# resource's basename with the string ".icss.yaml" appended.
|
13
27
|
#
|
14
28
|
# @return [String, nil]
|
15
|
-
def
|
16
|
-
|
29
|
+
def default_metadata_uri
|
30
|
+
contents.detect { |path| path =~ /(icss|metadata).*\.(ya?ml|json)$/i } || File.join(path, "#{basename}.icss.yaml")
|
17
31
|
end
|
18
32
|
|
19
|
-
#
|
20
|
-
attr_writer :metadata_uri
|
21
|
-
|
22
|
-
# Does this resource contain metadata for other resources it
|
23
|
-
# contains?
|
33
|
+
# Return the metadata for this resource if it exists.
|
24
34
|
#
|
25
|
-
#
|
26
|
-
def metadata?
|
27
|
-
(!! metadata_uri)
|
28
|
-
end
|
29
|
-
|
30
|
-
# Return the metadata for this resource.
|
35
|
+
# Will look for an existing resource at +default_metadata_uri+.
|
31
36
|
#
|
32
37
|
# @return [IMW::Metadata, nil]
|
33
38
|
def metadata
|
34
|
-
@metadata
|
39
|
+
return @metadata if @metadata
|
40
|
+
obj = IMW.open(default_metadata_uri)
|
41
|
+
self.metadata=(obj) if obj.exist?
|
42
|
+
@metadata
|
35
43
|
end
|
36
44
|
|
37
|
-
#
|
38
|
-
|
45
|
+
# Set the metadata for this resource to +obj+.
|
46
|
+
#
|
47
|
+
# @param [String, Addressable::URI, IMW::Resource] obj
|
48
|
+
def metadata= obj
|
49
|
+
@metadata = IMW::Metadata.load(obj)
|
50
|
+
end
|
39
51
|
|
40
52
|
end
|
41
53
|
end
|
42
54
|
end
|
43
|
-
|
44
|
-
|
data/lib/imw/metadata/field.rb
CHANGED
@@ -16,22 +16,6 @@ module IMW
|
|
16
16
|
#
|
17
17
|
# IMW::Metadata::Field.new 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented."
|
18
18
|
# #=> { 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented." }
|
19
|
-
#
|
20
|
-
# Some properties make a field special:
|
21
|
-
#
|
22
|
-
# <tt>has_many</tt>::
|
23
|
-
# Denotes that this record is in a "has_many" relationship with
|
24
|
-
# one or more other records. The corresponding value should be
|
25
|
-
# an array
|
26
|
-
#
|
27
|
-
# <tt>has_one</tt>::
|
28
|
-
# Denotes that this record is in a "has_one" relationship with
|
29
|
-
# one or more other records. The corresponding value should be
|
30
|
-
# an Array in which each key names the joined record and each
|
31
|
-
# value is an Array of fields describing the joined record..
|
32
|
-
#
|
33
|
-
# @see IMW::Metadata::Record for more usage of the
|
34
|
-
# <tt>:has_many</tt> and <tt>:has_one</tt> properties.
|
35
19
|
class Field < Hash
|
36
20
|
|
37
21
|
def initialize obj
|
@@ -43,23 +27,11 @@ module IMW
|
|
43
27
|
self['name'] = obj.to_s.strip
|
44
28
|
end
|
45
29
|
end
|
46
|
-
|
47
|
-
def hierarchical?
|
48
|
-
has_key?('has_many') || has_key?('has_one')
|
49
|
-
end
|
50
|
-
alias_method :nested?, :hierarchical?
|
51
|
-
|
52
|
-
def flat?
|
53
|
-
! hierarchical?
|
54
|
-
end
|
55
30
|
|
56
31
|
def titleize
|
57
32
|
self['title'] || self['name'].capitalize # FIXME we can do better than this!
|
58
33
|
end
|
59
34
|
|
60
|
-
def associations
|
61
|
-
end
|
62
|
-
|
63
35
|
end
|
64
36
|
end
|
65
37
|
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
module IMW
|
2
|
+
class Metadata
|
3
|
+
|
4
|
+
|
5
|
+
# A module which defines how a resource finds the Metadata in which it can
|
6
|
+
# look up metadata about itself.
|
7
|
+
#
|
8
|
+
# "metadata" in this context is defined as accessors for
|
9
|
+
# +metadata+ (IMW::Metadata), +schema+ (IMW::Metadata::Schema),
|
10
|
+
# +fields+ (IMW::Metadata::Field), and +description+ (String).
|
11
|
+
#
|
12
|
+
# An including class should define a method +dir+ which should
|
13
|
+
# return an object that might contain Metadata, i.e. - that
|
14
|
+
# includes the IMW::Metadata::ContainsMetadata module.
|
15
|
+
#
|
16
|
+
# An including class can optionally define the methods +snippet+
|
17
|
+
# which returns a snippet of the resource as well as
|
18
|
+
# +record_count+ to return a count of how many records the
|
19
|
+
# resource contains.
|
20
|
+
module HasMetadata
|
21
|
+
|
22
|
+
# The schema for this object.
|
23
|
+
#
|
24
|
+
# @return [Hash]
|
25
|
+
def schema
|
26
|
+
return @schema if @schema
|
27
|
+
@schema = IMW::Metadata::Schema.new
|
28
|
+
@schema[:type] = "record"
|
29
|
+
@schema[:namespace] = "schema.imw.resource"
|
30
|
+
@schema[:name] = (basename || '')
|
31
|
+
@schema[:doc] = description
|
32
|
+
@schema[:fields] = fields
|
33
|
+
|
34
|
+
@schema[:non_avro ] = {}
|
35
|
+
@schema[:non_avro][:snippet] = snippet if respond_to?(:snippet)
|
36
|
+
@schema[:non_avro][:record_count] = record_count if respond_to?(:record_count)
|
37
|
+
@schema
|
38
|
+
end
|
39
|
+
|
40
|
+
# Return the metadata object that contains metadata for this
|
41
|
+
# resource.
|
42
|
+
#
|
43
|
+
# Will look in this resource's directory and recursively upward
|
44
|
+
# till the root directory is reached or a metadata file is
|
45
|
+
# discovered.
|
46
|
+
#
|
47
|
+
# @return [IMW::Metadata, nil]
|
48
|
+
def metadata
|
49
|
+
return @metadata if @metadata
|
50
|
+
d = dir
|
51
|
+
while d.path != '/'
|
52
|
+
break if d.metadata && d.metadata.describes?(self)
|
53
|
+
d = d.dir
|
54
|
+
end
|
55
|
+
@metadata = d.metadata
|
56
|
+
end
|
57
|
+
|
58
|
+
# The fields for this resource's data.
|
59
|
+
#
|
60
|
+
# Each field will be a Hash of information.
|
61
|
+
#
|
62
|
+
# @return [Array<Hash>]
|
63
|
+
def fields
|
64
|
+
@fields ||= metadata && metadata.fields_for(self)
|
65
|
+
end
|
66
|
+
|
67
|
+
# Set the fields for this resource.
|
68
|
+
#
|
69
|
+
# @param [Array<Hash>] new_fields
|
70
|
+
# @return [Array<Hash>]
|
71
|
+
def fields= new_fields
|
72
|
+
@fields = new_fields.map { |f| Metadata::Field.new(f) }
|
73
|
+
end
|
74
|
+
|
75
|
+
# A description for this Resource.
|
76
|
+
#
|
77
|
+
# @return [String]
|
78
|
+
def description
|
79
|
+
@description ||= metadata && metadata.description_for(self)
|
80
|
+
end
|
81
|
+
|
82
|
+
# Set the description of this Resource.
|
83
|
+
#
|
84
|
+
# @param [String] new_description
|
85
|
+
# @return [String]
|
86
|
+
def description= new_description
|
87
|
+
@description = new_description
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module IMW
|
2
|
+
class Metadata
|
3
|
+
|
4
|
+
# A module for generating a summary & schema of a resource.
|
5
|
+
#
|
6
|
+
# The including class should define methods +uri+, +basename+, +extension+.
|
7
|
+
module HasSummary
|
8
|
+
|
9
|
+
# Return a full summary of this Resource.
|
10
|
+
#
|
11
|
+
# The summary will include "external" information about how this
|
12
|
+
# resource appears to the world (via its URI), "internal"
|
13
|
+
# metadata about this resource (its description, &c.), as well
|
14
|
+
# as the structure of this resource's data (its schema's fields
|
15
|
+
# and a snippet).
|
16
|
+
#
|
17
|
+
# Will return a Hash, with a <tt>:schema</tt> key which maps to
|
18
|
+
# a well-formed Avro schema for this resource.
|
19
|
+
#
|
20
|
+
# @return [Hash]
|
21
|
+
def summary
|
22
|
+
return @summary if @summary
|
23
|
+
@summary = external_summary
|
24
|
+
@summary[:schema] = schema if respond_to?(:schema)
|
25
|
+
@summary[:contents] = resources.map(&:summary) if respond_to?(:resources)
|
26
|
+
@summary
|
27
|
+
end
|
28
|
+
|
29
|
+
# Return information (usually scheme-dependent) on how this
|
30
|
+
# resource is situated in the world, i.e. - its URI, its size,
|
31
|
+
# how many lines it has, &c.
|
32
|
+
#
|
33
|
+
# Modules which override this should chain with +super+:
|
34
|
+
#
|
35
|
+
# # in my_scheme.rb
|
36
|
+
# def external_summary
|
37
|
+
# super().merge(:user => 'bob', :password => 'smith')
|
38
|
+
# end
|
39
|
+
#
|
40
|
+
# @return [Hash]
|
41
|
+
def external_summary
|
42
|
+
{
|
43
|
+
:uri => uri.to_s,
|
44
|
+
:basename => basename,
|
45
|
+
:extension => extension
|
46
|
+
}
|
47
|
+
end
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
end
|
data/lib/imw/metadata/schema.rb
CHANGED
@@ -1,227 +1,17 @@
|
|
1
1
|
module IMW
|
2
|
+
|
2
3
|
class Metadata
|
3
4
|
|
4
|
-
#
|
5
|
-
#
|
6
|
-
# A Schema is built on top of an Array because it is often
|
7
|
-
# important to have an ordering for a record's fields.
|
8
|
-
#
|
9
|
-
# For fields with no such ordering, an Array also works because
|
10
|
-
# each of its element will be a field with a +name+ that can be
|
11
|
-
# used to index the corresponding field.
|
12
|
-
#
|
13
|
-
# A Schema is instantiated with a basic Ruby data structure.
|
14
|
-
#
|
15
|
-
# == Tabular Data
|
16
|
-
#
|
17
|
-
# Tabular data formats (CSV, TSV, &c.) contain flat records
|
18
|
-
# consisting of repeating rows with the same fields in the same
|
19
|
-
# position. A sample of delimited data looks like
|
20
|
-
#
|
21
|
-
# ID,Name,Genus,Species
|
22
|
-
# 001,Gray-bellied Night Monkey,Aotus,lemurinus
|
23
|
-
# 002,Panamanian Night Monkey,Aotus,zonalis
|
24
|
-
# 003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
|
25
|
-
# 004,Gray-handed Night Monkey,Aotus,griseimembra
|
26
|
-
# 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
|
27
|
-
# ...
|
28
|
-
#
|
29
|
-
# The schema of these records is summarized as a Ruby data
|
30
|
-
# structure in the following way
|
31
|
-
#
|
32
|
-
# [
|
33
|
-
# { :name => :id, :type => :integer },
|
34
|
-
# { :name => :name, :type => :string, :title => "Common Name" },
|
35
|
-
# { :name => :genus, :type => :string, :title => "Genus" },
|
36
|
-
# { :name => :species, :type => :string, :title => "Species" }
|
37
|
-
# ]
|
38
|
-
#
|
39
|
-
# The outer-most Array represents each row and each Hash in the
|
40
|
-
# Array represents one of the fields in a row. A Schema
|
41
|
-
# initialized with the above Ruby code can be thought of and
|
42
|
-
# played with as an Array of Hashes even though it really is a
|
43
|
-
# Schema object of Field objects.
|
44
|
-
#
|
45
|
-
# == Hierarchical Data
|
46
|
-
#
|
47
|
-
# Hierarchical data formats (JSON, YAML, XML, &c.) can have
|
48
|
-
# arbitrarily complex records with fields within fields and so on.
|
49
|
-
# A sample of hierarchical XML data looks like
|
50
|
-
#
|
51
|
-
# <genera>
|
52
|
-
# <genus>
|
53
|
-
# <name>Mandrillus</name>
|
54
|
-
# <species>
|
55
|
-
# <species id="113">
|
56
|
-
# <name>sphinx</name>
|
57
|
-
# <common_name>Mandrill</common_name>
|
58
|
-
# </species>
|
59
|
-
# <species id="114">
|
60
|
-
# <name>leucophaeus</name>
|
61
|
-
# <common_name>Drill</common_name>
|
62
|
-
# </species>
|
63
|
-
# </species>
|
64
|
-
# </genus>
|
65
|
-
# <genus>
|
66
|
-
# <name>Rungwecebus</name>
|
67
|
-
# <species>
|
68
|
-
# <species id="100">
|
69
|
-
# <name>kipunji</name>
|
70
|
-
# <common_name>Kipunji</common_name>
|
71
|
-
# </species>
|
72
|
-
# </species>
|
73
|
-
# </genus>
|
74
|
-
#
|
75
|
-
# These records are described by the following Ruby data structure
|
76
|
-
#
|
77
|
-
# [
|
78
|
-
# { :name => :genera,
|
79
|
-
# :has_many => [
|
80
|
-
# { :name => 'name', :type => :string, title => "Genus" },
|
81
|
-
# { :name => 'species',
|
82
|
-
# :has_many => [
|
83
|
-
# { :name => :id, :type => :integer },
|
84
|
-
# { :name => :name, :type => :string, :title => "Species" },
|
85
|
-
# { :name => :common_name, :type => :string, :title => "Common Name" }
|
86
|
-
# ]
|
87
|
-
# }
|
88
|
-
# ]
|
89
|
-
# }
|
90
|
-
# ]
|
91
|
-
#
|
92
|
-
# By IMW convention, the outer-most element of the Schema is still
|
93
|
-
# an Array describing a collection of identical records even
|
94
|
-
# though XML data must have a single root node, limiting the
|
95
|
-
# collection to a single record.
|
96
|
-
#
|
97
|
-
# The first field of the Schema is named +genera+ and it uses the
|
98
|
-
# special field property +has_many+ to denote that the field
|
99
|
-
# points to a collection of sub-records.
|
5
|
+
# Represents a schema for data.
|
100
6
|
#
|
101
|
-
#
|
102
|
-
|
103
|
-
# two fields are +name+ and +species+. +name+ is a simple String
|
104
|
-
# value while +species+ itself points at another collection of
|
105
|
-
# objects.
|
106
|
-
#
|
107
|
-
# This second-level nested record (a particular species) is itself
|
108
|
-
# composed of the three (flat) fields +id+, +name+, and
|
109
|
-
# +common_name+. Note that the Schema doesn't know (or care) that
|
110
|
-
# the +id+ field is contained in an XML attribute while the +name+
|
111
|
-
# and +common_name+ fields are contained as text within daughter
|
112
|
-
# nodes.
|
113
|
-
#
|
114
|
-
# A different way of structure the same information, this time
|
115
|
-
# expressed in YAML:
|
116
|
-
#
|
117
|
-
# ---
|
118
|
-
# Mandrillus:
|
119
|
-
# - :species: sphinx
|
120
|
-
# :name: Mandrill
|
121
|
-
# :id: "113"
|
122
|
-
# - :species: leucophaeus
|
123
|
-
# :name: Drill
|
124
|
-
# :id: "114"
|
125
|
-
# Rungwecebus:
|
126
|
-
# - :species: kipunji
|
127
|
-
# :name: Kipunji
|
128
|
-
# :id: "100"
|
129
|
-
#
|
130
|
-
# Would lead to a different Schema
|
131
|
-
#
|
132
|
-
# [
|
133
|
-
# { :name => :genus, :title => "Genus",
|
134
|
-
# :has_many => [
|
135
|
-
# { :name => :id, :type => :integer },
|
136
|
-
# { :name => :name, :type => :string, :title => "Common Name" },
|
137
|
-
# { :name => :species, :type => :string, :title => "Species" }
|
138
|
-
# ]
|
139
|
-
# }
|
140
|
-
# ]
|
141
|
-
#
|
142
|
-
# Where the unnecessary outer wrapper field +genera+ has been
|
143
|
-
# dispensed with.
|
144
|
-
#
|
145
|
-
# In addition to "has many" relationships a record can have a
|
146
|
-
# "has_one" relationship. The above data might be expressed
|
147
|
-
#
|
148
|
-
# ---
|
149
|
-
# Mandrillus:
|
150
|
-
# - species: sphinx
|
151
|
-
# name: Mandrill
|
152
|
-
# id: "113"
|
153
|
-
# discoverer:
|
154
|
-
# name: Dr. Monkeypants
|
155
|
-
# year: 1838
|
156
|
-
# - species: leucophaeus
|
157
|
-
# name: Drill
|
158
|
-
# id: "114"
|
159
|
-
# discoverer:
|
160
|
-
# name: Ms. Cecelia Apefingers
|
161
|
-
# year: 1921
|
162
|
-
#
|
163
|
-
# would result in the following Schema:
|
164
|
-
#
|
165
|
-
# [
|
166
|
-
# { :name => :genus, :title => "Genus",
|
167
|
-
# :has_many => [
|
168
|
-
# { :name => :id, :type => :integer },
|
169
|
-
# { :name => :name, :type => :string, :title => "Common Name" },
|
170
|
-
# { :name => :species, :type => :string },
|
171
|
-
# { :name => :discoverer,
|
172
|
-
# :has_one => [
|
173
|
-
# { :name => 'name', :type => :string },
|
174
|
-
# { :name => 'year', :type => :integer }
|
175
|
-
# ]
|
176
|
-
# }
|
177
|
-
# ]
|
178
|
-
# }
|
179
|
-
# ]
|
180
|
-
#
|
181
|
-
# The +discoverer+ field is marked as +has_one+ which means the
|
182
|
-
# +name+ and +year+ fields in the corresponding Array will be
|
183
|
-
# interpreted as fields in a single attached sub-record.
|
184
|
-
#
|
185
|
-
# = Compact Schemas
|
186
|
-
#
|
187
|
-
# The internal hashes in a Schema specification are really Field
|
188
|
-
# objects and the initializer will promote Strings and Symbols to
|
189
|
-
# Field objects automatically. This means that the above Schema
|
190
|
-
# specification could be replaced by
|
191
|
-
#
|
192
|
-
# [
|
193
|
-
# { :name => :genus
|
194
|
-
# :has_many => [
|
195
|
-
# :id,
|
196
|
-
# :name,
|
197
|
-
# :species,
|
198
|
-
# { :name => :discoverer,
|
199
|
-
# :has_one => [
|
200
|
-
# :name,
|
201
|
-
# :year
|
202
|
-
# ]
|
203
|
-
# }
|
204
|
-
# ]
|
205
|
-
# }
|
206
|
-
# ]
|
207
|
-
#
|
208
|
-
# though there is an accompanying loss of metadata about each
|
209
|
-
# field.
|
210
|
-
class Schema < Array
|
7
|
+
# FIXME add methods that help couple nicely with Avro schemata.
|
8
|
+
class Schema < Hash
|
211
9
|
|
212
|
-
def initialize
|
10
|
+
def initialize obj=nil
|
213
11
|
super()
|
214
|
-
|
215
|
-
end
|
216
|
-
|
217
|
-
def self.load resource
|
218
|
-
new(IMW.open(resource).load)
|
12
|
+
merge!(obj) if obj.is_a?(Hash) || obj.is_a?(Schema)
|
219
13
|
end
|
220
14
|
|
221
|
-
def [] index
|
222
|
-
[Integer, Range].include?(index.class) ? super(index) : detect { |field| field[:name].to_s == index.to_s }
|
223
|
-
end
|
224
|
-
|
225
15
|
end
|
226
16
|
end
|
227
17
|
end
|