imw 0.2.7 → 0.2.8
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +23 -0
- data/Gemfile.lock +47 -0
- data/LICENSE +20 -674
- data/README.rdoc +3 -4
- data/VERSION +1 -1
- data/lib/imw.rb +64 -35
- data/lib/imw/dataset.rb +12 -2
- data/lib/imw/formats.rb +4 -2
- data/lib/imw/formats/delimited.rb +96 -36
- data/lib/imw/formats/excel.rb +69 -101
- data/lib/imw/formats/json.rb +3 -5
- data/lib/imw/formats/pdf.rb +71 -0
- data/lib/imw/formats/yaml.rb +3 -5
- data/lib/imw/metadata.rb +66 -0
- data/lib/imw/metadata/contains_metadata.rb +44 -0
- data/lib/imw/metadata/dsl.rb +111 -0
- data/lib/imw/metadata/field.rb +65 -0
- data/lib/imw/metadata/schema.rb +227 -0
- data/lib/imw/metadata/schematized.rb +27 -0
- data/lib/imw/parsers.rb +1 -0
- data/lib/imw/parsers/flat.rb +44 -0
- data/lib/imw/resource.rb +36 -224
- data/lib/imw/schemes.rb +3 -1
- data/lib/imw/schemes/hdfs.rb +12 -1
- data/lib/imw/schemes/http.rb +1 -2
- data/lib/imw/schemes/local.rb +139 -16
- data/lib/imw/schemes/remote.rb +14 -9
- data/lib/imw/schemes/s3.rb +12 -0
- data/lib/imw/schemes/sql.rb +117 -0
- data/lib/imw/tools.rb +5 -3
- data/lib/imw/tools/downloader.rb +63 -0
- data/lib/imw/tools/summarizer.rb +21 -10
- data/lib/imw/utils.rb +10 -0
- data/lib/imw/utils/dynamically_extendable.rb +137 -0
- data/lib/imw/utils/error.rb +3 -0
- data/lib/imw/utils/extensions.rb +0 -4
- data/lib/imw/utils/extensions/array.rb +6 -7
- data/lib/imw/utils/extensions/hash.rb +3 -5
- data/lib/imw/utils/extensions/string.rb +3 -3
- data/lib/imw/utils/has_uri.rb +114 -0
- data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
- data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +1 -0
- data/spec/data/formats/none/sample +650 -0
- data/spec/data/formats/sgml/sample.xml +617 -0
- data/spec/data/formats/text/sample.txt +650 -0
- data/spec/data/formats/yaml/sample.yaml +410 -0
- data/spec/data/schema-tabular.yaml +11 -0
- data/spec/imw/formats/delimited_spec.rb +34 -2
- data/spec/imw/formats/excel_spec.rb +55 -0
- data/spec/imw/formats/json_spec.rb +3 -3
- data/spec/imw/formats/sgml_spec.rb +4 -4
- data/spec/imw/formats/yaml_spec.rb +3 -3
- data/spec/imw/metadata/field_spec.rb +26 -0
- data/spec/imw/metadata/schema_spec.rb +27 -0
- data/spec/imw/metadata_spec.rb +39 -0
- data/spec/imw/parsers/line_parser_spec.rb +1 -1
- data/spec/imw/resource_spec.rb +0 -100
- data/spec/imw/schemes/hdfs_spec.rb +19 -13
- data/spec/imw/schemes/local_spec.rb +59 -3
- data/spec/imw/schemes/s3_spec.rb +4 -0
- data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
- data/spec/imw/utils/has_uri_spec.rb +55 -0
- data/spec/spec_helper.rb +1 -2
- data/spec/support/random.rb +4 -4
- metadata +58 -17
- data/CHANGELOG +0 -0
- data/TODO +0 -18
- data/spec/data/sample.json +0 -782
- data/spec/data/sample.txt +0 -131
- data/spec/data/sample.xml +0 -653
- data/spec/data/sample.yaml +0 -651
- data/spec/spec.opts +0 -4
- data/spec/support/extensions.rb +0 -18
data/lib/imw/formats/json.rb
CHANGED
@@ -27,15 +27,13 @@ module IMW
|
|
27
27
|
load(&block)
|
28
28
|
end
|
29
29
|
|
30
|
-
#
|
30
|
+
# Emit the +data+ into this resource. It must be opened for
|
31
31
|
# writing.
|
32
32
|
#
|
33
|
-
# @param [Hash, String, Array, Fixnum] data the Ruby object to
|
34
|
-
|
35
|
-
def dump data, options={}
|
33
|
+
# @param [Hash, String, Array, Fixnum] data the Ruby object to emit
|
34
|
+
def emit data, options={}
|
36
35
|
require 'json'
|
37
36
|
write(data.to_json)
|
38
|
-
io.close unless options[:persist]
|
39
37
|
self
|
40
38
|
end
|
41
39
|
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
module IMW
|
2
|
+
module Formats
|
3
|
+
|
4
|
+
# Defines methods for parsing and generating PDF.
|
5
|
+
#
|
6
|
+
# Uses PDF::Reader for parsing and Prawn for generating.
|
7
|
+
module Pdf

  # Return a snippet of text from this PDF.
  #
  # Walks the document at +path+ with PDF::Reader, feeding its text
  # callbacks to a Snippetizer until the Snippetizer reports that it
  # has collected enough.
  #
  # @return [String] the collected text, or an empty String when a
  #   StandardError is raised while reading the document
  def snippet
    begin
      require 'pdf/reader'
      snippetizer = Snippetizer.new
      PDF::Reader.file(path, snippetizer)
      snippetizer.snippet
    rescue Snippetizer::SnippetEndError
      # Raised by the Snippetizer once it is full: this is the normal
      # way out for large documents, not a failure.
      snippetizer.snippet
    rescue
      ''
    end
  end

  # A receiver class used by PDF::Reader which agglomerates text
  # up to MAX_SIZE characters and then bails.
  class Snippetizer

    # A custom error class that is raised while receiving text
    # from PDF::Reader to cut short walking large PDF documents.
    SnippetEndError = Class.new(IMW::Error)

    # The largest number of characters a snippet may hold.
    MAX_SIZE = 1024

    # The snippet being built by this snippetizer.
    attr_accessor :snippet

    def initialize
      @snippet = ''
    end

    # Agglomerates text from PDF::Reader up to a fixed size of
    # MAX_SIZE characters.
    #
    # A string consisting of a single space is converted into a
    # newline character. Every other string is truncated so that the
    # snippet never grows past MAX_SIZE.
    #
    # Arguments that are not Strings are skipped and nested Arrays
    # are flattened, so that callbacks which pass spacing numbers or
    # an Array of fragments still contribute their text instead of
    # raising.
    # NOTE(review): which callbacks pass Arrays/numbers depends on
    # the PDF::Reader version in use -- confirm.
    #
    # FIXME How does the receiver ask PDF::Reader to abort walking
    # the document now that enough text has been returned?  Till a
    # more graceful way is found this method simply raises an
    # error, creating a GOTO...
    #
    # @raise [SnippetEndError] when text arrives after the snippet
    #   is already full
    def show_text *params
      params.flatten.each do |string|
        next unless string.is_a?(String)
        raise SnippetEndError.new if @snippet.size >= MAX_SIZE
        if string == ' '
          @snippet += "\n"
        else
          # Only take as much as still fits under the limit.
          @snippet += string[0, MAX_SIZE - @snippet.size]
        end
      end
    end
    alias_method :show_text_with_positioning, :show_text
    alias_method :move_to_next_line_and_show_text, :show_text
    alias_method :set_spacing_next_line_show_text, :show_text
  end

end
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
data/lib/imw/formats/yaml.rb
CHANGED
@@ -27,15 +27,13 @@ module IMW
|
|
27
27
|
load(&block)
|
28
28
|
end
|
29
29
|
|
30
|
-
#
|
30
|
+
# Emit the +data+ into this resource. It must be opened for
|
31
31
|
# writing.
|
32
32
|
#
|
33
|
-
# @param [Hash, String, Array, Fixnum] data the Ruby object to
|
34
|
-
|
35
|
-
def dump data, options={}
|
33
|
+
# @param [Hash, String, Array, Fixnum] data the Ruby object to emit
|
34
|
+
def emit data, options={}
|
36
35
|
require 'yaml'
|
37
36
|
write(data.to_yaml)
|
38
|
-
io.close unless options[:persist]
|
39
37
|
self
|
40
38
|
end
|
41
39
|
end
|
data/lib/imw/metadata.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
module IMW
|
2
|
+
|
3
|
+
# A collection of classes for describing the metadata associated
|
4
|
+
# with a dataset's fields.
|
5
|
+
class Metadata < Hash

  autoload :Field,            'imw/metadata/field'
  autoload :Schema,           'imw/metadata/schema'
  autoload :Schematized,      'imw/metadata/schematized'
  autoload :DSL,              'imw/metadata/dsl'
  autoload :ContainsMetadata, 'imw/metadata/contains_metadata'

  # The resource this Metadata is anchored to.
  #
  # This attribute is useful for letting relative paths in a
  # metadata file refer to a common base URL.
  #
  # @return [IMW::Resource]
  attr_reader :base

  # Set the resource this Metadata is anchored to.
  #
  # @param [IMW::Resource, String, Addressable::URI] new_base
  # @raise [IMW::PathError] if the base is not a directory
  def base= new_base
    base_resource = IMW.open(new_base)
    base_resource.should_exist!("Metdata base directory must exist")
    raise IMW::PathError.new("Metadata base must be a directory") unless base_resource.is_directory?
    @base = base_resource
  end

  # Build a Metadata from a Hash-like +obj+ mapping resource
  # specifications to schema specifications.
  #
  # @param [#each_pair, nil] obj pairs of resource and schema
  # @param [Hash] options
  # @option options [IMW::Resource, String, Addressable::URI] :base
  #   the directory relative resource paths are resolved against
  def initialize obj=nil, options={}
    super()
    # The base has to be in place before any key is stored, because
    # keys are made absolute as they go in.
    self.base = options[:base] if options[:base]
    obj.each_pair { |resource, schema| self[resource] = Schema.new(schema) } if obj
  end

  # Load Metadata from the resource at +metadata_resource+.
  #
  # The directory containing the resource becomes the base unless
  # +options+ names another one.
  #
  # @param [IMW::Resource, String, Addressable::URI] metadata_resource
  # @param [Hash] options passed on to +new+; may be omitted
  # @return [IMW::Metadata]
  def self.load metadata_resource, options={}
    resource = IMW.open(metadata_resource)
    new(resource.load, {:base => resource.dirname}.merge(options))
  end

  # Store the schema for a resource, keyed by the resource's
  # absolute URI.
  #
  # Anything that is not already a Schema is promoted to one, so
  # that values read back out are always Schema objects.
  #
  # @param [String, #to_s] resource_spec
  # @param [IMW::Metadata::Schema, Array] schema_spec
  def []= resource_spec, schema_spec
    schema = schema_spec.is_a?(Schema) ? schema_spec : Schema.new(schema_spec)
    super(absolute_uri(resource_spec), schema)
  end

  # Return the schema stored for a resource.
  #
  # @param [String, #to_s] resource_spec
  # @return [IMW::Metadata::Schema, nil]
  def [] resource_spec
    super(absolute_uri(resource_spec))
  end

  # Is there a schema stored for this resource?
  #
  # @param [String, #to_s] resource_spec
  # @return [true, false]
  def describe? resource_spec
    has_key?(absolute_uri(resource_spec))
  end

  protected

  # Turn +resource_spec+ into the String used as a key.
  #
  # A spec is treated as relative, and joined onto the base, when a
  # base is set and the spec neither starts with "/" nor contains
  # "://".  Otherwise it is used as given.
  def absolute_uri resource_spec
    if base && resource_spec.to_s !~ %r{(^/|://)} # relative path
      base.join(resource_spec).to_s
    else
      resource_spec.to_s
    end
  end

end
|
66
|
+
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module IMW
|
2
|
+
class Metadata
|
3
|
+
|
4
|
+
# A module that can be mixed into any class defining a +contents+
|
5
|
+
# method which returns an Array of URI strings.
|
6
|
+
module ContainsMetadata

  # The path at which this resource's metadata file lives.
  #
  # Defaults to the first entry of this resource's +contents+ whose
  # path contains +metadata+ and ends with a +yaml+, +yml+ or +json+
  # extension.
  #
  # @return [String, nil]
  def metadata_uri
    @metadata_uri ||= contents.detect { |path| path =~ /metadata.*\.(ya?ml|json)$/ }
  end

  # Explicitly set the path to the metadata for this resource.
  attr_writer :metadata_uri

  # Does this resource contain metadata for other resources it
  # contains?
  #
  # @return [true, false]
  def metadata?
    (!! metadata_uri)
  end

  # Return the metadata for this resource.
  #
  # The metadata is loaded from +metadata_uri+ on first use and kept
  # for later calls.
  #
  # @return [IMW::Metadata, nil] +nil+ when there is no metadata file
  def metadata
    return @metadata if @metadata
    return nil unless metadata?
    # The options Hash is passed explicitly because
    # IMW::Metadata.load is declared with two parameters.
    @metadata = IMW::Metadata.load(metadata_uri, {})
  end

  # Explicitly set the metadata for this resource.
  attr_writer :metadata

end
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
|
@@ -0,0 +1,111 @@
|
|
1
|
+
module IMW
|
2
|
+
class Metadata
|
3
|
+
|
4
|
+
# A module which defines a DSL that can be used to define metadata
|
5
|
+
# for an object.
|
6
|
+
module DSL

  # Open the resource at +uri+ through IMW.open, handing it the
  # schema this object's metadata holds for it.
  #
  # The schema is only looked up when +options+ has no
  # <tt>:schema</tt> of its own and the metadata describes +uri+; an
  # explicit <tt>:schema</tt> option always wins.
  #
  # @param [String, Addressable::URI, IMW::Resource] uri
  # @param [Hash] options
  # @return [IMW::Resource]
  # @see IMW.open
  def open uri, options={}, &block
    inferred = {}
    if options[:schema].nil? && metadata && metadata.describe?(uri)
      inferred = {:schema => metadata[uri]}
    end
    IMW.open(uri, options.merge(inferred), &block)
  end

  # Like +open+ but with the <tt>:mode</tt> option forced to 'w'.
  #
  # @param [String, Addressable::URI, IMW::Resource] uri
  # @param [Hash] options
  # @see #open
  def open! uri, options={}, &block
    writable = options.merge(:mode => 'w')
    self.open(uri, writable, &block)
  end

  # Return this object's metadata, creating it on first use.
  #
  # On first use the metadata is built from +arg+: a Hash or +nil+
  # goes to Metadata.new, anything else to Metadata.load.  Later
  # calls reuse the existing metadata and ignore +arg+.
  #
  #   metadata
  #   #=> { '/path/to/file' => [...], '/path/to/other/file' => [...], ... }
  #
  # A <tt>:base</tt> option is applied to the metadata on every call.
  #
  # When a block is given it is run, typically to accumulate schemas
  # and fields, and its value is returned instead of the metadata:
  #
  #   metadata do
  #
  #     schema "/path/to/file" do
  #       # ...
  #     end
  #
  #     schema "/path/to/other/file" do
  #       # ...
  #     end
  #   end
  #
  # @see IMW::Metadata::Schema
  # @see IMW::Metadata::Field
  # @return [IMW::Metadata] when called without a block
  def metadata arg=nil, options={}, &block
    @metadata ||= case arg
                  when Hash, nil then Metadata.new(arg, options)
                  else                Metadata.load(arg, options)
                  end
    @metadata.base = options[:base] if options[:base]
    block_given? ? yield : @metadata
  end

  # Record the fields declared inside the block as the schema of
  # +resource+ in this object's metadata.
  def schema resource, options={}, &block
    new_field_accumulator!
    yield
    declared = last_field_accumulator!
    metadata[resource] = Schema.new(declared)
  end

  # Declare a field called +name+ in the enclosing schema or
  # sub-record.  Outside of one the declaration is dropped.
  def field name, options={}
    properties = options.merge(:name => name)
    accumulate_field Field.new(properties)
  end

  # Declare a field called +name+ whose <tt>:has_one</tt> property
  # holds the fields declared inside the block.
  def has_one name, options={}, &block
    new_field_accumulator!
    yield
    nested = last_field_accumulator!
    accumulate_field Field.new(options.merge(:name => name, :has_one => nested))
  end

  # Declare a field called +name+ whose <tt>:has_many</tt> property
  # holds the fields declared inside the block.
  def has_many name, options={}, &block
    new_field_accumulator!
    yield
    nested = last_field_accumulator!
    accumulate_field Field.new(options.merge(:name => name, :has_many => nested))
  end

  protected

  # The stack of field lists, one per open schema or sub-record.
  def field_accumulators # :nodoc:
    @field_accumulators ||= []
  end

  # Start collecting fields for a new schema or sub-record.
  def new_field_accumulator! # :nodoc:
    field_accumulators.push([])
  end

  # Stop collecting for the innermost schema or sub-record and
  # return its fields.
  def last_field_accumulator! # :nodoc:
    field_accumulators.pop
  end

  # Is a schema or sub-record currently being collected?
  def field_accumulator? # :nodoc:
    field_accumulators.size > 0
  end

  # Add +f+ to the innermost list of fields, if there is one.
  def accumulate_field f # :nodoc:
    # raise IMW::SchemaError.new("No record or sub-record to accumulate fields in!") unless field_accumulator?
    return unless field_accumulator?
    field_accumulators.last << f
  end
end
|
110
|
+
end
|
111
|
+
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
module IMW
|
2
|
+
|
3
|
+
class Metadata
|
4
|
+
|
5
|
+
# Conceptually, a field is a "slot" for which "records" can have
|
6
|
+
# values.
|
7
|
+
#
|
8
|
+
# An IMW::Metadata::Field is essentially a Hash that has one required
|
9
|
+
# property: a name.
|
10
|
+
#
|
11
|
+
# IMW::Metadata::Field.new('id')
|
12
|
+
# #=> { 'name' => 'id' }
|
13
|
+
#
|
14
|
+
# But you can declare as many other properties as you want (as long
|
15
|
+
# as you include a +name+):
|
16
|
+
#
|
17
|
+
# IMW::Metadata::Field.new 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented."
|
18
|
+
# #=> { 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented." }
|
19
|
+
#
|
20
|
+
# Some properties make a field special:
|
21
|
+
#
|
22
|
+
# <tt>has_many</tt>::
|
23
|
+
# Denotes that this record is in a "has_many" relationship with
|
24
|
+
# one or more other records. The corresponding value should be
|
25
|
+
# an array
|
26
|
+
#
|
27
|
+
# <tt>has_one</tt>::
|
28
|
+
# Denotes that this record is in a "has_one" relationship with
|
29
|
+
# one or more other records. The corresponding value should be
|
30
|
+
# an Array in which each key names the joined record and each
|
31
|
+
# value is an Array of fields describing the joined record.
|
32
|
+
#
|
33
|
+
# @see IMW::Metadata::Schema for more usage of the
|
34
|
+
# <tt>:has_many</tt> and <tt>:has_one</tt> properties.
|
35
|
+
class Field < Hash

  # Build a field from +obj+.
  #
  # A Hash (or another Field) is copied in as is and must carry a
  # non-blank 'name'.  Anything else is turned into a String,
  # stripped, and used as the field's 'name'.
  #
  # @raise [IMW::ArgumentError] if a Hash is given without a name
  def initialize obj
    super()
    case obj
    when Hash, Field
      merge!(obj)
      raise IMW::ArgumentError.new("A field must have a name") if obj['name'].blank?
    else
      self['name'] = obj.to_s.strip
    end
  end

  # Does this field carry a 'has_many' or 'has_one' property?
  #
  # @return [true, false]
  def hierarchical?
    return true if has_key?('has_many')
    has_key?('has_one')
  end
  alias_method :nested?, :hierarchical?

  # Is this field free of 'has_many' and 'has_one' properties?
  #
  # @return [true, false]
  def flat?
    hierarchical? ? false : true
  end

  # A title for this field: its 'title' property when set, else its
  # capitalized 'name'.
  def titleize
    title = self['title']
    return title if title
    self['name'].capitalize # FIXME we can do better than this!
  end

  # Not implemented yet; returns +nil+.
  def associations
  end

end
|
64
|
+
end
|
65
|
+
end
|
@@ -0,0 +1,227 @@
|
|
1
|
+
module IMW
|
2
|
+
class Metadata
|
3
|
+
|
4
|
+
# A class to describe the schema of a resource.
|
5
|
+
#
|
6
|
+
# A Schema is built on top of an Array because it is often
|
7
|
+
# important to have an ordering for a record's fields.
|
8
|
+
#
|
9
|
+
# For fields with no such ordering, an Array also works because
|
10
|
+
# each of its elements will be a field with a +name+ that can be
|
11
|
+
# used to index the corresponding field.
|
12
|
+
#
|
13
|
+
# A Schema is instantiated with a basic Ruby data structure.
|
14
|
+
#
|
15
|
+
# == Tabular Data
|
16
|
+
#
|
17
|
+
# Tabular data formats (CSV, TSV, &c.) contain flat records
|
18
|
+
# consisting of repeating rows with the same fields in the same
|
19
|
+
# position. A sample of delimited data looks like
|
20
|
+
#
|
21
|
+
# ID,Name,Genus,Species
|
22
|
+
# 001,Gray-bellied Night Monkey,Aotus,lemurinus
|
23
|
+
# 002,Panamanian Night Monkey,Aotus,zonalis
|
24
|
+
# 003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
|
25
|
+
# 004,Gray-handed Night Monkey,Aotus,griseimembra
|
26
|
+
# 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
|
27
|
+
# ...
|
28
|
+
#
|
29
|
+
# The schema of these records is summarized as a Ruby data
|
30
|
+
# structure in the following way
|
31
|
+
#
|
32
|
+
# [
|
33
|
+
# { :name => :id, :type => :integer },
|
34
|
+
# { :name => :name, :type => :string, :title => "Common Name" },
|
35
|
+
# { :name => :genus, :type => :string, :title => "Genus" },
|
36
|
+
# { :name => :species, :type => :string, :title => "Species" }
|
37
|
+
# ]
|
38
|
+
#
|
39
|
+
# The outer-most Array represents each row and each Hash in the
|
40
|
+
# Array represents one of the fields in a row. A Schema
|
41
|
+
# initialized with the above Ruby code can be thought of and
|
42
|
+
# played with as an Array of Hashes even though it really is a
|
43
|
+
# Schema object of Field objects.
|
44
|
+
#
|
45
|
+
# == Hierarchical Data
|
46
|
+
#
|
47
|
+
# Hierarchical data formats (JSON, YAML, XML, &c.) can have
|
48
|
+
# arbitrarily complex records with fields within fields and so on.
|
49
|
+
# A sample of hierarchical XML data looks like
|
50
|
+
#
|
51
|
+
# <genera>
|
52
|
+
# <genus>
|
53
|
+
# <name>Mandrillus</name>
|
54
|
+
# <species>
|
55
|
+
# <species id="113">
|
56
|
+
# <name>sphinx</name>
|
57
|
+
# <common_name>Mandrill</common_name>
|
58
|
+
# </species>
|
59
|
+
# <species id="114">
|
60
|
+
# <name>leucophaeus</name>
|
61
|
+
# <common_name>Drill</common_name>
|
62
|
+
# </species>
|
63
|
+
# </species>
|
64
|
+
# </genus>
|
65
|
+
# <genus>
|
66
|
+
# <name>Rungwecebus</name>
|
67
|
+
# <species>
|
68
|
+
# <species id="100">
|
69
|
+
# <name>kipunji</name>
|
70
|
+
# <common_name>Kipunji</common_name>
|
71
|
+
# </species>
|
72
|
+
# </species>
|
73
|
+
# </genus>
|
74
|
+
#
|
75
|
+
# These records are described by the following Ruby data structure
|
76
|
+
#
|
77
|
+
# [
|
78
|
+
# { :name => :genera,
|
79
|
+
# :has_many => [
|
80
|
+
# { :name => 'name', :type => :string, :title => "Genus" },
|
81
|
+
# { :name => 'species',
|
82
|
+
# :has_many => [
|
83
|
+
# { :name => :id, :type => :integer },
|
84
|
+
# { :name => :name, :type => :string, :title => "Species" },
|
85
|
+
# { :name => :common_name, :type => :string, :title => "Common Name" }
|
86
|
+
# ]
|
87
|
+
# }
|
88
|
+
# ]
|
89
|
+
# }
|
90
|
+
# ]
|
91
|
+
#
|
92
|
+
# By IMW convention, the outer-most element of the Schema is still
|
93
|
+
# an Array describing a collection of identical records even
|
94
|
+
# though XML data must have a single root node, limiting the
|
95
|
+
# collection to a single record.
|
96
|
+
#
|
97
|
+
# The first field of the Schema is named +genera+ and it uses the
|
98
|
+
# special field property +has_many+ to denote that the field
|
99
|
+
# points to a collection of sub-records.
|
100
|
+
#
|
101
|
+
# Each of these sub-records has its own sub-schema defined by the
|
102
|
+
# Array that the +has_many+ property keys to. In this case, the
|
103
|
+
# two fields are +name+ and +species+. +name+ is a simple String
|
104
|
+
# value while +species+ itself points at another collection of
|
105
|
+
# objects.
|
106
|
+
#
|
107
|
+
# This second-level nested record (a particular species) is itself
|
108
|
+
# composed of the three (flat) fields +id+, +name+, and
|
109
|
+
# +common_name+. Note that the Schema doesn't know (or care) that
|
110
|
+
# the +id+ field is contained in an XML attribute while the +name+
|
111
|
+
# and +common_name+ fields are contained as text within daughter
|
112
|
+
# nodes.
|
113
|
+
#
|
114
|
+
# A different way of structuring the same information, this time
|
115
|
+
# expressed in YAML:
|
116
|
+
#
|
117
|
+
# ---
|
118
|
+
# Mandrillus:
|
119
|
+
# - :species: sphinx
|
120
|
+
# :name: Mandrill
|
121
|
+
# :id: "113"
|
122
|
+
# - :species: leucophaeus
|
123
|
+
# :name: Drill
|
124
|
+
# :id: "114"
|
125
|
+
# Rungwecebus:
|
126
|
+
# - :species: kipunji
|
127
|
+
# :name: Kipunji
|
128
|
+
# :id: "100"
|
129
|
+
#
|
130
|
+
# Would lead to a different Schema
|
131
|
+
#
|
132
|
+
# [
|
133
|
+
# { :name => :genus, :title => "Genus",
|
134
|
+
# :has_many => [
|
135
|
+
# { :name => :id, :type => :integer },
|
136
|
+
# { :name => :name, :type => :string, :title => "Common Name" },
|
137
|
+
# { :name => :species, :type => :string, :title => "Species" }
|
138
|
+
# ]
|
139
|
+
# }
|
140
|
+
# ]
|
141
|
+
#
|
142
|
+
# Where the unnecessary outer wrapper field +genera+ has been
|
143
|
+
# dispensed with.
|
144
|
+
#
|
145
|
+
# In addition to "has many" relationships a record can have a
|
146
|
+
# "has_one" relationship. The above data might be expressed
|
147
|
+
#
|
148
|
+
# ---
|
149
|
+
# Mandrillus:
|
150
|
+
# - species: sphinx
|
151
|
+
# name: Mandrill
|
152
|
+
# id: "113"
|
153
|
+
# discoverer:
|
154
|
+
# name: Dr. Monkeypants
|
155
|
+
# year: 1838
|
156
|
+
# - species: leucophaeus
|
157
|
+
# name: Drill
|
158
|
+
# id: "114"
|
159
|
+
# discoverer:
|
160
|
+
# name: Ms. Cecelia Apefingers
|
161
|
+
# year: 1921
|
162
|
+
#
|
163
|
+
# would result in the following Schema:
|
164
|
+
#
|
165
|
+
# [
|
166
|
+
# { :name => :genus, :title => "Genus",
|
167
|
+
# :has_many => [
|
168
|
+
# { :name => :id, :type => :integer },
|
169
|
+
# { :name => :name, :type => :string, :title => "Common Name" },
|
170
|
+
# { :name => :species, :type => :string },
|
171
|
+
# { :name => :discoverer,
|
172
|
+
# :has_one => [
|
173
|
+
# { :name => 'name', :type => :string },
|
174
|
+
# { :name => 'year', :type => :integer }
|
175
|
+
# ]
|
176
|
+
# }
|
177
|
+
# ]
|
178
|
+
# }
|
179
|
+
# ]
|
180
|
+
#
|
181
|
+
# The +discoverer+ field is marked as +has_one+ which means the
|
182
|
+
# +name+ and +year+ fields in the corresponding Array will be
|
183
|
+
# interpreted as fields in a single attached sub-record.
|
184
|
+
#
|
185
|
+
# == Compact Schemas
|
186
|
+
#
|
187
|
+
# The internal hashes in a Schema specification are really Field
|
188
|
+
# objects and the initializer will promote Strings and Symbols to
|
189
|
+
# Field objects automatically. This means that the above Schema
|
190
|
+
# specification could be replaced by
|
191
|
+
#
|
192
|
+
# [
|
193
|
+
# { :name => :genus,
|
194
|
+
# :has_many => [
|
195
|
+
# :id,
|
196
|
+
# :name,
|
197
|
+
# :species,
|
198
|
+
# { :name => :discoverer,
|
199
|
+
# :has_one => [
|
200
|
+
# :name,
|
201
|
+
# :year
|
202
|
+
# ]
|
203
|
+
# }
|
204
|
+
# ]
|
205
|
+
# }
|
206
|
+
# ]
|
207
|
+
#
|
208
|
+
# though there is an accompanying loss of metadata about each
|
209
|
+
# field.
|
210
|
+
class Schema < Array

  # Build a Schema, promoting every element of +input+ to an
  # IMW::Metadata::Field.
  #
  # @param [Array, nil] input field specifications (Hashes, Strings,
  #   Symbols or Fields)
  def initialize input=nil
    super()
    concat(input.map { |field| IMW::Metadata::Field.new(field) }) if input
  end

  # Build a Schema from the data loaded from +resource+.
  #
  # @param [IMW::Resource, String, Addressable::URI] resource
  # @return [IMW::Metadata::Schema]
  def self.load resource
    new(IMW.open(resource).load)
  end

  # Look up fields by position or by name.
  #
  # An Integer or Range +index+, or a start and length pair, is
  # handled exactly as Array#[] handles it.  Any other +index+ is
  # treated as a field name and the first field with that name is
  # returned.
  #
  # The name is read from the field's 'name' key, which is where
  # Field stores the name of a promoted String or Symbol, and then
  # from its :name key.
  #
  # @param [Integer, Range, String, Symbol] index
  # @return [IMW::Metadata::Field, Array, nil]
  def [] index, *rest
    return super(index, *rest) unless rest.empty?
    # is_a? rather than an exact class comparison so that Integer
    # subclasses (Fixnum and Bignum on older Rubies) index by position.
    return super(index) if index.is_a?(Integer) || index.is_a?(Range)
    wanted = index.to_s
    detect do |field|
      name = field['name']
      name = field[:name] if name.nil?
      name.to_s == wanted
    end
  end

end
|
226
|
+
end
|
227
|
+
end
|