imw 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +23 -0
- data/Gemfile.lock +47 -0
- data/LICENSE +20 -674
- data/README.rdoc +3 -4
- data/VERSION +1 -1
- data/lib/imw.rb +64 -35
- data/lib/imw/dataset.rb +12 -2
- data/lib/imw/formats.rb +4 -2
- data/lib/imw/formats/delimited.rb +96 -36
- data/lib/imw/formats/excel.rb +69 -101
- data/lib/imw/formats/json.rb +3 -5
- data/lib/imw/formats/pdf.rb +71 -0
- data/lib/imw/formats/yaml.rb +3 -5
- data/lib/imw/metadata.rb +66 -0
- data/lib/imw/metadata/contains_metadata.rb +44 -0
- data/lib/imw/metadata/dsl.rb +111 -0
- data/lib/imw/metadata/field.rb +65 -0
- data/lib/imw/metadata/schema.rb +227 -0
- data/lib/imw/metadata/schematized.rb +27 -0
- data/lib/imw/parsers.rb +1 -0
- data/lib/imw/parsers/flat.rb +44 -0
- data/lib/imw/resource.rb +36 -224
- data/lib/imw/schemes.rb +3 -1
- data/lib/imw/schemes/hdfs.rb +12 -1
- data/lib/imw/schemes/http.rb +1 -2
- data/lib/imw/schemes/local.rb +139 -16
- data/lib/imw/schemes/remote.rb +14 -9
- data/lib/imw/schemes/s3.rb +12 -0
- data/lib/imw/schemes/sql.rb +117 -0
- data/lib/imw/tools.rb +5 -3
- data/lib/imw/tools/downloader.rb +63 -0
- data/lib/imw/tools/summarizer.rb +21 -10
- data/lib/imw/utils.rb +10 -0
- data/lib/imw/utils/dynamically_extendable.rb +137 -0
- data/lib/imw/utils/error.rb +3 -0
- data/lib/imw/utils/extensions.rb +0 -4
- data/lib/imw/utils/extensions/array.rb +6 -7
- data/lib/imw/utils/extensions/hash.rb +3 -5
- data/lib/imw/utils/extensions/string.rb +3 -3
- data/lib/imw/utils/has_uri.rb +114 -0
- data/spec/data/{sample.csv → formats/delimited/sample.csv} +1 -1
- data/spec/data/{sample.tsv → formats/delimited/sample.tsv} +0 -0
- data/spec/data/formats/delimited/with_schema/ace-hardware-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/all-countries-ip-address-to-geolocation-data.tsv +16 -0
- data/spec/data/formats/delimited/with_schema/complete-list-of-starbucks-locations.tsv +11 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-myspace-application-counts.tsv +12 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +13 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +22 -0
- data/spec/data/formats/delimited/with_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +22 -0
- data/spec/data/formats/delimited/without_schema/ace-hardware-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/all-countries-ip-address-to-geolocation-data.tsv +15 -0
- data/spec/data/formats/delimited/without_schema/complete-list-of-starbucks-locations.tsv +10 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-cumulative-word-count-from-from-dec.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-adds-by-zip-cod.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-myspace-application-counts.tsv +11 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-latlong.tsv +12 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-user-count-by-zip-code.tsv +21 -0
- data/spec/data/formats/delimited/without_schema/myspace-user-activity-stream-word-count-by-day-from-december-200.tsv +21 -0
- data/spec/data/formats/excel/sample.xls +0 -0
- data/spec/data/formats/json/sample.json +1 -0
- data/spec/data/formats/none/sample +650 -0
- data/spec/data/formats/sgml/sample.xml +617 -0
- data/spec/data/formats/text/sample.txt +650 -0
- data/spec/data/formats/yaml/sample.yaml +410 -0
- data/spec/data/schema-tabular.yaml +11 -0
- data/spec/imw/formats/delimited_spec.rb +34 -2
- data/spec/imw/formats/excel_spec.rb +55 -0
- data/spec/imw/formats/json_spec.rb +3 -3
- data/spec/imw/formats/sgml_spec.rb +4 -4
- data/spec/imw/formats/yaml_spec.rb +3 -3
- data/spec/imw/metadata/field_spec.rb +26 -0
- data/spec/imw/metadata/schema_spec.rb +27 -0
- data/spec/imw/metadata_spec.rb +39 -0
- data/spec/imw/parsers/line_parser_spec.rb +1 -1
- data/spec/imw/resource_spec.rb +0 -100
- data/spec/imw/schemes/hdfs_spec.rb +19 -13
- data/spec/imw/schemes/local_spec.rb +59 -3
- data/spec/imw/schemes/s3_spec.rb +4 -0
- data/spec/imw/utils/dynamically_extendable_spec.rb +69 -0
- data/spec/imw/utils/has_uri_spec.rb +55 -0
- data/spec/spec_helper.rb +1 -2
- data/spec/support/random.rb +4 -4
- metadata +58 -17
- data/CHANGELOG +0 -0
- data/TODO +0 -18
- data/spec/data/sample.json +0 -782
- data/spec/data/sample.txt +0 -131
- data/spec/data/sample.xml +0 -653
- data/spec/data/sample.yaml +0 -651
- data/spec/spec.opts +0 -4
- data/spec/support/extensions.rb +0 -18
data/lib/imw/formats/json.rb
CHANGED
|
@@ -27,15 +27,13 @@ module IMW
|
|
|
27
27
|
load(&block)
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
#
|
|
30
|
+
# Emit the +data+ into this resource. It must be opened for
|
|
31
31
|
# writing.
|
|
32
32
|
#
|
|
33
|
-
# @param [Hash, String, Array, Fixnum] data the Ruby object to
|
|
34
|
-
|
|
35
|
-
def dump data, options={}
|
|
33
|
+
# @param [Hash, String, Array, Fixnum] data the Ruby object to emit
|
|
34
|
+
def emit data, options={}
|
|
36
35
|
require 'json'
|
|
37
36
|
write(data.to_json)
|
|
38
|
-
io.close unless options[:persist]
|
|
39
37
|
self
|
|
40
38
|
end
|
|
41
39
|
end
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
module Formats
|
|
3
|
+
|
|
4
|
+
# Defines methods for parsing and generating PDF.
|
|
5
|
+
#
|
|
6
|
+
# Uses PDF::Reader for parsing and Prawn for generating.
|
|
7
|
+
module Pdf
|
|
8
|
+
|
|
9
|
+
# Return a snippet of text from this PDF.
|
|
10
|
+
#
|
|
11
|
+
# @return [String]
|
|
12
|
+
# Return a snippet of text from this PDF.
#
# Walks the document at +path+ with PDF::Reader, feeding a
# Snippetizer receiver.  The receiver raises SnippetEndError once
# it has collected enough text; that is treated as normal
# completion.  Any other StandardError yields an empty snippet.
#
# @return [String]
def snippet
  require 'pdf/reader'
  receiver = Snippetizer.new
  PDF::Reader.file(path, receiver)
  receiver.snippet
rescue Snippetizer::SnippetEndError
  # The receiver bailed out on purpose: hand back what it gathered.
  receiver.snippet
rescue
  ''
end
|
|
24
|
+
|
|
25
|
+
# A receiver class used by PDF::Reader which agglomerates text
|
|
26
|
+
# up to 1024 bytes and then bails.
|
|
27
|
+
class Snippetizer
|
|
28
|
+
|
|
29
|
+
# A custom error class that can be thrown while receiving text
|
|
30
|
+
# from PDF::Reader to cut-short walking large PDF documents.
|
|
31
|
+
SnippetEndError = Class.new(IMW::Error)
|
|
32
|
+
|
|
33
|
+
# The snippet being built by this snippetizer.
|
|
34
|
+
attr_accessor :snippet
|
|
35
|
+
|
|
36
|
+
# Start with an empty snippet buffer.
def initialize
  @snippet = ''.dup
end
|
|
39
|
+
|
|
40
|
+
# Agglomerates text from PDF::Reader up to a fixed size of
|
|
41
|
+
# 1024 bytes.
|
|
42
|
+
#
|
|
43
|
+
# Will convert a single-space line from PDF::Reader as a
|
|
44
|
+
# newline character.
|
|
45
|
+
#
|
|
46
|
+
# FIXME How does the receiver ask PDF::Reader to abort walking
|
|
47
|
+
# the document now that enough text has been returned? Till a
|
|
48
|
+
# more graceful way is found this method simply raises an
|
|
49
|
+
# error, creating a GOTO...
|
|
50
|
+
# Receives text operands from PDF::Reader and appends them to the
# snippet, never letting the snippet grow past 1024 characters.
#
# A single-space string is recorded as a newline.
#
# This method is also installed under other callback names (see the
# aliases that follow it), and those callbacks can deliver Arrays
# and numeric operands alongside strings (see the PDF::Reader
# callback documentation).  Arguments are therefore flattened and
# anything that is not a String is ignored rather than being
# appended (which would raise a TypeError).
#
# @param [Array] params operands handed over by PDF::Reader
# @raise [SnippetEndError] when called after the snippet is full;
#   used by the caller as a signal to stop walking the document
def show_text *params
  params.flatten.each do |string|
    raise SnippetEndError.new if @snippet.size >= 1024
    next unless string.is_a?(String)
    text = (string == ' ') ? "\n" : string
    # Only take what still fits so the cap is honoured exactly.
    @snippet += text[0, 1024 - @snippet.size]
  end
end
|
|
63
|
+
alias_method :show_text_with_positioning, :show_text
|
|
64
|
+
alias_method :move_to_next_line_and_show_text, :show_text
|
|
65
|
+
alias_method :set_spacing_next_line_show_text, :show_text
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
data/lib/imw/formats/yaml.rb
CHANGED
|
@@ -27,15 +27,13 @@ module IMW
|
|
|
27
27
|
load(&block)
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
#
|
|
30
|
+
# Emit the +data+ into this resource. It must be opened for
|
|
31
31
|
# writing.
|
|
32
32
|
#
|
|
33
|
-
# @param [Hash, String, Array, Fixnum] data the Ruby object to
|
|
34
|
-
|
|
35
|
-
def dump data, options={}
|
|
33
|
+
# @param [Hash, String, Array, Fixnum] data the Ruby object to emit
|
|
34
|
+
def emit data, options={}
|
|
36
35
|
require 'yaml'
|
|
37
36
|
write(data.to_yaml)
|
|
38
|
-
io.close unless options[:persist]
|
|
39
37
|
self
|
|
40
38
|
end
|
|
41
39
|
end
|
data/lib/imw/metadata.rb
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
|
|
3
|
+
# A collection of classes for describing the metadata associated
|
|
4
|
+
# with a dataset's fields.
|
|
5
|
+
class Metadata < Hash

  autoload :Field,            'imw/metadata/field'
  autoload :Schema,           'imw/metadata/schema'
  autoload :Schematized,      'imw/metadata/schematized'
  autoload :DSL,              'imw/metadata/dsl'
  autoload :ContainsMetadata, 'imw/metadata/contains_metadata'

  # The resource this Metadata is anchored to.
  #
  # This attribute is useful for letting relative paths in a
  # metadata file refer to a common base URL.
  #
  # @return [IMW::Resource]
  attr_reader :base

  # Set the resource this Metadata is anchored to.
  #
  # The argument is opened with IMW.open; it must exist and be a
  # directory.
  #
  # @param [IMW::Resource, String, Addressable::URI] new_base
  # @raise [IMW::PathError] if the base is not a directory
  def base= new_base
    base_resource = IMW.open(new_base)
    base_resource.should_exist!("Metadata base directory must exist")
    raise IMW::PathError.new("Metadata base must be a directory") unless base_resource.is_directory?
    @base = base_resource
  end

  # Build a Metadata, optionally seeded from a Hash-like object
  # mapping resource specifications to schema specifications.
  #
  # @param [#each_pair, nil] obj resource => schema pairs
  # @param [Hash] options
  # @option options [IMW::Resource, String, Addressable::URI] :base
  def initialize obj=nil, options={}
    super()
    # The base has to be in place first so that relative keys in
    # +obj+ are resolved against it.
    self.base = options[:base] if options[:base]
    obj.each_pair { |resource, schema| self[resource] = Schema.new(schema) } if obj
  end

  # Load metadata from the resource at +metadata_resource+.
  #
  # The base defaults to the directory of that resource and can be
  # overridden through +options+.
  #
  # @param [IMW::Resource, String, Addressable::URI] metadata_resource
  # @param [Hash] options passed on to +new+ (optional)
  # @return [IMW::Metadata]
  def self.load metadata_resource, options={}
    resource = IMW.open(metadata_resource)
    new(resource.load, {:base => resource.dirname}.merge(options))
  end

  # Store the schema describing +resource_spec+.
  #
  # Anything that is not already a Schema is promoted to one, so
  # that values read back are always Schema objects.
  #
  # @param [#to_s] resource_spec
  # @param [IMW::Metadata::Schema, Array] schema_spec
  def []= resource_spec, schema_spec
    schema = schema_spec.is_a?(Schema) ? schema_spec : Schema.new(schema_spec)
    super(absolute_uri(resource_spec), schema)
  end

  # Return the schema stored for +resource_spec+, if any.
  #
  # @param [#to_s] resource_spec
  # @return [IMW::Metadata::Schema, nil]
  def [] resource_spec
    super(absolute_uri(resource_spec))
  end

  # Does this metadata describe +resource_spec+?
  #
  # @param [#to_s] resource_spec
  # @return [true, false]
  def describe? resource_spec
    has_key?(absolute_uri(resource_spec))
  end

  protected

  # Turn +resource_spec+ into the String used as the Hash key.
  #
  # With a base set, a spec that neither starts with "/" nor
  # contains "://" is treated as relative and joined onto the base.
  def absolute_uri resource_spec
    if base && resource_spec.to_s !~ %r{(^/|://)} # relative path
      base.join(resource_spec).to_s
    else
      resource_spec.to_s
    end
  end

end
|
|
66
|
+
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
class Metadata
|
|
3
|
+
|
|
4
|
+
# A module that can be mixed into any class defining a +contents+
|
|
5
|
+
# method which returns an Array of URI strings.
|
|
6
|
+
module ContainsMetadata

  # The path at which this resource's metadata file lives.
  #
  # Unless set explicitly, this is the first entry of this
  # resource's +contents+ whose path contains +metadata+ and ends
  # with a +yaml+, +yml+ or +json+ extension.  The result is
  # memoized once a match is found.
  #
  # @return [String, nil]
  def metadata_uri
    @metadata_uri ||= contents.detect { |path| path =~ /metadata.*\.(ya?ml|json)$/ }
  end

  # Explicitly set the path to the metadata for this resource.
  attr_writer :metadata_uri

  # Does this resource contain metadata for other resources it
  # contains?
  #
  # @return [true, false]
  def metadata?
    !!metadata_uri
  end

  # Return the metadata for this resource, loading it from
  # +metadata_uri+ on first use.
  #
  # @return [IMW::Metadata, nil] nil when there is no metadata file
  def metadata
    # An explicit (empty) options Hash is passed because
    # IMW::Metadata.load takes the options as a second argument.
    @metadata ||= (IMW::Metadata.load(metadata_uri, {}) if metadata?)
  end

  # Explicitly set the metadata for this resource.
  attr_writer :metadata

end
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
class Metadata
|
|
3
|
+
|
|
4
|
+
# A module which defines a DSL that can be used to define metadata
|
|
5
|
+
# for an object.
|
|
6
|
+
module DSL

  # Open a new resource at the given URI.
  #
  # If this object has metadata and it describes the resource
  # then the matching schema is passed along as the +schema+ option.
  #
  # A +schema+ property passed via the options hash takes
  # precedence over the one found in the metadata.
  #
  # @param [String, Addressable::URI, IMW::Resource] uri
  # @param [Hash] options
  # @return [IMW::Resource]
  # @see IMW.open
  def open uri, options={}, &block
    schema_options = (options[:schema].nil? && metadata && metadata.describe?(uri)) ? {:schema => metadata[uri]} : {}
    IMW.open(uri, options.merge(schema_options), &block)
  end

  # Same as +open+ but forces the <tt>:mode</tt> option to 'w'.
  #
  # @param [String, Addressable::URI, IMW::Resource] uri
  # @param [Hash] options
  # @return [IMW::Resource]
  def open! uri, options={}, &block
    self.open(uri, options.merge(:mode => 'w'), &block)
  end

  # When called without a block return this object's metadata.
  #
  #   metadata
  #   #=> { '/path/to/file' => [...], '/path/to/other/file' => [...], ... }
  #
  # When called with a block, accumulate schema and fields into
  # this object's metadata
  #
  #   metadata do
  #
  #     schema "/path/to/file" do
  #       # ...
  #     end
  #
  #     schema "/path/to/other/file" do
  #       # ...
  #     end
  #   end
  #
  # The metadata object is created on first use: from +arg+ when it
  # is a Hash, empty when +arg+ is nil, and loaded via Metadata.load
  # otherwise.  Later calls reuse the existing object.
  #
  # @param [Hash, nil, Object] arg
  # @param [Hash] options
  # @see IMW::Metadata::Schema
  # @see IMW::Metadata::Field
  # @return [IMW::Metadata]
  def metadata arg=nil, options={}, &block
    case arg
    when Hash
      @metadata ||= Metadata.new(arg, options)
    when nil
      @metadata ||= Metadata.new nil, options
    else
      @metadata ||= Metadata.load(arg, options)
    end
    @metadata.base = options[:base] if options[:base]
    return @metadata unless block_given?
    yield
    # Hand back the metadata itself, as documented, rather than
    # whatever the block happened to evaluate to.
    @metadata
  end

  # Declare the schema of +resource+ from the fields declared inside
  # the block.
  #
  # @param [#to_s] resource
  # @param [Hash] options currently unused
  def schema resource, options={}, &block
    metadata[resource] = Schema.new(collect_fields(&block))
  end

  # Declare a flat field called +name+ in the enclosing +schema+,
  # +has_one+ or +has_many+ block.
  #
  # @param [String, Symbol] name
  # @param [Hash] options further properties of the field
  def field name, options={}
    # Field validates and reads its special properties under String
    # keys, so those are used here.
    accumulate_field Field.new(options.merge('name' => name))
  end

  # Declare a field called +name+ holding a single sub-record whose
  # fields are declared inside the block.
  #
  # @param [String, Symbol] name
  # @param [Hash] options further properties of the field
  def has_one name, options={}, &block
    sub_fields = collect_fields(&block)
    accumulate_field Field.new(options.merge('name' => name, 'has_one' => sub_fields))
  end

  # Declare a field called +name+ holding a collection of
  # sub-records whose fields are declared inside the block.
  #
  # @param [String, Symbol] name
  # @param [Hash] options further properties of the field
  def has_many name, options={}, &block
    sub_fields = collect_fields(&block)
    accumulate_field Field.new(options.merge('name' => name, 'has_many' => sub_fields))
  end

  protected

  def field_accumulators # :nodoc:
    @field_accumulators ||= []
  end

  def new_field_accumulator! # :nodoc:
    field_accumulators.push([])
  end

  def last_field_accumulator! # :nodoc:
    field_accumulators.pop
  end

  def field_accumulator? # :nodoc:
    ! field_accumulators.empty?
  end

  # Run the block with a fresh accumulator on the stack and return
  # the fields it gathered.  The accumulator is popped even when the
  # block raises so that a failed declaration cannot leave a stale
  # accumulator behind.
  def collect_fields # :nodoc:
    new_field_accumulator!
    yield
    field_accumulators.last
  ensure
    last_field_accumulator!
  end

  def accumulate_field f # :nodoc:
    # raise IMW::SchemaError.new("No record or sub-record to accumulate fields in!") unless field_accumulator?
    field_accumulators.last << f if field_accumulator?
  end
end
|
|
110
|
+
end
|
|
111
|
+
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
|
|
3
|
+
class Metadata
|
|
4
|
+
|
|
5
|
+
# Conceptually, a field is a "slot" for which "records" can have
|
|
6
|
+
# values.
|
|
7
|
+
#
|
|
8
|
+
# An IMW::Metadata::Field is essentially a Hash that has one required
|
|
9
|
+
# property: a name.
|
|
10
|
+
#
|
|
11
|
+
# IMW::Metadata::Field.new('id')
|
|
12
|
+
# #=> { 'name' => 'id' }
|
|
13
|
+
#
|
|
14
|
+
# But you can declare as many other properties as you want (as long
|
|
15
|
+
# as you include a +name+):
|
|
16
|
+
#
|
|
17
|
+
# IMW::Metadata::Field.new 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented."
|
|
18
|
+
# #=> { 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented." }
|
|
19
|
+
#
|
|
20
|
+
# Some properties make a field special:
|
|
21
|
+
#
|
|
22
|
+
# <tt>has_many</tt>::
|
|
23
|
+
# Denotes that this record is in a "has_many" relationship with
|
|
24
|
+
# one or more other records. The corresponding value should be
|
|
25
|
+
# an array
|
|
26
|
+
#
|
|
27
|
+
# <tt>has_one</tt>::
|
|
28
|
+
# Denotes that this record is in a "has_one" relationship with
|
|
29
|
+
# one or more other records. The corresponding value should be
|
|
30
|
+
# an Array in which each key names the joined record and each
|
|
31
|
+
# value is an Array of fields describing the joined record.
|
|
32
|
+
#
|
|
33
|
+
# @see IMW::Metadata::Record for more usage of the
|
|
34
|
+
# <tt>:has_many</tt> and <tt>:has_one</tt> properties.
|
|
35
|
+
class Field < Hash

  # Build a field.
  #
  # A Hash (or another Field) is copied as-is and must carry a
  # non-blank name under either the 'name' or the :name key.  Any
  # other object is converted with +to_s+, stripped, and used as
  # the name.
  #
  # @param [Hash, IMW::Metadata::Field, #to_s] obj
  # @raise [IMW::ArgumentError] if a Hash without a name is given
  def initialize obj
    super()
    if obj.is_a?(Hash)
      merge!(obj)
      raise IMW::ArgumentError.new("A field must have a name") if nameless?
    else
      self['name'] = obj.to_s.strip
    end
  end

  # Does this field point at sub-records, i.e. does it declare a
  # +has_many+ or +has_one+ property (String or Symbol key)?
  #
  # @return [true, false]
  def hierarchical?
    %w[has_many has_one].any? { |key| has_key?(key) || has_key?(key.to_sym) }
  end
  alias_method :nested?, :hierarchical?

  # Is this a plain field without sub-records?
  #
  # @return [true, false]
  def flat?
    ! hierarchical?
  end

  # A human-readable title: the +title+ property when present,
  # otherwise the capitalized name.
  #
  # @return [String]
  def titleize
    self['title'] || self[:title] || declared_name.to_s.capitalize # FIXME we can do better than this!
  end

  # Placeholder with an empty body; currently always returns nil.
  def associations
  end

  protected

  # The name as declared, whichever key style was used.
  def declared_name # :nodoc:
    self['name'] || self[:name]
  end

  # True when no usable (non-empty, non-whitespace) name is present.
  def nameless? # :nodoc:
    name = declared_name
    !name || name.to_s.strip.empty?
  end

end
|
|
64
|
+
end
|
|
65
|
+
end
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
module IMW
|
|
2
|
+
class Metadata
|
|
3
|
+
|
|
4
|
+
# A class to describe the schema of a resource.
|
|
5
|
+
#
|
|
6
|
+
# A Schema is built on top of an Array because it is often
|
|
7
|
+
# important to have an ordering for a record's fields.
|
|
8
|
+
#
|
|
9
|
+
# For fields with no such ordering, an Array also works because
|
|
10
|
+
# each of its elements will be a field with a +name+ that can be
|
|
11
|
+
# used to index the corresponding field.
|
|
12
|
+
#
|
|
13
|
+
# A Schema is instantiated with a basic Ruby data structure.
|
|
14
|
+
#
|
|
15
|
+
# == Tabular Data
|
|
16
|
+
#
|
|
17
|
+
# Tabular data formats (CSV, TSV, &c.) contain flat records
|
|
18
|
+
# consisting of repeating rows with the same fields in the same
|
|
19
|
+
# position. A sample of delimited data looks like
|
|
20
|
+
#
|
|
21
|
+
# ID,Name,Genus,Species
|
|
22
|
+
# 001,Gray-bellied Night Monkey,Aotus,lemurinus
|
|
23
|
+
# 002,Panamanian Night Monkey,Aotus,zonalis
|
|
24
|
+
# 003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
|
|
25
|
+
# 004,Gray-handed Night Monkey,Aotus,griseimembra
|
|
26
|
+
# 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
|
|
27
|
+
# ...
|
|
28
|
+
#
|
|
29
|
+
# The schema of these records is summarized as a Ruby data
|
|
30
|
+
# structure in the following way
|
|
31
|
+
#
|
|
32
|
+
# [
|
|
33
|
+
# { :name => :id, :type => :integer },
|
|
34
|
+
# { :name => :name, :type => :string, :title => "Common Name" },
|
|
35
|
+
# { :name => :genus, :type => :string, :title => "Genus" },
|
|
36
|
+
# { :name => :species, :type => :string, :title => "Species" }
|
|
37
|
+
# ]
|
|
38
|
+
#
|
|
39
|
+
# The outer-most Array represents each row and each Hash in the
|
|
40
|
+
# Array represents one of the fields in a row. A Schema
|
|
41
|
+
# initialized with the above Ruby code can be thought of and
|
|
42
|
+
# played with as an Array of Hashes even though it really is a
|
|
43
|
+
# Schema object of Field objects.
|
|
44
|
+
#
|
|
45
|
+
# == Hierarchical Data
|
|
46
|
+
#
|
|
47
|
+
# Hierarchical data formats (JSON, YAML, XML, &c.) can have
|
|
48
|
+
# arbitrarily complex records with fields within fields and so on.
|
|
49
|
+
# A sample of hierarchical XML data looks like
|
|
50
|
+
#
|
|
51
|
+
# <genera>
|
|
52
|
+
# <genus>
|
|
53
|
+
# <name>Mandrillus</name>
|
|
54
|
+
# <species>
|
|
55
|
+
# <species id="113">
|
|
56
|
+
# <name>sphinx</name>
|
|
57
|
+
# <common_name>Mandrill</common_name>
|
|
58
|
+
# </species>
|
|
59
|
+
# <species id="114">
|
|
60
|
+
# <name>leucophaeus</name>
|
|
61
|
+
# <common_name>Drill</common_name>
|
|
62
|
+
# </species>
|
|
63
|
+
# </species>
|
|
64
|
+
# </genus>
|
|
65
|
+
# <genus>
|
|
66
|
+
# <name>Rungwecebus</name>
|
|
67
|
+
# <species>
|
|
68
|
+
# <species id="100">
|
|
69
|
+
# <name>kipunji</name>
|
|
70
|
+
# <common_name>Kipunji</common_name>
|
|
71
|
+
# </species>
|
|
72
|
+
# </species>
|
|
73
|
+
# </genus>
|
|
74
|
+
#
|
|
75
|
+
# These records are described by the following Ruby data structure
|
|
76
|
+
#
|
|
77
|
+
# [
|
|
78
|
+
# { :name => :genera,
|
|
79
|
+
# :has_many => [
|
|
80
|
+
# { :name => 'name', :type => :string, :title => "Genus" },
|
|
81
|
+
# { :name => 'species',
|
|
82
|
+
# :has_many => [
|
|
83
|
+
# { :name => :id, :type => :integer },
|
|
84
|
+
# { :name => :name, :type => :string, :title => "Species" },
|
|
85
|
+
# { :name => :common_name, :type => :string, :title => "Common Name" }
|
|
86
|
+
# ]
|
|
87
|
+
# }
|
|
88
|
+
# ]
|
|
89
|
+
# }
|
|
90
|
+
# ]
|
|
91
|
+
#
|
|
92
|
+
# By IMW convention, the outer-most element of the Schema is still
|
|
93
|
+
# an Array describing a collection of identical records even
|
|
94
|
+
# though XML data must have a single root node, limiting the
|
|
95
|
+
# collection to a single record.
|
|
96
|
+
#
|
|
97
|
+
# The first field of the Schema is named +genera+ and it uses the
|
|
98
|
+
# special field property +has_many+ to denote that the field
|
|
99
|
+
# points to a collection of sub-records.
|
|
100
|
+
#
|
|
101
|
+
# Each of these sub-records has its own sub-schema defined by the
|
|
102
|
+
# Array that the +has_many+ property keys to. In this case, the
|
|
103
|
+
# two fields are +name+ and +species+. +name+ is a simple String
|
|
104
|
+
# value while +species+ itself points at another collection of
|
|
105
|
+
# objects.
|
|
106
|
+
#
|
|
107
|
+
# This second-level nested record (a particular species) is itself
|
|
108
|
+
# composed of the three (flat) fields +id+, +name+, and
|
|
109
|
+
# +common_name+. Note that the Schema doesn't know (or care) that
|
|
110
|
+
# the +id+ field is contained in an XML attribute while the +name+
|
|
111
|
+
# and +common_name+ fields are contained as text within daughter
|
|
112
|
+
# nodes.
|
|
113
|
+
#
|
|
114
|
+
# A different way of structuring the same information, this time
|
|
115
|
+
# expressed in YAML:
|
|
116
|
+
#
|
|
117
|
+
# ---
|
|
118
|
+
# Mandrillus:
|
|
119
|
+
# - :species: sphinx
|
|
120
|
+
# :name: Mandrill
|
|
121
|
+
# :id: "113"
|
|
122
|
+
# - :species: leucophaeus
|
|
123
|
+
# :name: Drill
|
|
124
|
+
# :id: "114"
|
|
125
|
+
# Rungwecebus:
|
|
126
|
+
# - :species: kipunji
|
|
127
|
+
# :name: Kipunji
|
|
128
|
+
# :id: "100"
|
|
129
|
+
#
|
|
130
|
+
# Would lead to a different Schema
|
|
131
|
+
#
|
|
132
|
+
# [
|
|
133
|
+
# { :name => :genus, :title => "Genus",
|
|
134
|
+
# :has_many => [
|
|
135
|
+
# { :name => :id, :type => :integer },
|
|
136
|
+
# { :name => :name, :type => :string, :title => "Common Name" },
|
|
137
|
+
# { :name => :species, :type => :string, :title => "Species" }
|
|
138
|
+
# ]
|
|
139
|
+
# }
|
|
140
|
+
# ]
|
|
141
|
+
#
|
|
142
|
+
# Where the unnecessary outer wrapper field +genera+ has been
|
|
143
|
+
# dispensed with.
|
|
144
|
+
#
|
|
145
|
+
# In addition to "has many" relationships a record can have a
|
|
146
|
+
# "has_one" relationship. The above data might be expressed
|
|
147
|
+
#
|
|
148
|
+
# ---
|
|
149
|
+
# Mandrillus:
|
|
150
|
+
# - species: sphinx
|
|
151
|
+
# name: Mandrill
|
|
152
|
+
# id: "113"
|
|
153
|
+
# discoverer:
|
|
154
|
+
# name: Dr. Monkeypants
|
|
155
|
+
# year: 1838
|
|
156
|
+
# - species: leucophaeus
|
|
157
|
+
# name: Drill
|
|
158
|
+
# id: "114"
|
|
159
|
+
# discoverer:
|
|
160
|
+
# name: Ms. Cecelia Apefingers
|
|
161
|
+
# year: 1921
|
|
162
|
+
#
|
|
163
|
+
# would result in the following Schema:
|
|
164
|
+
#
|
|
165
|
+
# [
|
|
166
|
+
# { :name => :genus, :title => "Genus",
|
|
167
|
+
# :has_many => [
|
|
168
|
+
# { :name => :id, :type => :integer },
|
|
169
|
+
# { :name => :name, :type => :string, :title => "Common Name" },
|
|
170
|
+
# { :name => :species, :type => :string },
|
|
171
|
+
# { :name => :discoverer,
|
|
172
|
+
# :has_one => [
|
|
173
|
+
# { :name => 'name', :type => :string },
|
|
174
|
+
# { :name => 'year', :type => :integer }
|
|
175
|
+
# ]
|
|
176
|
+
# }
|
|
177
|
+
# ]
|
|
178
|
+
# }
|
|
179
|
+
# ]
|
|
180
|
+
#
|
|
181
|
+
# The +discoverer+ field is marked as +has_one+ which means the
|
|
182
|
+
# +name+ and +year+ fields in the corresponding Array will be
|
|
183
|
+
# interpreted as fields in a single attached sub-record.
|
|
184
|
+
#
|
|
185
|
+
# == Compact Schemas
|
|
186
|
+
#
|
|
187
|
+
# The internal hashes in a Schema specification are really Field
|
|
188
|
+
# objects and the initializer will promote Strings and Symbols to
|
|
189
|
+
# Field objects automatically. This means that the above Schema
|
|
190
|
+
# specification could be replaced by
|
|
191
|
+
#
|
|
192
|
+
# [
|
|
193
|
+
# { :name => :genus,
|
|
194
|
+
# :has_many => [
|
|
195
|
+
# :id,
|
|
196
|
+
# :name,
|
|
197
|
+
# :species,
|
|
198
|
+
# { :name => :discoverer,
|
|
199
|
+
# :has_one => [
|
|
200
|
+
# :name,
|
|
201
|
+
# :year
|
|
202
|
+
# ]
|
|
203
|
+
# }
|
|
204
|
+
# ]
|
|
205
|
+
# }
|
|
206
|
+
# ]
|
|
207
|
+
#
|
|
208
|
+
# though there is an accompanying loss of metadata about each
|
|
209
|
+
# field.
|
|
210
|
+
class Schema < Array

  # Build a schema, promoting every element of +input+ to an
  # IMW::Metadata::Field.
  #
  # @param [Array, nil] input field specifications
  def initialize input=nil
    super()
    concat(input.map { |field| IMW::Metadata::Field.new(field) }) if input
  end

  # Build a schema from the data loaded from +resource+ (opened
  # with IMW.open).
  #
  # @param [String, Addressable::URI, IMW::Resource] resource
  # @return [IMW::Metadata::Schema]
  def self.load resource
    new(IMW.open(resource).load)
  end

  # Look up fields by position or by name.
  #
  # An Integer, a Range, or the two-argument <tt>[start, length]</tt>
  # form behaves exactly like Array#[].  Anything else is treated as
  # a field name and compared (as a String) against each field's
  # name, whether that name is stored under 'name' or :name.
  #
  # @param [Integer, Range, String, Symbol] index
  # @return [IMW::Metadata::Field, Array, nil]
  def [] index, *rest
    # is_a? rather than comparing classes so that Integer
    # subclasses (Fixnum/Bignum on older Rubies) count as positions.
    return super if !rest.empty? || index.is_a?(Integer) || index.is_a?(Range)
    wanted = index.to_s
    detect { |field| (field['name'] || field[:name]).to_s == wanted }
  end

end
|
|
226
|
+
end
|
|
227
|
+
end
|