imw 0.2.16 → 0.2.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.16
1
+ 0.2.17
data/lib/imw/dataset.rb CHANGED
@@ -119,7 +119,7 @@ module IMW
119
119
 
120
120
  # Provides this dataset with DSL like methods to construct a
121
121
  # schema in an IMW file.
122
- include IMW::Metadata::DSL
122
+ # include IMW::Metadata::DSL
123
123
 
124
124
  end
125
125
  end
@@ -11,12 +11,6 @@ module IMW
11
11
  # @abstract
12
12
  module Delimited
13
13
 
14
- # Ensure that this delimited resource is described by a an
15
- # ordered collection of flat fields.
16
- def validate_schema!
17
- raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
18
- end
19
-
20
14
  # Default options to be passed to
21
15
  # FasterCSV[http://fastercsv.rubyforge.org/]; see its
22
16
  # documentation for more information.
@@ -24,7 +18,7 @@ module IMW
24
18
  # @return [Hash]
25
19
  def delimited_options
26
20
  @delimited_options ||= {
27
- :headers => schema && schema.map { |field| field['name'] }
21
+ :headers => fields && fields.map { |field| field['name'] }
28
22
  }.merge(resource_options_compatible_with_faster_csv)
29
23
  end
30
24
 
@@ -68,7 +62,7 @@ module IMW
68
62
  # of this delimited data is a row of headers.
69
63
  #
70
64
  # @return [true, false]
71
- def headers_in_first_line?
65
+ def fields_in_first_line?
72
66
  # grab the header and up to 10 body rows
73
67
  require 'fastercsv'
74
68
  copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
@@ -93,15 +87,16 @@ module IMW
93
87
  determinant && determinant >= 0.05
94
88
  end
95
89
 
96
- # If it seems like there are headers in the first line of this
97
- # data then go ahead and use them to define a schema.
90
+ # If it seems like there are fields in the first line of this
91
+ # data then go ahead and use them to define this resource's
92
+ # fields.
98
93
  #
99
- # Will overwrite a schema already present for this resource.
100
- def guess_schema!
101
- return unless headers_in_first_line?
94
+ # Will overwrite any fields already present for this resource.
95
+ def guess_fields!
96
+ return unless fields_in_first_line?
102
97
  copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
103
98
  names = (copy.shift || []) rescue []
104
- self.schema = IMW::Metadata::Schema.new(names)
99
+ self.fields = names.map { |n| { 'name' => n } }
105
100
  delimited_options[:headers] = names
106
101
  end
107
102
 
data/lib/imw/metadata.rb CHANGED
@@ -4,13 +4,13 @@ module IMW
4
4
  # with a dataset's fields.
5
5
  class Metadata < Hash
6
6
 
7
- autoload :Field, 'imw/metadata/field'
8
- autoload :Schema, 'imw/metadata/schema'
9
- autoload :Schematized, 'imw/metadata/schematized'
10
- autoload :DSL, 'imw/metadata/dsl'
7
+ autoload :Field, 'imw/metadata/field'
8
+ autoload :Schema, 'imw/metadata/schema'
11
9
  autoload :ContainsMetadata, 'imw/metadata/contains_metadata'
10
+ autoload :HasSummary, 'imw/metadata/has_summary'
11
+ autoload :HasMetadata, 'imw/metadata/has_metadata'
12
12
 
13
- # The resource this Schema is anchored to.
13
+ # The resource this metadata is anchored to.
14
14
  #
15
15
  # This attribute is useful for letting relative paths in a
16
16
  # schema file refer to a common base URL.
@@ -18,12 +18,12 @@ module IMW
18
18
  # @return [IMW::Resource]
19
19
  attr_reader :base
20
20
 
21
- # Set the resource this Schema is anchored to.
21
+ # Set the base resource this metadata is anchored to.
22
22
  #
23
23
  # @param [IMW::Resource, String, Addressable::URI] new_base
24
24
  def base= new_base
25
25
  base_resource = IMW.open(new_base)
26
- base_resource.should_exist!("Metdata base directory must exist")
26
+ base_resource.should_exist!("Metadata base directory must exist")
27
27
  raise IMW::PathError.new("Metadata base must be a directory") unless base_resource.is_directory?
28
28
  @base = base_resource
29
29
  end
@@ -31,34 +31,51 @@ module IMW
31
31
  def initialize obj=nil, options={}
32
32
  super()
33
33
  self.base = options[:base] if options[:base]
34
- obj.each_pair { |resource, schema| self[resource] = Schema.new(schema) } if obj
34
+ if obj
35
+ obj.each_pair do |resource, metadata|
36
+ self[resource] = metadata
37
+ end
38
+ end
35
39
  end
36
40
 
37
- def self.load metadata_resource, options
38
- resource = IMW.open(metadata_resource)
41
+ def self.load obj, options={}
42
+ resource = IMW.open(obj)
39
43
  new(resource.load, {:base => resource.dirname}.merge(options))
40
44
  end
41
45
 
42
- def []= resource_spec, schema_spec
43
- schema = schema_spec.is_a?(Schema) ? schema_spec : Schema.new(schema_spec)
44
- super(absolute_uri(resource_spec), schema_spec)
46
+ def []= resource, metadata
47
+ super(absolute_uri(resource), metadata)
48
+ end
49
+
50
+ def [] resource
51
+ super(absolute_uri(resource))
52
+ end
53
+
54
+ def describe? resource
55
+ self[(absolute_uri(resource))]
45
56
  end
57
+ alias_method :describes?, :describe?
46
58
 
47
- def [] resource_spec
48
- super(absolute_uri(resource_spec))
59
+ def description_for resource
60
+ return unless describes?(resource)
61
+ self[resource]['description']
49
62
  end
50
63
 
51
- def describe? resource_spec
52
- has_key?(absolute_uri(resource_spec))
64
+ def fields_for resource
65
+ return unless describes?(resource)
66
+ (self[resource]['fields'] || []).map { |f| Metadata::Field.new(f) }
53
67
  end
54
68
 
55
69
  protected
56
70
 
57
- def absolute_uri resource_spec
58
- if base && resource_spec.to_s !~ %r{(^/|://)} # relative path
59
- base.join(resource_spec).to_s
71
+ def absolute_uri resource
72
+ obj = IMW.open(resource)
73
+ if base && obj.uri.to_s !~ %r{(^/|://)} # relative path
74
+ s = base.join(obj.uri.to_s).uri.to_s
75
+ s
60
76
  else
61
- resource_spec.to_s
77
+ s = obj.uri.to_s
78
+ s
62
79
  end
63
80
  end
64
81
 
@@ -1,44 +1,54 @@
1
- module IMW
1
+ module IMW
2
2
  class Metadata
3
3
 
4
- # A module that can be mixed into any class defining a +contents+
5
- # methods which returns an Array of URI strings.
4
+ # A module for finding metadata describing the sub-resources of a
5
+ # given resource.
6
+ #
7
+ # An including class describing the parent resource must define
8
+ # the +contents+ method which must return an Array of Strings
9
+ # contained within the parent. These objects will be matched
10
+ # against possible metadata URIs and the corresponding
11
+ # IMW::Metadata class created on the fly.
12
+ #
13
+ # In case no such object is found, the class should also define
14
+ # the +basename+ and +path+ methods which will be used to generate
15
+ # a default URI where metadata about the parent's resources should
16
+ # live.
6
17
  module ContainsMetadata
7
18
 
8
- # The path at which this resource's metadata file lives.
19
+ # The URI containing the metadata for this resource and its
20
+ # contents.
9
21
  #
10
- # Will default to any file beginning with +metadata+ and ending
11
- # with a +yaml+ or +json+ extension contained in this resource's
12
- # +contents+.
22
+ # Looks for an existing JSON or YAML file containing the strings
23
+ # "icss" or "metadata" directly contained within this resource.
24
+ #
25
+ # If none are found, defaults to a URI named after this
26
+ # resource's basename with the string ".icss.yaml" appended.
13
27
  #
14
28
  # @return [String, nil]
15
- def metadata_uri
16
- @metadata_uri ||= contents.detect { |path| path =~ /metadata.*\.(ya?ml|json)$/ }
29
+ def default_metadata_uri
30
+ contents.detect { |path| path =~ /(icss|metadata).*\.(ya?ml|json)$/i } || File.join(path, "#{basename}.icss.yaml")
17
31
  end
18
32
 
19
- # Explicitly set the path to the metadata for this resource.
20
- attr_writer :metadata_uri
21
-
22
- # Does this resource contain metadata for other resources it
23
- # contains?
33
+ # Return the metadata for this resource if it exists.
24
34
  #
25
- # @return [true, false]
26
- def metadata?
27
- (!! metadata_uri)
28
- end
29
-
30
- # Return the metadata for this resource.
35
+ # Will look for an existing resource at +default_metadata_uri+.
31
36
  #
32
37
  # @return [IMW::Metadata, nil]
33
38
  def metadata
34
- @metadata ||= metadata? && IMW::Metadata.load(metadata_uri)
39
+ return @metadata if @metadata
40
+ obj = IMW.open(default_metadata_uri)
41
+ self.metadata=(obj) if obj.exist?
42
+ @metadata
35
43
  end
36
44
 
37
- # Explicitly set the metadata for this resource.
38
- attr_writer :metadata
45
+ # Set the metadata for this resource to +obj+.
46
+ #
47
+ # @param [String, Addressable::URI, IMW::Resource] obj
48
+ def metadata= obj
49
+ @metadata = IMW::Metadata.load(obj)
50
+ end
39
51
 
40
52
  end
41
53
  end
42
54
  end
43
-
44
-
@@ -16,22 +16,6 @@ module IMW
16
16
  #
17
17
  # IMW::Metadata::Field.new 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented."
18
18
  # #=> { 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented." }
19
- #
20
- # Some properties make a field special:
21
- #
22
- # <tt>has_many</tt>::
23
- # Denotes that this record is in a "has_many" relationship with
24
- # one or more other records. The corresponding value should be
25
- # an array
26
- #
27
- # <tt>has_one</tt>::
28
- # Denotes that this record is in a "has_one" relationship with
29
- # one or more other records. The corresponding value should be
30
- # an Array in which each key names the joined record and each
31
- # value is an Array of fields describing the joined record..
32
- #
33
- # @see IMW::Metadata::Record for more usage of the
34
- # <tt>:has_many</tt> and <tt>:has_one</tt> properties.
35
19
  class Field < Hash
36
20
 
37
21
  def initialize obj
@@ -43,23 +27,11 @@ module IMW
43
27
  self['name'] = obj.to_s.strip
44
28
  end
45
29
  end
46
-
47
- def hierarchical?
48
- has_key?('has_many') || has_key?('has_one')
49
- end
50
- alias_method :nested?, :hierarchical?
51
-
52
- def flat?
53
- ! hierarchical?
54
- end
55
30
 
56
31
  def titleize
57
32
  self['title'] || self['name'].capitalize # FIXME we can do better than this!
58
33
  end
59
34
 
60
- def associations
61
- end
62
-
63
35
  end
64
36
  end
65
37
  end
@@ -0,0 +1,93 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+
5
+ # A module which defines how a resource finds the Metadata object in
6
+ # which it can look up metadata about itself.
7
+ #
8
+ # "metadata" in this context is defined as accessors for
9
+ # +metadata+ (IMW::Metadata), +schema+ (IMW::Metadata::Schema),
10
+ # +fields+ (IMW::Metadata::Field), and +description+ (String).
11
+ #
12
+ # An including class should define a method +dir+ which should
13
+ # return an object that might contain Metadata, i.e. - that
14
+ # includes the IMW::Metadata::ContainsMetadata module.
15
+ #
16
+ # An including class can optionally define the methods +snippet+
17
+ # which returns a snippet of the resource as well as
18
+ # +record_count+ to return a count of how many records the
19
+ # resource contains.
20
+ module HasMetadata
21
+
22
+ # The schema for this object.
23
+ #
24
+ # @return [Hash]
25
+ def schema
26
+ return @schema if @schema
27
+ @schema = IMW::Metadata::Schema.new
28
+ @schema[:type] = "record"
29
+ @schema[:namespace] = "schema.imw.resource"
30
+ @schema[:name] = (basename || '')
31
+ @schema[:doc] = description
32
+ @schema[:fields] = fields
33
+
34
+ @schema[:non_avro ] = {}
35
+ @schema[:non_avro][:snippet] = snippet if respond_to?(:snippet)
36
+ @schema[:non_avro][:record_count] = record_count if respond_to?(:record_count)
37
+ @schema
38
+ end
39
+
40
+ # Return the metadata object that contains metadata for this
41
+ # resource.
42
+ #
43
+ # Will look in this resource's directory and recursively upward
44
+ # till the root directory is reached or a metadata file is
45
+ # discovered.
46
+ #
47
+ # @return [IMW::Metadata, nil]
48
+ def metadata
49
+ return @metadata if @metadata
50
+ d = dir
51
+ while d.path != '/'
52
+ break if d.metadata && d.metadata.describes?(self)
53
+ d = d.dir
54
+ end
55
+ @metadata = d.metadata
56
+ end
57
+
58
+ # The fields for this resource's data.
59
+ #
60
+ # Each field will be a Hash of information.
61
+ #
62
+ # @return [Array<Hash>]
63
+ def fields
64
+ @fields ||= metadata && metadata.fields_for(self)
65
+ end
66
+
67
+ # Set the fields for this resource.
68
+ #
69
+ # @param [Array<Hash>] new_fields
70
+ # @return [Array<Hash>]
71
+ def fields= new_fields
72
+ @fields = new_fields.map { |f| Metadata::Field.new(f) }
73
+ end
74
+
75
+ # A description for this Resource.
76
+ #
77
+ # @return [String]
78
+ def description
79
+ @description ||= metadata && metadata.description_for(self)
80
+ end
81
+
82
+ # Set the description of this Resource.
83
+ #
84
+ # @param [String] new_description
85
+ # @return [String]
86
+ def description= new_description
87
+ @description = new_description
88
+ end
89
+
90
+ end
91
+ end
92
+ end
93
+
@@ -0,0 +1,51 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+ # A module for generating a summary & schema of a resource.
5
+ #
6
+ # The including class should define methods +uri+, +basename+, +extension+.
7
+ module HasSummary
8
+
9
+ # Return a full summary of this Resource.
10
+ #
11
+ # The summary will include "external" information about how this
12
+ # resource appears to the world (via its URI), "internal"
13
+ # metadata about this resource (its description, &c.), as well
14
+ # as the structure of this resource's data (its schema's fields
15
+ # and a snippet).
16
+ #
17
+ # Will return a Hash, with a <tt>:schema</tt> key which maps to
18
+ # a well-formed AVRO schema for this resource.
19
+ #
20
+ # @return [Hash]
21
+ def summary
22
+ return @summary if @summary
23
+ @summary = external_summary
24
+ @summary[:schema] = schema if respond_to?(:schema)
25
+ @summary[:contents] = resources.map(&:summary) if respond_to?(:resources)
26
+ @summary
27
+ end
28
+
29
+ # Return information (usually scheme-dependent) on how this
30
+ # resource is situated in the world, i.e. - its URI, its size,
31
+ # how many lines it has, &c.
32
+ #
33
+ # Modules which override this should chain with +super+:
34
+ #
35
+ # # in my_scheme.rb
36
+ # def external_summary
37
+ # super().merge(:user => 'bob', :password => 'smith')
38
+ # end
39
+ #
40
+ # @return [Hash]
41
+ def external_summary
42
+ {
43
+ :uri => uri.to_s,
44
+ :basename => basename,
45
+ :extension => extension
46
+ }
47
+ end
48
+ end
49
+
50
+ end
51
+ end
@@ -1,227 +1,17 @@
1
1
  module IMW
2
+
2
3
  class Metadata
3
4
 
4
- # A class to describe the schema of a resource.
5
- #
6
- # A Schema is built on top of an Array because it is often
7
- # important to have an ordering for a record's fields.
8
- #
9
- # For fields with no such ordering, an Array also works because
10
- # each of its element will be a field with a +name+ that can be
11
- # used to index the corresponding field.
12
- #
13
- # A Schema is instantiated with a basic Ruby data structure.
14
- #
15
- # == Tabular Data
16
- #
17
- # Tabular data formats (CSV, TSV, &c.) contain flat records
18
- # consisting of repeating rows with the same fields in the same
19
- # position. A sample of delimited data looks like
20
- #
21
- # ID,Name,Genus,Species
22
- # 001,Gray-bellied Night Monkey,Aotus,lemurinus
23
- # 002,Panamanian Night Monkey,Aotus,zonalis
24
- # 003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
25
- # 004,Gray-handed Night Monkey,Aotus,griseimembra
26
- # 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
27
- # ...
28
- #
29
- # The schema of these records is summarized as a Ruby data
30
- # structure in the following way
31
- #
32
- # [
33
- # { :name => :id, :type => :integer },
34
- # { :name => :name, :type => :string, :title => "Common Name" },
35
- # { :name => :genus, :type => :string, :title => "Genus" },
36
- # { :name => :species, :type => :string, :title => "Species" }
37
- # ]
38
- #
39
- # The outer-most Array represents each row and each Hash in the
40
- # Array represents one of the fields in a row. A Schema
41
- # initialized with the above Ruby code can be thought of and
42
- # played with as an Array of Hashes even though it really is a
43
- # Schema object of Field objects.
44
- #
45
- # == Hierarchical Data
46
- #
47
- # Hierarchical data formats (JSON, YAML, XML, &c.) can have
48
- # arbitrarily complex records with fields within fields and so on.
49
- # A sample of hierarchical XML data looks like
50
- #
51
- # <genera>
52
- # <genus>
53
- # <name>Mandrillus</name>
54
- # <species>
55
- # <species id="113">
56
- # <name>sphinx</name>
57
- # <common_name>Mandrill</common_name>
58
- # </species>
59
- # <species id="114">
60
- # <name>leucophaeus</name>
61
- # <common_name>Drill</common_name>
62
- # </species>
63
- # </species>
64
- # </genus>
65
- # <genus>
66
- # <name>Rungwecebus</name>
67
- # <species>
68
- # <species id="100">
69
- # <name>kipunji</name>
70
- # <common_name>Kipunji</common_name>
71
- # </species>
72
- # </species>
73
- # </genus>
74
- #
75
- # These records are described by the following Ruby data structure
76
- #
77
- # [
78
- # { :name => :genera,
79
- # :has_many => [
80
- # { :name => 'name', :type => :string, title => "Genus" },
81
- # { :name => 'species',
82
- # :has_many => [
83
- # { :name => :id, :type => :integer },
84
- # { :name => :name, :type => :string, :title => "Species" },
85
- # { :name => :common_name, :type => :string, :title => "Common Name" }
86
- # ]
87
- # }
88
- # ]
89
- # }
90
- # ]
91
- #
92
- # By IMW convention, the outer-most element of the Schema is still
93
- # an Array describing a collection of identical records even
94
- # though XML data must have a single root node, limiting the
95
- # collection to a single record.
96
- #
97
- # The first field of the Schema is named +genera+ and it uses the
98
- # special field property +has_many+ to denote that the field
99
- # points to a collection of sub-records.
5
+ # Represents a schema for data.
100
6
  #
101
- # Each of these sub-records has its own sub-schema defined by the
102
- # Array that the +has_many+ property keys to. In this case, the
103
- # two fields are +name+ and +species+. +name+ is a simple String
104
- # value while +species+ itself points at another collection of
105
- # objects.
106
- #
107
- # This second-level nested record (a particular species) is itself
108
- # composed of the three (flat) fields +id+, +name+, and
109
- # +common_name+. Note that the Schema doesn't know (or care) that
110
- # the +id+ field is contained in an XML attribute while the +name+
111
- # and +common_name+ fields are contained as text within daughter
112
- # nodes.
113
- #
114
- # A different way of structure the same information, this time
115
- # expressed in YAML:
116
- #
117
- # ---
118
- # Mandrillus:
119
- # - :species: sphinx
120
- # :name: Mandrill
121
- # :id: "113"
122
- # - :species: leucophaeus
123
- # :name: Drill
124
- # :id: "114"
125
- # Rungwecebus:
126
- # - :species: kipunji
127
- # :name: Kipunji
128
- # :id: "100"
129
- #
130
- # Would lead to a different Schema
131
- #
132
- # [
133
- # { :name => :genus, :title => "Genus",
134
- # :has_many => [
135
- # { :name => :id, :type => :integer },
136
- # { :name => :name, :type => :string, :title => "Common Name" },
137
- # { :name => :species, :type => :string, :title => "Species" }
138
- # ]
139
- # }
140
- # ]
141
- #
142
- # Where the unnecessary outer wrapper field +genera+ has been
143
- # dispensed with.
144
- #
145
- # In addition to "has many" relationships a record can have a
146
- # "has_one" relationship. The above data might be expressed
147
- #
148
- # ---
149
- # Mandrillus:
150
- # - species: sphinx
151
- # name: Mandrill
152
- # id: "113"
153
- # discoverer:
154
- # name: Dr. Monkeypants
155
- # year: 1838
156
- # - species: leucophaeus
157
- # name: Drill
158
- # id: "114"
159
- # discoverer:
160
- # name: Ms. Cecelia Apefingers
161
- # year: 1921
162
- #
163
- # would result in the following Schema:
164
- #
165
- # [
166
- # { :name => :genus, :title => "Genus",
167
- # :has_many => [
168
- # { :name => :id, :type => :integer },
169
- # { :name => :name, :type => :string, :title => "Common Name" },
170
- # { :name => :species, :type => :string },
171
- # { :name => :discoverer,
172
- # :has_one => [
173
- # { :name => 'name', :type => :string },
174
- # { :name => 'year', :type => :integer }
175
- # ]
176
- # }
177
- # ]
178
- # }
179
- # ]
180
- #
181
- # The +discoverer+ field is marked as +has_one+ which means the
182
- # +name+ and +year+ fields in the corresponding Array will be
183
- # interpreted as fields in a single attached sub-record.
184
- #
185
- # = Compact Schemas
186
- #
187
- # The internal hashes in a Schema specification are really Field
188
- # objects and the initializer will promote Strings and Symbols to
189
- # Field objects automatically. This means that the above Schema
190
- # specification could be replaced by
191
- #
192
- # [
193
- # { :name => :genus
194
- # :has_many => [
195
- # :id,
196
- # :name,
197
- # :species,
198
- # { :name => :discoverer,
199
- # :has_one => [
200
- # :name,
201
- # :year
202
- # ]
203
- # }
204
- # ]
205
- # }
206
- # ]
207
- #
208
- # though there is an accompanying loss of metadata about each
209
- # field.
210
- class Schema < Array
7
+ # FIXME add methods that help couple nicely with Avro schemata.
8
+ class Schema < Hash
211
9
 
212
- def initialize input=nil
10
+ def initialize obj=nil
213
11
  super()
214
- concat(input.map { |field| IMW::Metadata::Field.new(field) }) if input
215
- end
216
-
217
- def self.load resource
218
- new(IMW.open(resource).load)
12
+ merge!(obj) if obj.is_a?(Hash) || obj.is_a?(Schema)
219
13
  end
220
14
 
221
- def [] index
222
- [Integer, Range].include?(index.class) ? super(index) : detect { |field| field[:name].to_s == index.to_s }
223
- end
224
-
225
15
  end
226
16
  end
227
17
  end