imw 0.2.16 → 0.2.17

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.16
1
+ 0.2.17
data/lib/imw/dataset.rb CHANGED
@@ -119,7 +119,7 @@ module IMW
119
119
 
120
120
  # Provides this dataset with DSL like methods to construct a
121
121
  # schema in an IMW file.
122
- include IMW::Metadata::DSL
122
+ # include IMW::Metadata::DSL
123
123
 
124
124
  end
125
125
  end
@@ -11,12 +11,6 @@ module IMW
11
11
  # @abstract
12
12
  module Delimited
13
13
 
14
- # Ensure that this delimited resource is described by a an
15
- # ordered collection of flat fields.
16
- def validate_schema!
17
- raise IMW::SchemaError.new("#{self.class} resources must be described by an ordered set of flat fields") if schema.any?(&:nested?)
18
- end
19
-
20
14
  # Default options to be passed to
21
15
  # FasterCSV[http://fastercsv.rubyforge.org/]; see its
22
16
  # documentation for more information.
@@ -24,7 +18,7 @@ module IMW
24
18
  # @return [Hash]
25
19
  def delimited_options
26
20
  @delimited_options ||= {
27
- :headers => schema && schema.map { |field| field['name'] }
21
+ :headers => fields && fields.map { |field| field['name'] }
28
22
  }.merge(resource_options_compatible_with_faster_csv)
29
23
  end
30
24
 
@@ -68,7 +62,7 @@ module IMW
68
62
  # of this delimited data is a row of headers.
69
63
  #
70
64
  # @return [true, false]
71
- def headers_in_first_line?
65
+ def fields_in_first_line?
72
66
  # grab the header and up to 10 body rows
73
67
  require 'fastercsv'
74
68
  copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
@@ -93,15 +87,16 @@ module IMW
93
87
  determinant && determinant >= 0.05
94
88
  end
95
89
 
96
- # If it seems like there are headers in the first line of this
97
- # data then go ahead and use them to define a schema.
90
+ # If it seems like there are fields in the first line of this
91
+ # data then go ahead and use them to define this resource's
92
+ # fields.
98
93
  #
99
- # Will overwrite a schema already present for this resource.
100
- def guess_schema!
101
- return unless headers_in_first_line?
94
+ # Will overwrite any fields already present for this resource.
95
+ def guess_fields!
96
+ return unless fields_in_first_line?
102
97
  copy = FasterCSV.new(io, resource_options_compatible_with_faster_csv.merge(:headers => false))
103
98
  names = (copy.shift || []) rescue []
104
- self.schema = IMW::Metadata::Schema.new(names)
99
+ self.fields = names.map { |n| { 'name' => n } }
105
100
  delimited_options[:headers] = names
106
101
  end
107
102
 
data/lib/imw/metadata.rb CHANGED
@@ -4,13 +4,13 @@ module IMW
4
4
  # with a dataset's fields.
5
5
  class Metadata < Hash
6
6
 
7
- autoload :Field, 'imw/metadata/field'
8
- autoload :Schema, 'imw/metadata/schema'
9
- autoload :Schematized, 'imw/metadata/schematized'
10
- autoload :DSL, 'imw/metadata/dsl'
7
+ autoload :Field, 'imw/metadata/field'
8
+ autoload :Schema, 'imw/metadata/schema'
11
9
  autoload :ContainsMetadata, 'imw/metadata/contains_metadata'
10
+ autoload :HasSummary, 'imw/metadata/has_summary'
11
+ autoload :HasMetadata, 'imw/metadata/has_metadata'
12
12
 
13
- # The resource this Schema is anchored to.
13
+ # The resource this metadata is anchored to.
14
14
  #
15
15
  # This attribute is useful for letting relative paths in a
16
16
  # schema file refer to a common base URL.
@@ -18,12 +18,12 @@ module IMW
18
18
  # @return [IMW::Resource]
19
19
  attr_reader :base
20
20
 
21
- # Set the resource this Schema is anchored to.
21
+ # Set the base resource this metadata is anchored to.
22
22
  #
23
23
  # @param [IMW::Resource, String, Addressable::URI] new_base
24
24
  def base= new_base
25
25
  base_resource = IMW.open(new_base)
26
- base_resource.should_exist!("Metdata base directory must exist")
26
+ base_resource.should_exist!("Metadata base directory must exist")
27
27
  raise IMW::PathError.new("Metadata base must be a directory") unless base_resource.is_directory?
28
28
  @base = base_resource
29
29
  end
@@ -31,34 +31,51 @@ module IMW
31
31
  def initialize obj=nil, options={}
32
32
  super()
33
33
  self.base = options[:base] if options[:base]
34
- obj.each_pair { |resource, schema| self[resource] = Schema.new(schema) } if obj
34
+ if obj
35
+ obj.each_pair do |resource, metadata|
36
+ self[resource] = metadata
37
+ end
38
+ end
35
39
  end
36
40
 
37
- def self.load metadata_resource, options
38
- resource = IMW.open(metadata_resource)
41
+ def self.load obj, options={}
42
+ resource = IMW.open(obj)
39
43
  new(resource.load, {:base => resource.dirname}.merge(options))
40
44
  end
41
45
 
42
- def []= resource_spec, schema_spec
43
- schema = schema_spec.is_a?(Schema) ? schema_spec : Schema.new(schema_spec)
44
- super(absolute_uri(resource_spec), schema_spec)
46
+ def []= resource, metadata
47
+ super(absolute_uri(resource), metadata)
48
+ end
49
+
50
+ def [] resource
51
+ super(absolute_uri(resource))
52
+ end
53
+
54
+ def describe? resource
55
+ self[(absolute_uri(resource))]
45
56
  end
57
+ alias_method :describes?, :describe?
46
58
 
47
- def [] resource_spec
48
- super(absolute_uri(resource_spec))
59
+ def description_for resource
60
+ return unless describes?(resource)
61
+ self[resource]['description']
49
62
  end
50
63
 
51
- def describe? resource_spec
52
- has_key?(absolute_uri(resource_spec))
64
+ def fields_for resource
65
+ return unless describes?(resource)
66
+ (self[resource]['fields'] || []).map { |f| Metadata::Field.new(f) }
53
67
  end
54
68
 
55
69
  protected
56
70
 
57
- def absolute_uri resource_spec
58
- if base && resource_spec.to_s !~ %r{(^/|://)} # relative path
59
- base.join(resource_spec).to_s
71
+ def absolute_uri resource
72
+ obj = IMW.open(resource)
73
+ if base && obj.uri.to_s !~ %r{(^/|://)} # relative path
74
+ s = base.join(obj.uri.to_s).uri.to_s
75
+ s
60
76
  else
61
- resource_spec.to_s
77
+ s = obj.uri.to_s
78
+ s
62
79
  end
63
80
  end
64
81
 
@@ -1,44 +1,54 @@
1
- module IMW
1
+ module IMW
2
2
  class Metadata
3
3
 
4
- # A module that can be mixed into any class defining a +contents+
5
- # methods which returns an Array of URI strings.
4
+ # A module for finding metadata describing the sub-resources of a
5
+ # given resource.
6
+ #
7
+ # An including class describing the parent resource must define
8
+ # the +contents+ method which must return an Array of Strings
9
+ # contained within the parent. These objects will be matched
10
+ # against possible metadata URIs and the corresponding
11
+ # IMW::Metadata class created on the fly.
12
+ #
13
+ # In case no such object is found, the class should also define
14
+ # the +basename+ and +path+ methods which will be used to generate
15
+ # a default URI where metadata about the parent's resources should
16
+ # live.
6
17
  module ContainsMetadata
7
18
 
8
- # The path at which this resource's metadata file lives.
19
+ # The URI containing the metadata for this resource and its
20
+ # contents.
9
21
  #
10
- # Will default to any file beginning with +metadata+ and ending
11
- # with a +yaml+ or +json+ extension contained in this resource's
12
- # +contents+.
22
+ # Looks for an existing JSON or YAML file whose path contains
23
+ # "icss" or "metadata" (case-insensitively) among this resource's contents.
24
+ #
25
+ # If none are found, defaults to a URI named after this
26
+ # resource's basename with the string ".icss.yaml" appended.
13
27
  #
14
28
  # @return [String, nil]
15
- def metadata_uri
16
- @metadata_uri ||= contents.detect { |path| path =~ /metadata.*\.(ya?ml|json)$/ }
29
+ def default_metadata_uri
30
+ contents.detect { |path| path =~ /(icss|metadata).*\.(ya?ml|json)$/i } || File.join(path, "#{basename}.icss.yaml")
17
31
  end
18
32
 
19
- # Explicitly set the path to the metadata for this resource.
20
- attr_writer :metadata_uri
21
-
22
- # Does this resource contain metadata for other resources it
23
- # contains?
33
+ # Return the metadata for this resource if it exists.
24
34
  #
25
- # @return [true, false]
26
- def metadata?
27
- (!! metadata_uri)
28
- end
29
-
30
- # Return the metadata for this resource.
35
+ # Will look for an existing resource at +default_metadata_uri+.
31
36
  #
32
37
  # @return [IMW::Metadata, nil]
33
38
  def metadata
34
- @metadata ||= metadata? && IMW::Metadata.load(metadata_uri)
39
+ return @metadata if @metadata
40
+ obj = IMW.open(default_metadata_uri)
41
+ self.metadata=(obj) if obj.exist?
42
+ @metadata
35
43
  end
36
44
 
37
- # Explicitly set the metadata for this resource.
38
- attr_writer :metadata
45
+ # Set the metadata for this resource to +obj+.
46
+ #
47
+ # @param [String, Addressable::URI, IMW::Resource] obj
48
+ def metadata= obj
49
+ @metadata = IMW::Metadata.load(obj)
50
+ end
39
51
 
40
52
  end
41
53
  end
42
54
  end
43
-
44
-
@@ -16,22 +16,6 @@ module IMW
16
16
  #
17
17
  # IMW::Metadata::Field.new 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented."
18
18
  # #=> { 'name' => 'id', 'type' => :integer, 'title' => "ID", 'description' => "Auto-incremented." }
19
- #
20
- # Some properties make a field special:
21
- #
22
- # <tt>has_many</tt>::
23
- # Denotes that this record is in a "has_many" relationship with
24
- # one or more other records. The corresponding value should be
25
- # an array
26
- #
27
- # <tt>has_one</tt>::
28
- # Denotes that this record is in a "has_one" relationship with
29
- # one or more other records. The corresponding value should be
30
- # an Array in which each key names the joined record and each
31
- # value is an Array of fields describing the joined record..
32
- #
33
- # @see IMW::Metadata::Record for more usage of the
34
- # <tt>:has_many</tt> and <tt>:has_one</tt> properties.
35
19
  class Field < Hash
36
20
 
37
21
  def initialize obj
@@ -43,23 +27,11 @@ module IMW
43
27
  self['name'] = obj.to_s.strip
44
28
  end
45
29
  end
46
-
47
- def hierarchical?
48
- has_key?('has_many') || has_key?('has_one')
49
- end
50
- alias_method :nested?, :hierarchical?
51
-
52
- def flat?
53
- ! hierarchical?
54
- end
55
30
 
56
31
  def titleize
57
32
  self['title'] || self['name'].capitalize # FIXME we can do better than this!
58
33
  end
59
34
 
60
- def associations
61
- end
62
-
63
35
  end
64
36
  end
65
37
  end
@@ -0,0 +1,93 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+
5
+ # A module which defines how a resource finds the Metadata in which it can
6
+ # look up metadata about itself.
7
+ #
8
+ # "metadata" in this context is defined as accessors for
9
+ # +metadata+ (IMW::Metadata), +schema+ (IMW::Metadata::Schema),
10
+ # +fields+ (IMW::Metadata::Field), and +description+ (String).
11
+ #
12
+ # An including class should define a method +dir+ which should
13
+ # return an object that might contain Metadata, i.e. - that
14
+ # includes the IMW::Metadata::ContainsMetadata module.
15
+ #
16
+ # An including class can optionally define the methods +snippet+
17
+ # which returns a snippet of the resource as well as
18
+ # +record_count+ to return a count of how many records the
19
+ # resource contains.
20
+ module HasMetadata
21
+
22
+ # The schema for this object.
23
+ #
24
+ # @return [Hash]
25
+ def schema
26
+ return @schema if @schema
27
+ @schema = IMW::Metadata::Schema.new
28
+ @schema[:type] = "record"
29
+ @schema[:namespace] = "schema.imw.resource"
30
+ @schema[:name] = (basename || '')
31
+ @schema[:doc] = description
32
+ @schema[:fields] = fields
33
+
34
+ @schema[:non_avro ] = {}
35
+ @schema[:non_avro][:snippet] = snippet if respond_to?(:snippet)
36
+ @schema[:non_avro][:record_count] = record_count if respond_to?(:record_count)
37
+ @schema
38
+ end
39
+
40
+ # Return the metadata object that contains metadata for this
41
+ # resource.
42
+ #
43
+ # Will look in this resource's directory and recursively upward
44
+ # till the root directory is reached or a metadata file is
45
+ # discovered.
46
+ #
47
+ # @return [IMW::Metadata, nil]
48
+ def metadata
49
+ return @metadata if @metadata
50
+ d = dir
51
+ while d.path != '/'
52
+ break if d.metadata && d.metadata.describes?(self)
53
+ d = d.dir
54
+ end
55
+ @metadata = d.metadata
56
+ end
57
+
58
+ # The fields for this resource's data.
59
+ #
60
+ # Each field will be a Hash of information.
61
+ #
62
+ # @return [Array<Hash>]
63
+ def fields
64
+ @fields ||= metadata && metadata.fields_for(self)
65
+ end
66
+
67
+ # Set the fields for this resource.
68
+ #
69
+ # @param [Array<Hash>] new_fields
70
+ # @return [Array<Hash>]
71
+ def fields= new_fields
72
+ @fields = new_fields.map { |f| Metadata::Field.new(f) }
73
+ end
74
+
75
+ # A description for this Resource.
76
+ #
77
+ # @return [String]
78
+ def description
79
+ @description ||= metadata && metadata.description_for(self)
80
+ end
81
+
82
+ # Set the description of this Resource.
83
+ #
84
+ # @param [String] new_description
85
+ # @return [String]
86
+ def description= new_description
87
+ @description = new_description
88
+ end
89
+
90
+ end
91
+ end
92
+ end
93
+
@@ -0,0 +1,51 @@
1
+ module IMW
2
+ class Metadata
3
+
4
+ # A module for generating a summary & schema of a resource.
5
+ #
6
+ # The including class should define methods +uri+, +basename+, +extension+.
7
+ module HasSummary
8
+
9
+ # Return a full summary of this Resource.
10
+ #
11
+ # The summary will include "external" information about how this
12
+ # resource appears to the world (via its URI), "internal"
13
+ # metadata about this resource (its description, &c.), as well
14
+ # as the structure of this resource's data (its schema's fields
15
+ # and a snippet).
16
+ #
17
+ # Will return a Hash, with a <tt>:schema</tt> key which maps to
18
+ # a well-formed AVRO schema for this resource.
19
+ #
20
+ # @return [Hash]
21
+ def summary
22
+ return @summary if @summary
23
+ @summary = external_summary
24
+ @summary[:schema] = schema if respond_to?(:schema)
25
+ @summary[:contents] = resources.map(&:summary) if respond_to?(:resources)
26
+ @summary
27
+ end
28
+
29
+ # Return information (usually scheme-dependent) on how this
30
+ # resource is situated in the world, i.e. - its URI, its size,
31
+ # how many lines it has, &c.
32
+ #
33
+ # Modules which override this should chain with +super+:
34
+ #
35
+ # # in my_scheme.rb
36
+ # def external_summary
37
+ # super().merge(:user => 'bob', :password => 'smith')
38
+ # end
39
+ #
40
+ # @return [Hash]
41
+ def external_summary
42
+ {
43
+ :uri => uri.to_s,
44
+ :basename => basename,
45
+ :extension => extension
46
+ }
47
+ end
48
+ end
49
+
50
+ end
51
+ end
@@ -1,227 +1,17 @@
1
1
  module IMW
2
+
2
3
  class Metadata
3
4
 
4
- # A class to describe the schema of a resource.
5
- #
6
- # A Schema is built on top of an Array because it is often
7
- # important to have an ordering for a record's fields.
8
- #
9
- # For fields with no such ordering, an Array also works because
10
- # each of its element will be a field with a +name+ that can be
11
- # used to index the corresponding field.
12
- #
13
- # A Schema is instantiated with a basic Ruby data structure.
14
- #
15
- # == Tabular Data
16
- #
17
- # Tabular data formats (CSV, TSV, &c.) contain flat records
18
- # consisting of repeating rows with the same fields in the same
19
- # position. A sample of delimited data looks like
20
- #
21
- # ID,Name,Genus,Species
22
- # 001,Gray-bellied Night Monkey,Aotus,lemurinus
23
- # 002,Panamanian Night Monkey,Aotus,zonalis
24
- # 003,Hernández-Camacho's Night Monkey,Aotus,jorgehernandezi
25
- # 004,Gray-handed Night Monkey,Aotus,griseimembra
26
- # 005,Hershkovitz's Night Monkey,Aotus,hershkovitzi
27
- # ...
28
- #
29
- # The schema of these records is summarized as a Ruby data
30
- # structure in the following way
31
- #
32
- # [
33
- # { :name => :id, :type => :integer },
34
- # { :name => :name, :type => :string, :title => "Common Name" },
35
- # { :name => :genus, :type => :string, :title => "Genus" },
36
- # { :name => :species, :type => :string, :title => "Species" }
37
- # ]
38
- #
39
- # The outer-most Array represents each row and each Hash in the
40
- # Array represents one of the fields in a row. A Schema
41
- # initialized with the above Ruby code can be thought of and
42
- # played with as an Array of Hashes even though it really is a
43
- # Schema object of Field objects.
44
- #
45
- # == Hierarchical Data
46
- #
47
- # Hierarchical data formats (JSON, YAML, XML, &c.) can have
48
- # arbitrarily complex records with fields within fields and so on.
49
- # A sample of hierarchical XML data looks like
50
- #
51
- # <genera>
52
- # <genus>
53
- # <name>Mandrillus</name>
54
- # <species>
55
- # <species id="113">
56
- # <name>sphinx</name>
57
- # <common_name>Mandrill</common_name>
58
- # </species>
59
- # <species id="114">
60
- # <name>leucophaeus</name>
61
- # <common_name>Drill</common_name>
62
- # </species>
63
- # </species>
64
- # </genus>
65
- # <genus>
66
- # <name>Rungwecebus</name>
67
- # <species>
68
- # <species id="100">
69
- # <name>kipunji</name>
70
- # <common_name>Kipunji</common_name>
71
- # </species>
72
- # </species>
73
- # </genus>
74
- #
75
- # These records are described by the following Ruby data structure
76
- #
77
- # [
78
- # { :name => :genera,
79
- # :has_many => [
80
- # { :name => 'name', :type => :string, title => "Genus" },
81
- # { :name => 'species',
82
- # :has_many => [
83
- # { :name => :id, :type => :integer },
84
- # { :name => :name, :type => :string, :title => "Species" },
85
- # { :name => :common_name, :type => :string, :title => "Common Name" }
86
- # ]
87
- # }
88
- # ]
89
- # }
90
- # ]
91
- #
92
- # By IMW convention, the outer-most element of the Schema is still
93
- # an Array describing a collection of identical records even
94
- # though XML data must have a single root node, limiting the
95
- # collection to a single record.
96
- #
97
- # The first field of the Schema is named +genera+ and it uses the
98
- # special field property +has_many+ to denote that the field
99
- # points to a collection of sub-records.
5
+ # Represents a schema for data.
100
6
  #
101
- # Each of these sub-records has its own sub-schema defined by the
102
- # Array that the +has_many+ property keys to. In this case, the
103
- # two fields are +name+ and +species+. +name+ is a simple String
104
- # value while +species+ itself points at another collection of
105
- # objects.
106
- #
107
- # This second-level nested record (a particular species) is itself
108
- # composed of the three (flat) fields +id+, +name+, and
109
- # +common_name+. Note that the Schema doesn't know (or care) that
110
- # the +id+ field is contained in an XML attribute while the +name+
111
- # and +common_name+ fields are contained as text within daughter
112
- # nodes.
113
- #
114
- # A different way of structure the same information, this time
115
- # expressed in YAML:
116
- #
117
- # ---
118
- # Mandrillus:
119
- # - :species: sphinx
120
- # :name: Mandrill
121
- # :id: "113"
122
- # - :species: leucophaeus
123
- # :name: Drill
124
- # :id: "114"
125
- # Rungwecebus:
126
- # - :species: kipunji
127
- # :name: Kipunji
128
- # :id: "100"
129
- #
130
- # Would lead to a different Schema
131
- #
132
- # [
133
- # { :name => :genus, :title => "Genus",
134
- # :has_many => [
135
- # { :name => :id, :type => :integer },
136
- # { :name => :name, :type => :string, :title => "Common Name" },
137
- # { :name => :species, :type => :string, :title => "Species" }
138
- # ]
139
- # }
140
- # ]
141
- #
142
- # Where the unnecessary outer wrapper field +genera+ has been
143
- # dispensed with.
144
- #
145
- # In addition to "has many" relationships a record can have a
146
- # "has_one" relationship. The above data might be expressed
147
- #
148
- # ---
149
- # Mandrillus:
150
- # - species: sphinx
151
- # name: Mandrill
152
- # id: "113"
153
- # discoverer:
154
- # name: Dr. Monkeypants
155
- # year: 1838
156
- # - species: leucophaeus
157
- # name: Drill
158
- # id: "114"
159
- # discoverer:
160
- # name: Ms. Cecelia Apefingers
161
- # year: 1921
162
- #
163
- # would result in the following Schema:
164
- #
165
- # [
166
- # { :name => :genus, :title => "Genus",
167
- # :has_many => [
168
- # { :name => :id, :type => :integer },
169
- # { :name => :name, :type => :string, :title => "Common Name" },
170
- # { :name => :species, :type => :string },
171
- # { :name => :discoverer,
172
- # :has_one => [
173
- # { :name => 'name', :type => :string },
174
- # { :name => 'year', :type => :integer }
175
- # ]
176
- # }
177
- # ]
178
- # }
179
- # ]
180
- #
181
- # The +discoverer+ field is marked as +has_one+ which means the
182
- # +name+ and +year+ fields in the corresponding Array will be
183
- # interpreted as fields in a single attached sub-record.
184
- #
185
- # = Compact Schemas
186
- #
187
- # The internal hashes in a Schema specification are really Field
188
- # objects and the initializer will promote Strings and Symbols to
189
- # Field objects automatically. This means that the above Schema
190
- # specification could be replaced by
191
- #
192
- # [
193
- # { :name => :genus
194
- # :has_many => [
195
- # :id,
196
- # :name,
197
- # :species,
198
- # { :name => :discoverer,
199
- # :has_one => [
200
- # :name,
201
- # :year
202
- # ]
203
- # }
204
- # ]
205
- # }
206
- # ]
207
- #
208
- # though there is an accompanying loss of metadata about each
209
- # field.
210
- class Schema < Array
7
+ # FIXME add methods that help couple nicely with Avro schemata.
8
+ class Schema < Hash
211
9
 
212
- def initialize input=nil
10
+ def initialize obj=nil
213
11
  super()
214
- concat(input.map { |field| IMW::Metadata::Field.new(field) }) if input
215
- end
216
-
217
- def self.load resource
218
- new(IMW.open(resource).load)
12
+ merge!(obj) if obj.is_a?(Hash) || obj.is_a?(Schema)
219
13
  end
220
14
 
221
- def [] index
222
- [Integer, Range].include?(index.class) ? super(index) : detect { |field| field[:name].to_s == index.to_s }
223
- end
224
-
225
15
  end
226
16
  end
227
17
  end