mida 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.rdoc +18 -2
- data/README.rdoc +22 -11
- data/Rakefile +2 -2
- data/lib/mida.rb +1 -1
- data/lib/mida/datatype.rb +15 -0
- data/lib/mida/datatype/boolean.rb +18 -0
- data/lib/mida/datatype/float.rb +15 -0
- data/lib/mida/datatype/integer.rb +15 -0
- data/lib/mida/datatype/iso8601date.rb +17 -0
- data/lib/mida/datatype/number.rb +15 -0
- data/lib/mida/datatype/text.rb +13 -0
- data/lib/mida/document.rb +21 -19
- data/lib/mida/genericvocabulary.rb +13 -0
- data/lib/mida/item.rb +83 -71
- data/lib/mida/itemprop.rb +55 -30
- data/lib/mida/itemscope.rb +82 -0
- data/lib/mida/propertydesc.rb +36 -0
- data/lib/mida/vocabulary.rb +60 -6
- data/spec/datatype/boolean_spec.rb +27 -0
- data/spec/datatype/float_spec.rb +23 -0
- data/spec/datatype/integer_spec.rb +23 -0
- data/spec/datatype/iso8601date_spec.rb +20 -0
- data/spec/datatype/number_spec.rb +23 -0
- data/spec/datatype/text_spec.rb +14 -0
- data/spec/document_spec.rb +31 -487
- data/spec/item_spec.rb +163 -472
- data/spec/itemprop_spec.rb +40 -45
- data/spec/itemscope_spec.rb +287 -0
- data/spec/propertydesc_spec.rb +56 -0
- data/spec/spec_helper.rb +13 -36
- data/spec/vocabulary_spec.rb +148 -0
- metadata +22 -6
- data/lib/mida/vocabulary/generic.rb +0 -15
- data/lib/mida/vocabularydesc.rb +0 -57
- data/spec/vocabularydesc_spec.rb +0 -106
data/CHANGELOG.rdoc
CHANGED
@@ -1,7 +1,23 @@
|
|
1
|
+
== 0.3.0 (29th June 2011)
|
2
|
+
* Merge +VocabularyDesc+ into +Vocabulary+
|
3
|
+
* Vocabularies are now auto registered using +inherited+ hook
|
4
|
+
* Removed vocabulary from <tt>Item#to_h</tt>
|
5
|
+
* Deprecate +types+ to describe a Vocabulary property in favour of +extract+
|
6
|
+
* Add +DataType+ so you can use <tt>DataType::Text</tt> instead of +String+ for a
|
7
|
+
type
|
8
|
+
* Add various <tt>DataType</tt>s: +Boolean+, +Float+, +Integer+, +Number+,
|
9
|
+
+ISO8601Date+, +Text+
|
10
|
+
* Add Bundler support
|
11
|
+
* Properties marked as <tt>has_one</tt> now output a single value instead of
|
12
|
+
an +Array+
|
13
|
+
* <tt>Document#search</tt> now only uses a +Regexp+ to search with
|
14
|
+
* +Document+ now includes +Enumerable+ Mixin
|
15
|
+
|
1
16
|
== 0.2.0 (3rd May 2011)
|
2
17
|
* Add ability to describe and conform to vocabularies
|
3
|
-
* Rename Mida::Property to Mida::Itemprop to better reflect
|
4
|
-
|
18
|
+
* Rename <tt>Mida::Property</tt> to <tt>Mida::Itemprop</tt> to better reflect
|
19
|
+
use
|
20
|
+
* Make some of the <tt>Mida::Itemprop</tt> class methods private
|
5
21
|
|
6
22
|
== 0.1.3 (18th April 2011)
|
7
23
|
* Ensure itemprops are parsed properly if containing non-microdata elements
|
data/README.rdoc
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
= Mida
|
2
2
|
|
3
|
-
* {Mida Project Page}[
|
3
|
+
* {Mida Project Page}[http://lawrencewoodman.github.com/mida]
|
4
|
+
* {Mida Github Repository}[https://github.com/LawrenceWoodman/mida]
|
4
5
|
* {Mida Bug Tracker}[https://github.com/LawrenceWoodman/mida/issues]
|
5
6
|
|
6
7
|
== Description
|
@@ -43,8 +44,8 @@ To return all the +Items+ that use one of Google's Review vocabularies:
|
|
43
44
|
doc.search(%r{http://data-vocabulary\.org.*?review.*?}i)
|
44
45
|
|
45
46
|
=== Inspecting an +Item+
|
46
|
-
Each +Item+ is a <tt>Mida::Item</tt> instance and has
|
47
|
-
interest
|
47
|
+
Each +Item+ is a <tt>Mida::Item</tt> instance and has four main methods of
|
48
|
+
interest: +type+, +vocabulary+, +properties+ and +id+.
|
48
49
|
|
49
50
|
To find out the +itemtype+ of the +Item+:
|
50
51
|
puts doc.items.first.type
|
@@ -60,21 +61,31 @@ To see the +properties+ of the +Item+:
|
|
60
61
|
|
61
62
|
=== Working with Vocabularies
|
62
63
|
Mida allows you to define vocabularies, so that input data can be constrained to match
|
63
|
-
expected patterns. By default a generic vocabulary (<tt>Mida::
|
64
|
+
expected patterns. By default a generic vocabulary (<tt>Mida::GenericVocabulary</tt>)
|
64
65
|
is registered which will match against any +itemtype+ with any number of properties.
|
65
66
|
|
66
|
-
If you want to specify a vocabulary you create a class derived from <tt>Mida::
|
67
|
-
and use +itemtype+, +has_one+, +has_many+ and +
|
67
|
+
If you want to specify a vocabulary you create a class derived from <tt>Mida::Vocabulary</tt>
|
68
|
+
and use +itemtype+, +has_one+, +has_many+ and +extract+ to describe the vocabulary.
|
68
69
|
|
69
70
|
As an example the following describes a subset of Google's Review vocabulary:
|
70
|
-
|
71
|
-
|
71
|
+
|
72
|
+
class Rating < Mida::Vocabulary
|
73
|
+
itemtype %r{http://data-vocabulary.org/rating}i
|
74
|
+
has_one 'best'
|
75
|
+
has_one 'worst'
|
76
|
+
has_one 'value'
|
77
|
+
end
|
78
|
+
|
79
|
+
class Review < Mida::Vocabulary
|
80
|
+
itemtype %r{http://data-vocabulary.org/review}i
|
72
81
|
has_one 'itemreviewed'
|
73
|
-
has_one 'rating'
|
82
|
+
has_one 'rating' do
|
83
|
+
extract Rating, Mida::DataType::Text
|
84
|
+
end
|
74
85
|
end
|
75
86
|
|
76
|
-
|
77
|
-
|
87
|
+
When you create a subclass of <tt>Mida::Vocabulary</tt> it automatically
|
88
|
+
registers the Vocabulary.
|
78
89
|
|
79
90
|
Now if Mida is parsing some input and manages to match against the +Review+ +itemtype+, it
|
80
91
|
will only allow the specified properties and will reject any that don't have the correct number. It
|
data/Rakefile
CHANGED
@@ -6,10 +6,10 @@ spec = Gem::Specification.new do |s|
|
|
6
6
|
s.name = "mida"
|
7
7
|
s.summary = "A Microdata parser/extractor library"
|
8
8
|
s.description = "A Microdata parser and extractor library, based on the latest published version of the Microdata Specification, dated 5th April 2011."
|
9
|
-
s.version = "0.
|
9
|
+
s.version = "0.3.0"
|
10
10
|
s.author = "Lawrence Woodman"
|
11
11
|
s.email = "lwoodman@vlifesystems.com"
|
12
|
-
s.homepage = %q{http://github.com/
|
12
|
+
s.homepage = %q{http://lawrencewoodman.github.com/mida/}
|
13
13
|
s.platform = Gem::Platform::RUBY
|
14
14
|
s.required_ruby_version = '>=1.9'
|
15
15
|
s.files = Dir['lib/**/*.rb'] + Dir['spec/**/*.rb'] + Dir['*.rdoc'] + Dir['Rakefile']
|
data/lib/mida.rb
CHANGED
@@ -0,0 +1,15 @@
|
|
1
|
+
module Mida
|
2
|
+
# Module to hold the various data types.
|
3
|
+
# Each DataType should be a module containing the class method: +extract+
|
4
|
+
# which returns the value extracted or raises an +ArgumentError+ exception
|
5
|
+
# if input value is not valid.
|
6
|
+
module DataType
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
require 'mida/datatype/boolean'
|
11
|
+
require 'mida/datatype/float'
|
12
|
+
require 'mida/datatype/integer'
|
13
|
+
require 'mida/datatype/iso8601date'
|
14
|
+
require 'mida/datatype/number'
|
15
|
+
require 'mida/datatype/text'
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Mida
|
2
|
+
module DataType
|
3
|
+
|
4
|
+
# Boolean data type
|
5
|
+
module Boolean
|
6
|
+
|
7
|
+
# Returns the +value+ as a boolean
|
8
|
+
# or raises ArgumentError if not valid
|
9
|
+
def self.extract(value)
|
10
|
+
case value.downcase
|
11
|
+
when 'true' then true
|
12
|
+
when 'false' then false
|
13
|
+
else raise ArgumentError, 'Invalid value'
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Mida
|
2
|
+
module DataType
|
3
|
+
|
4
|
+
# Float data type
|
5
|
+
module Float
|
6
|
+
|
7
|
+
# Returns the +value+ as a floating point number
|
8
|
+
# Relies on +Float+ to raise +ArgumentError+ if not valid
|
9
|
+
def self.extract(value)
|
10
|
+
Float(value)
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'date'
|
2
|
+
|
3
|
+
module Mida
|
4
|
+
module DataType
|
5
|
+
|
6
|
+
# ISO 8601 Date data type
|
7
|
+
module ISO8601Date
|
8
|
+
|
9
|
+
# Returns the +value+ as a +DateTime+ instance
|
10
|
+
# Relies on <tt>DateTime#iso8601</tt> to raise
|
11
|
+
# +ArgumentError+ if not valid
|
12
|
+
def self.extract(value)
|
13
|
+
DateTime.iso8601(value)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/mida/document.rb
CHANGED
@@ -4,6 +4,7 @@ module Mida
|
|
4
4
|
|
5
5
|
# Class that holds the extracted Microdata
|
6
6
|
class Document
|
7
|
+
include Enumerable
|
7
8
|
|
8
9
|
# An Array of Mida::Item objects. These are all top-level
|
9
10
|
# and hence not properties of other Items
|
@@ -20,25 +21,27 @@ module Mida
|
|
20
21
|
@items = extract_items
|
21
22
|
end
|
22
23
|
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
# to match against
|
28
|
-
def search(vocabulary, items=@items)
|
29
|
-
found_items = []
|
30
|
-
regexp_passed = vocabulary.kind_of?(Regexp)
|
31
|
-
regexp = if regexp_passed then vocabulary else vocabulary.itemtype end
|
24
|
+
# Implements method for Enumerable
|
25
|
+
def each
|
26
|
+
@items.each {|item| yield(item)}
|
27
|
+
end
|
32
28
|
|
33
|
-
|
29
|
+
# Returns an array of matching <tt>Mida::Item</tt> objects
|
30
|
+
#
|
31
|
+
# This drills down through each +Item+ to find matching items
|
32
|
+
#
|
33
|
+
# [itemtype] A regexp to match the item types against
|
34
|
+
# [items] An array of items to search. If no argument supplied, will
|
35
|
+
# search through all items in the document.
|
36
|
+
def search(itemtype, items=@items)
|
37
|
+
items.each_with_object([]) do |item, found_items|
|
34
38
|
# Allows matching against empty string, otherwise couldn't match
|
35
39
|
# as item.type can be nil
|
36
|
-
if (item.type.nil? && "" =~
|
40
|
+
if (item.type.nil? && "" =~ itemtype) || (item.type =~ itemtype)
|
37
41
|
found_items << item
|
38
42
|
end
|
39
|
-
found_items
|
43
|
+
found_items.concat(search_values(item.properties.values, itemtype))
|
40
44
|
end
|
41
|
-
found_items
|
42
45
|
end
|
43
46
|
|
44
47
|
private
|
@@ -47,18 +50,17 @@ module Mida
|
|
47
50
|
return nil unless itemscopes
|
48
51
|
|
49
52
|
itemscopes.collect do |itemscope|
|
50
|
-
|
53
|
+
itemscope = Itemscope.new(itemscope, @page_url)
|
54
|
+
Item.new(itemscope)
|
51
55
|
end
|
52
56
|
end
|
53
57
|
|
54
58
|
def search_values(values, vocabulary)
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
elsif value.is_a?(Array) then items += search_values(value, vocabulary)
|
59
|
+
values.each_with_object([]) do |value, items|
|
60
|
+
if value.is_a?(Item) then items.concat(search(vocabulary, [value]))
|
61
|
+
elsif value.is_a?(Array) then items.concat(search_values(value, vocabulary))
|
59
62
|
end
|
60
63
|
end
|
61
|
-
items
|
62
64
|
end
|
63
65
|
|
64
66
|
end
|
data/lib/mida/item.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
+
require 'mida'
|
2
3
|
|
3
4
|
module Mida
|
4
5
|
|
5
|
-
# Class that holds
|
6
|
+
# Class that holds a validated item
|
6
7
|
class Item
|
8
|
+
|
7
9
|
# The vocabulary used to interpret this item
|
8
10
|
attr_reader :vocabulary
|
9
11
|
|
@@ -18,29 +20,26 @@ module Mida
|
|
18
20
|
# or <tt>Mida::Item</tt> instances
|
19
21
|
attr_reader :properties
|
20
22
|
|
21
|
-
# Create a new Item object
|
23
|
+
# Create a new Item object from an +Itemscope+ and validate
|
24
|
+
# its +properties+
|
22
25
|
#
|
23
|
-
# [itemscope] The itemscope that
|
24
|
-
|
25
|
-
|
26
|
-
@
|
27
|
-
@type, @id = extract_attribute('itemtype'), extract_attribute('itemid')
|
26
|
+
# [itemscope] The itemscope that has been parsed by +Itemscope+
|
27
|
+
def initialize(itemscope)
|
28
|
+
@type = itemscope.type
|
29
|
+
@id = itemscope.id
|
28
30
|
@vocabulary = Mida::Vocabulary.find(@type)
|
29
|
-
@properties =
|
30
|
-
add_itemref_properties
|
31
|
-
parse_elements(extract_elements(@itemscope))
|
31
|
+
@properties = itemscope.properties
|
32
32
|
validate_properties
|
33
33
|
end
|
34
34
|
|
35
35
|
# Return a Hash representation
|
36
36
|
# of the form:
|
37
|
-
# {
|
38
|
-
# type: 'The item type',
|
37
|
+
# { type: 'http://example.com/vocab/review',
|
39
38
|
# id: 'urn:isbn:1-934356-08-5',
|
40
39
|
# properties: {'a name' => 'avalue' }
|
41
40
|
# }
|
42
41
|
def to_h
|
43
|
-
{
|
42
|
+
{type: @type, id: @id, properties: properties_to_h(@properties)}
|
44
43
|
end
|
45
44
|
|
46
45
|
def to_s
|
@@ -58,63 +57,101 @@ module Mida
|
|
58
57
|
def validate_properties
|
59
58
|
@properties =
|
60
59
|
@properties.each_with_object({}) do |(property, values), hash|
|
61
|
-
|
62
|
-
|
63
|
-
end
|
60
|
+
valid_values = validate_values(property, values)
|
61
|
+
hash[property] = valid_values unless valid_values.nil?
|
64
62
|
end
|
65
63
|
end
|
66
64
|
|
67
|
-
# Return whether the number of values conforms to
|
68
|
-
def valid_num_values?(
|
69
|
-
|
70
|
-
property_spec = @vocabulary.prop_spec[property]
|
71
|
-
(property_spec[:num] == :many ||
|
72
|
-
(property_spec[:num] == :one && values.length == 1))
|
65
|
+
# Return whether the number of values conforms to +num+
|
66
|
+
def valid_num_values?(num, values)
|
67
|
+
num == :many || (num == :one && values.length == 1)
|
73
68
|
end
|
74
69
|
|
70
|
+
# Return whether this property name is valid
|
75
71
|
def valid_property?(property, values)
|
76
|
-
[property, :any].any?
|
72
|
+
[property, :any].any? do |prop|
|
73
|
+
@vocabulary.properties.has_key?(prop)
|
74
|
+
end
|
77
75
|
end
|
78
76
|
|
79
|
-
|
80
|
-
|
81
|
-
|
77
|
+
# Return valid values, converted to the correct +DataType+
|
78
|
+
# or +Item+ and number if necessary
|
79
|
+
def validate_values(property, values)
|
80
|
+
return nil unless valid_property?(property, values)
|
81
|
+
prop_num = property_number(property)
|
82
|
+
return nil unless valid_num_values?(prop_num, values)
|
83
|
+
prop_types = property_types(property)
|
84
|
+
|
85
|
+
valid_values = values.each_with_object([]) do |value, valid_values|
|
86
|
+
new_value = validate_value(prop_types, value)
|
87
|
+
valid_values << new_value unless new_value.nil?
|
88
|
+
end
|
89
|
+
|
90
|
+
# Convert property to correct number
|
91
|
+
prop_num == :many ? valid_values : valid_values.first
|
92
|
+
end
|
93
|
+
|
94
|
+
# Returns value converted to correct +DataType+ or +Item+
|
95
|
+
# or +nil+ if not valid
|
96
|
+
def validate_value(prop_types, value)
|
97
|
+
if is_itemscope?(value)
|
98
|
+
valid_itemtype?(prop_types, value.type) ? Item.new(value) : nil
|
99
|
+
elsif (extract_value = datatype_extract(prop_types, value))
|
100
|
+
extract_value
|
101
|
+
elsif prop_types.include?(:any)
|
102
|
+
value
|
82
103
|
else
|
83
|
-
|
104
|
+
nil
|
84
105
|
end
|
106
|
+
end
|
85
107
|
|
86
|
-
|
108
|
+
# Return the correct type for this property
|
109
|
+
def property_types(property)
|
110
|
+
if @vocabulary.properties.has_key?(property)
|
111
|
+
@vocabulary.properties[property][:types]
|
112
|
+
else
|
113
|
+
@vocabulary.properties[:any][:types]
|
114
|
+
end
|
87
115
|
end
|
88
116
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
return true
|
117
|
+
# Return the correct number for this property
|
118
|
+
def property_number(property)
|
119
|
+
if @vocabulary.properties.has_key?(property)
|
120
|
+
@vocabulary.properties[property][:num]
|
121
|
+
else
|
122
|
+
@vocabulary.properties[:any][:num]
|
96
123
|
end
|
97
|
-
false
|
98
124
|
end
|
99
125
|
|
100
|
-
def
|
101
|
-
|
126
|
+
def is_itemscope?(object)
|
127
|
+
object.kind_of?(Itemscope)
|
102
128
|
end
|
103
129
|
|
104
|
-
|
105
|
-
|
130
|
+
# Returns whether the +itemtype+ is a valid type
|
131
|
+
def valid_itemtype?(valid_types, itemtype)
|
132
|
+
return true if valid_types.include?(:any)
|
133
|
+
|
134
|
+
valid_types.find do |type|
|
135
|
+
type.respond_to?(:itemtype) && type.itemtype =~ itemtype
|
136
|
+
end
|
106
137
|
end
|
107
138
|
|
108
|
-
#
|
109
|
-
|
110
|
-
|
139
|
+
# Returns the extracted value or +nil+ if none of the datatypes
|
140
|
+
# could extract the +value+
|
141
|
+
def datatype_extract(valid_types, value)
|
142
|
+
valid_types.find do |type|
|
143
|
+
begin
|
144
|
+
return type.extract(value) if type.respond_to?(:extract)
|
145
|
+
rescue ArgumentError
|
146
|
+
end
|
147
|
+
end
|
148
|
+
nil
|
111
149
|
end
|
112
150
|
|
113
151
|
# The value as it should appear in to_h()
|
114
152
|
def value_to_h(value)
|
115
|
-
|
116
|
-
|
117
|
-
when value.is_a?(Item) then value.to_h
|
153
|
+
if value.is_a?(Array) then value.collect {|element| value_to_h(element)}
|
154
|
+
elsif value.is_a?(Item) then value.to_h
|
118
155
|
else value
|
119
156
|
end
|
120
157
|
end
|
@@ -125,31 +162,6 @@ module Mida
|
|
125
162
|
end
|
126
163
|
end
|
127
164
|
|
128
|
-
# Add any properties referred to by 'itemref'
|
129
|
-
def add_itemref_properties
|
130
|
-
itemref = extract_attribute('itemref')
|
131
|
-
if itemref
|
132
|
-
itemref.split.each {|id| parse_elements(find_with_id(id))}
|
133
|
-
end
|
134
|
-
end
|
135
|
-
|
136
|
-
def parse_elements(elements)
|
137
|
-
elements.each {|element| parse_element(element)}
|
138
|
-
end
|
139
|
-
|
140
|
-
def parse_element(element)
|
141
|
-
itemscope = element.attribute('itemscope')
|
142
|
-
itemprop = element.attribute('itemprop')
|
143
|
-
internal_elements = extract_elements(element)
|
144
|
-
add_itemprop(element) if itemscope || itemprop
|
145
|
-
parse_elements(internal_elements) if internal_elements && !itemscope
|
146
|
-
end
|
147
|
-
|
148
|
-
def add_itemprop(itemprop)
|
149
|
-
properties = Itemprop.parse(itemprop, @page_url)
|
150
|
-
properties.each { |name, value| (@properties[name] ||= []) << value }
|
151
|
-
end
|
152
|
-
|
153
165
|
end
|
154
166
|
|
155
167
|
end
|