mida 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.rdoc +18 -2
- data/README.rdoc +22 -11
- data/Rakefile +2 -2
- data/lib/mida.rb +1 -1
- data/lib/mida/datatype.rb +15 -0
- data/lib/mida/datatype/boolean.rb +18 -0
- data/lib/mida/datatype/float.rb +15 -0
- data/lib/mida/datatype/integer.rb +15 -0
- data/lib/mida/datatype/iso8601date.rb +17 -0
- data/lib/mida/datatype/number.rb +15 -0
- data/lib/mida/datatype/text.rb +13 -0
- data/lib/mida/document.rb +21 -19
- data/lib/mida/genericvocabulary.rb +13 -0
- data/lib/mida/item.rb +83 -71
- data/lib/mida/itemprop.rb +55 -30
- data/lib/mida/itemscope.rb +82 -0
- data/lib/mida/propertydesc.rb +36 -0
- data/lib/mida/vocabulary.rb +60 -6
- data/spec/datatype/boolean_spec.rb +27 -0
- data/spec/datatype/float_spec.rb +23 -0
- data/spec/datatype/integer_spec.rb +23 -0
- data/spec/datatype/iso8601date_spec.rb +20 -0
- data/spec/datatype/number_spec.rb +23 -0
- data/spec/datatype/text_spec.rb +14 -0
- data/spec/document_spec.rb +31 -487
- data/spec/item_spec.rb +163 -472
- data/spec/itemprop_spec.rb +40 -45
- data/spec/itemscope_spec.rb +287 -0
- data/spec/propertydesc_spec.rb +56 -0
- data/spec/spec_helper.rb +13 -36
- data/spec/vocabulary_spec.rb +148 -0
- metadata +22 -6
- data/lib/mida/vocabulary/generic.rb +0 -15
- data/lib/mida/vocabularydesc.rb +0 -57
- data/spec/vocabularydesc_spec.rb +0 -106
data/CHANGELOG.rdoc
CHANGED
@@ -1,7 +1,23 @@
|
|
1
|
+
== 0.3.0 (29th June 2011)
|
2
|
+
* Merge +VocabularyDesc+ into +Vocabulary+
|
3
|
+
* Vocabularies are now auto registered using +inherited+ hook
|
4
|
+
* Removed vocabulary from <tt>Item#to_h</tt>
|
5
|
+
* Deprecate +types+ to describe a Vocabulary property in favour of +extract+
|
6
|
+
* Add +DataType+ so you can use <tt>DataType::Text</tt> instead of +String+ for a
|
7
|
+
type
|
8
|
+
* Add various <tt>DataType</tt>s: +Boolean+, +Float+, +Integer+, +Number+,
|
9
|
+
+ISO8601Date+, +Text+
|
10
|
+
* Add Bundler support
|
11
|
+
* Properties marked as <tt>has_one</tt> now output a single value instead of
|
12
|
+
an +Array+
|
13
|
+
* <tt>Document#search</tt> now only uses a +Regexp+ to search with
|
14
|
+
* +Document+ now includes +Enumerable+ Mixin
|
15
|
+
|
1
16
|
== 0.2.0 (3rd May 2011)
|
2
17
|
* Add ability to describe and conform to vocabularies
|
3
|
-
* Rename Mida::Property to Mida::Itemprop to better reflect
|
4
|
-
|
18
|
+
* Rename <tt>Mida::Property</tt> to <tt>Mida::Itemprop</tt> to better reflect
|
19
|
+
use
|
20
|
+
* Make some of the <tt>Mida::Itemprop</tt> class methods private
|
5
21
|
|
6
22
|
== 0.1.3 (18th April 2011)
|
7
23
|
* Ensure itemprops are parsed properly if containing non-microdata elements
|
data/README.rdoc
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
= Mida
|
2
2
|
|
3
|
-
* {Mida Project Page}[
|
3
|
+
* {Mida Project Page}[http://lawrencewoodman.github.com/mida]
|
4
|
+
* {Mida Github Repository}[https://github.com/LawrenceWoodman/mida]
|
4
5
|
* {Mida Bug Tracker}[https://github.com/LawrenceWoodman/mida/issues]
|
5
6
|
|
6
7
|
== Description
|
@@ -43,8 +44,8 @@ To return all the +Items+ that use one of Google's Review vocabularies:
|
|
43
44
|
doc.search(%r{http://data-vocabulary\.org.*?review.*?}i)
|
44
45
|
|
45
46
|
=== Inspecting an +Item+
|
46
|
-
Each +Item+ is a <tt>Mida::Item</tt> instance and has
|
47
|
-
interest
|
47
|
+
Each +Item+ is a <tt>Mida::Item</tt> instance and has four main methods of
|
48
|
+
interest: +type+, +vocabulary+, +properties+ and +id+.
|
48
49
|
|
49
50
|
To find out the +itemtype+ of the +Item+:
|
50
51
|
puts doc.items.first.type
|
@@ -60,21 +61,31 @@ To see the +properties+ of the +Item+:
|
|
60
61
|
|
61
62
|
=== Working with Vocabularies
|
62
63
|
Mida allows you to define vocabularies, so that input data can be constrained to match
|
63
|
-
expected patterns. By default a generic vocabulary (<tt>Mida::
|
64
|
+
expected patterns. By default a generic vocabulary (<tt>Mida::GenericVocabulary</tt>)
|
64
65
|
is registered which will match against any +itemtype+ with any number of properties.
|
65
66
|
|
66
|
-
If you want to specify a vocabulary you create a class derived from <tt>Mida::
|
67
|
-
and use +itemtype+, +has_one+, +has_many+ and +
|
67
|
+
If you want to specify a vocabulary you create a class derived from <tt>Mida::Vocabulary</tt>
|
68
|
+
and use +itemtype+, +has_one+, +has_many+ and +extract+ to describe the vocabulary.
|
68
69
|
|
69
70
|
As an example the following describes a subset of Google's Review vocabulary:
|
70
|
-
|
71
|
-
|
71
|
+
|
72
|
+
class Rating < Mida::Vocabulary
|
73
|
+
itemtype %r{http://data-vocabulary.org/rating}i
|
74
|
+
has_one 'best'
|
75
|
+
has_one 'worst'
|
76
|
+
has_one 'value'
|
77
|
+
end
|
78
|
+
|
79
|
+
class Review < Mida::Vocabulary
|
80
|
+
itemtype %r{http://data-vocabulary.org/review}i
|
72
81
|
has_one 'itemreviewed'
|
73
|
-
has_one 'rating'
|
82
|
+
has_one 'rating' do
|
83
|
+
extract Rating, Mida::DataType::Text
|
84
|
+
end
|
74
85
|
end
|
75
86
|
|
76
|
-
|
77
|
-
|
87
|
+
When you create a subclass of <tt>Mida::Vocabulary</tt> it automatically
|
88
|
+
registers the Vocabulary.
|
78
89
|
|
79
90
|
Now if Mida is parsing some input and manages to match against the +Review+ +itemtype+, it
|
80
91
|
will only allow the specified properties and will reject any that don't have the correct number. It
|
data/Rakefile
CHANGED
@@ -6,10 +6,10 @@ spec = Gem::Specification.new do |s|
|
|
6
6
|
s.name = "mida"
|
7
7
|
s.summary = "A Microdata parser/extractor library"
|
8
8
|
s.description = "A Microdata parser and extractor library, based on the latest published version of the Microdata Specification, dated 5th April 2011."
|
9
|
-
s.version = "0.
|
9
|
+
s.version = "0.3.0"
|
10
10
|
s.author = "Lawrence Woodman"
|
11
11
|
s.email = "lwoodman@vlifesystems.com"
|
12
|
-
s.homepage = %q{http://github.com/
|
12
|
+
s.homepage = %q{http://lawrencewoodman.github.com/mida/}
|
13
13
|
s.platform = Gem::Platform::RUBY
|
14
14
|
s.required_ruby_version = '>=1.9'
|
15
15
|
s.files = Dir['lib/**/*.rb'] + Dir['spec/**/*.rb'] + Dir['*.rdoc'] + Dir['Rakefile']
|
data/lib/mida.rb
CHANGED
@@ -0,0 +1,15 @@
|
|
1
|
+
module Mida
|
2
|
+
# Module to hold the various data types.
|
3
|
+
# Each DataType should be a module containing the class method: +extract+
|
4
|
+
# which returns the value extracted or raises an +ArgumentError+ exception
|
5
|
+
# if input value is not valid.
|
6
|
+
module DataType
|
7
|
+
end
|
8
|
+
end
|
9
|
+
|
10
|
+
require 'mida/datatype/boolean'
|
11
|
+
require 'mida/datatype/float'
|
12
|
+
require 'mida/datatype/integer'
|
13
|
+
require 'mida/datatype/iso8601date'
|
14
|
+
require 'mida/datatype/number'
|
15
|
+
require 'mida/datatype/text'
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Mida
|
2
|
+
module DataType
|
3
|
+
|
4
|
+
# Boolean data type
|
5
|
+
module Boolean
|
6
|
+
|
7
|
+
# Returns the +value+ as a boolean
|
8
|
+
# or raises ArgumentError if not valid
|
9
|
+
def self.extract(value)
|
10
|
+
case value.downcase
|
11
|
+
when 'true' then true
|
12
|
+
when 'false' then false
|
13
|
+
else raise ArgumentError, 'Invalid value'
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Mida
|
2
|
+
module DataType
|
3
|
+
|
4
|
+
# Float data type
|
5
|
+
module Float
|
6
|
+
|
7
|
+
# Returns the +value+ as a floating point number
|
8
|
+
# Relies on +Float+ to raise +ArgumentError+ if not valid
|
9
|
+
def self.extract(value)
|
10
|
+
Float(value)
|
11
|
+
end
|
12
|
+
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'date'
|
2
|
+
|
3
|
+
module Mida
|
4
|
+
module DataType
|
5
|
+
|
6
|
+
# ISO 8601 Date data type
|
7
|
+
module ISO8601Date
|
8
|
+
|
9
|
+
# Returns the +value+ as a +DateTime+ instance
|
10
|
+
# Relies on <tt>DateTime.iso8601</tt> to raise
|
11
|
+
# +ArgumentError+ if not valid
|
12
|
+
def self.extract(value)
|
13
|
+
DateTime.iso8601(value)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
data/lib/mida/document.rb
CHANGED
@@ -4,6 +4,7 @@ module Mida
|
|
4
4
|
|
5
5
|
# Class that holds the extracted Microdata
|
6
6
|
class Document
|
7
|
+
include Enumerable
|
7
8
|
|
8
9
|
# An Array of Mida::Item objects. These are all top-level
|
9
10
|
# and hence not properties of other Items
|
@@ -20,25 +21,27 @@ module Mida
|
|
20
21
|
@items = extract_items
|
21
22
|
end
|
22
23
|
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
# to match against
|
28
|
-
def search(vocabulary, items=@items)
|
29
|
-
found_items = []
|
30
|
-
regexp_passed = vocabulary.kind_of?(Regexp)
|
31
|
-
regexp = if regexp_passed then vocabulary else vocabulary.itemtype end
|
24
|
+
# Implements method for Enumerable
|
25
|
+
def each
|
26
|
+
@items.each {|item| yield(item)}
|
27
|
+
end
|
32
28
|
|
33
|
-
|
29
|
+
# Returns an array of matching <tt>Mida::Item</tt> objects
|
30
|
+
#
|
31
|
+
# This drills down through each +Item+ to find matching items
|
32
|
+
#
|
33
|
+
# [itemtype] A regexp to match the item types against
|
34
|
+
# [items] An array of items to search. If no argument supplied, will
|
35
|
+
# search through all items in the document.
|
36
|
+
def search(itemtype, items=@items)
|
37
|
+
items.each_with_object([]) do |item, found_items|
|
34
38
|
# Allows matching against empty string, otherwise couldn't match
|
35
39
|
# as item.type can be nil
|
36
|
-
if (item.type.nil? && "" =~
|
40
|
+
if (item.type.nil? && "" =~ itemtype) || (item.type =~ itemtype)
|
37
41
|
found_items << item
|
38
42
|
end
|
39
|
-
found_items
|
43
|
+
found_items.concat(search_values(item.properties.values, itemtype))
|
40
44
|
end
|
41
|
-
found_items
|
42
45
|
end
|
43
46
|
|
44
47
|
private
|
@@ -47,18 +50,17 @@ module Mida
|
|
47
50
|
return nil unless itemscopes
|
48
51
|
|
49
52
|
itemscopes.collect do |itemscope|
|
50
|
-
|
53
|
+
itemscope = Itemscope.new(itemscope, @page_url)
|
54
|
+
Item.new(itemscope)
|
51
55
|
end
|
52
56
|
end
|
53
57
|
|
54
58
|
def search_values(values, vocabulary)
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
elsif value.is_a?(Array) then items += search_values(value, vocabulary)
|
59
|
+
values.each_with_object([]) do |value, items|
|
60
|
+
if value.is_a?(Item) then items.concat(search(vocabulary, [value]))
|
61
|
+
elsif value.is_a?(Array) then items.concat(search_values(value, vocabulary))
|
59
62
|
end
|
60
63
|
end
|
61
|
-
items
|
62
64
|
end
|
63
65
|
|
64
66
|
end
|
data/lib/mida/item.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
+
require 'mida'
|
2
3
|
|
3
4
|
module Mida
|
4
5
|
|
5
|
-
# Class that holds
|
6
|
+
# Class that holds a validated item
|
6
7
|
class Item
|
8
|
+
|
7
9
|
# The vocabulary used to interpret this item
|
8
10
|
attr_reader :vocabulary
|
9
11
|
|
@@ -18,29 +20,26 @@ module Mida
|
|
18
20
|
# or <tt>Mida::Item</tt> instances
|
19
21
|
attr_reader :properties
|
20
22
|
|
21
|
-
# Create a new Item object
|
23
|
+
# Create a new Item object from an +Itemscope+ and validate
|
24
|
+
# its +properties+
|
22
25
|
#
|
23
|
-
# [itemscope] The itemscope that
|
24
|
-
|
25
|
-
|
26
|
-
@
|
27
|
-
@type, @id = extract_attribute('itemtype'), extract_attribute('itemid')
|
26
|
+
# [itemscope] The itemscope that has been parsed by +Itemscope+
|
27
|
+
def initialize(itemscope)
|
28
|
+
@type = itemscope.type
|
29
|
+
@id = itemscope.id
|
28
30
|
@vocabulary = Mida::Vocabulary.find(@type)
|
29
|
-
@properties =
|
30
|
-
add_itemref_properties
|
31
|
-
parse_elements(extract_elements(@itemscope))
|
31
|
+
@properties = itemscope.properties
|
32
32
|
validate_properties
|
33
33
|
end
|
34
34
|
|
35
35
|
# Return a Hash representation
|
36
36
|
# of the form:
|
37
|
-
# {
|
38
|
-
# type: 'The item type',
|
37
|
+
# { type: 'http://example.com/vocab/review',
|
39
38
|
# id: 'urn:isbn:1-934356-08-5',
|
40
39
|
# properties: {'a name' => 'avalue' }
|
41
40
|
# }
|
42
41
|
def to_h
|
43
|
-
{
|
42
|
+
{type: @type, id: @id, properties: properties_to_h(@properties)}
|
44
43
|
end
|
45
44
|
|
46
45
|
def to_s
|
@@ -58,63 +57,101 @@ module Mida
|
|
58
57
|
def validate_properties
|
59
58
|
@properties =
|
60
59
|
@properties.each_with_object({}) do |(property, values), hash|
|
61
|
-
|
62
|
-
|
63
|
-
end
|
60
|
+
valid_values = validate_values(property, values)
|
61
|
+
hash[property] = valid_values unless valid_values.nil?
|
64
62
|
end
|
65
63
|
end
|
66
64
|
|
67
|
-
# Return whether the number of values conforms to
|
68
|
-
def valid_num_values?(
|
69
|
-
|
70
|
-
property_spec = @vocabulary.prop_spec[property]
|
71
|
-
(property_spec[:num] == :many ||
|
72
|
-
(property_spec[:num] == :one && values.length == 1))
|
65
|
+
# Return whether the number of values conforms to +num+
|
66
|
+
def valid_num_values?(num, values)
|
67
|
+
num == :many || (num == :one && values.length == 1)
|
73
68
|
end
|
74
69
|
|
70
|
+
# Return whether this property name is valid
|
75
71
|
def valid_property?(property, values)
|
76
|
-
[property, :any].any?
|
72
|
+
[property, :any].any? do |prop|
|
73
|
+
@vocabulary.properties.has_key?(prop)
|
74
|
+
end
|
77
75
|
end
|
78
76
|
|
79
|
-
|
80
|
-
|
81
|
-
|
77
|
+
# Return valid values, converted to the correct +DataType+
|
78
|
+
# or +Item+ and number if necessary
|
79
|
+
def validate_values(property, values)
|
80
|
+
return nil unless valid_property?(property, values)
|
81
|
+
prop_num = property_number(property)
|
82
|
+
return nil unless valid_num_values?(prop_num, values)
|
83
|
+
prop_types = property_types(property)
|
84
|
+
|
85
|
+
valid_values = values.each_with_object([]) do |value, valid_values|
|
86
|
+
new_value = validate_value(prop_types, value)
|
87
|
+
valid_values << new_value unless new_value.nil?
|
88
|
+
end
|
89
|
+
|
90
|
+
# Convert property to correct number
|
91
|
+
prop_num == :many ? valid_values : valid_values.first
|
92
|
+
end
|
93
|
+
|
94
|
+
# Returns value converted to correct +DataType+ or +Item+
|
95
|
+
# or +nil+ if not valid
|
96
|
+
def validate_value(prop_types, value)
|
97
|
+
if is_itemscope?(value)
|
98
|
+
valid_itemtype?(prop_types, value.type) ? Item.new(value) : nil
|
99
|
+
elsif (extract_value = datatype_extract(prop_types, value))
|
100
|
+
extract_value
|
101
|
+
elsif prop_types.include?(:any)
|
102
|
+
value
|
82
103
|
else
|
83
|
-
|
104
|
+
nil
|
84
105
|
end
|
106
|
+
end
|
85
107
|
|
86
|
-
|
108
|
+
# Return the correct type for this property
|
109
|
+
def property_types(property)
|
110
|
+
if @vocabulary.properties.has_key?(property)
|
111
|
+
@vocabulary.properties[property][:types]
|
112
|
+
else
|
113
|
+
@vocabulary.properties[:any][:types]
|
114
|
+
end
|
87
115
|
end
|
88
116
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
return true
|
117
|
+
# Return the correct number for this property
|
118
|
+
def property_number(property)
|
119
|
+
if @vocabulary.properties.has_key?(property)
|
120
|
+
@vocabulary.properties[property][:num]
|
121
|
+
else
|
122
|
+
@vocabulary.properties[:any][:num]
|
96
123
|
end
|
97
|
-
false
|
98
124
|
end
|
99
125
|
|
100
|
-
def
|
101
|
-
|
126
|
+
def is_itemscope?(object)
|
127
|
+
object.kind_of?(Itemscope)
|
102
128
|
end
|
103
129
|
|
104
|
-
|
105
|
-
|
130
|
+
# Returns whether the +itemtype+ is a valid type
|
131
|
+
def valid_itemtype?(valid_types, itemtype)
|
132
|
+
return true if valid_types.include?(:any)
|
133
|
+
|
134
|
+
valid_types.find do |type|
|
135
|
+
type.respond_to?(:itemtype) && type.itemtype =~ itemtype
|
136
|
+
end
|
106
137
|
end
|
107
138
|
|
108
|
-
#
|
109
|
-
|
110
|
-
|
139
|
+
# Returns the extracted value or +nil+ if none of the datatypes
|
140
|
+
# could extract the +value+
|
141
|
+
def datatype_extract(valid_types, value)
|
142
|
+
valid_types.find do |type|
|
143
|
+
begin
|
144
|
+
return type.extract(value) if type.respond_to?(:extract)
|
145
|
+
rescue ArgumentError
|
146
|
+
end
|
147
|
+
end
|
148
|
+
nil
|
111
149
|
end
|
112
150
|
|
113
151
|
# The value as it should appear in to_h()
|
114
152
|
def value_to_h(value)
|
115
|
-
|
116
|
-
|
117
|
-
when value.is_a?(Item) then value.to_h
|
153
|
+
if value.is_a?(Array) then value.collect {|element| value_to_h(element)}
|
154
|
+
elsif value.is_a?(Item) then value.to_h
|
118
155
|
else value
|
119
156
|
end
|
120
157
|
end
|
@@ -125,31 +162,6 @@ module Mida
|
|
125
162
|
end
|
126
163
|
end
|
127
164
|
|
128
|
-
# Add any properties referred to by 'itemref'
|
129
|
-
def add_itemref_properties
|
130
|
-
itemref = extract_attribute('itemref')
|
131
|
-
if itemref
|
132
|
-
itemref.split.each {|id| parse_elements(find_with_id(id))}
|
133
|
-
end
|
134
|
-
end
|
135
|
-
|
136
|
-
def parse_elements(elements)
|
137
|
-
elements.each {|element| parse_element(element)}
|
138
|
-
end
|
139
|
-
|
140
|
-
def parse_element(element)
|
141
|
-
itemscope = element.attribute('itemscope')
|
142
|
-
itemprop = element.attribute('itemprop')
|
143
|
-
internal_elements = extract_elements(element)
|
144
|
-
add_itemprop(element) if itemscope || itemprop
|
145
|
-
parse_elements(internal_elements) if internal_elements && !itemscope
|
146
|
-
end
|
147
|
-
|
148
|
-
def add_itemprop(itemprop)
|
149
|
-
properties = Itemprop.parse(itemprop, @page_url)
|
150
|
-
properties.each { |name, value| (@properties[name] ||= []) << value }
|
151
|
-
end
|
152
|
-
|
153
165
|
end
|
154
166
|
|
155
167
|
end
|