mida 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ = The MIT License
2
+
3
+ Copyright (c) 2011 Lawrence Woodman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,68 @@
1
+ = Mida
2
+
3
+ * {Mida Project Page}[https://github.com/LawrenceWoodman/mida]
4
+ * {Mida Bug Tracker}[https://github.com/LawrenceWoodman/mida/issues]
5
+
6
+ == Description
7
+ A Microdata[http://en.wikipedia.org/wiki/Microdata_(HTML5)] parser and
8
+ extractor library for ruby.
9
+ This is based on the latest published version of the Microdata Specification
10
+ dated {5th April 2011}[http://www.w3.org/TR/2011/WD-microdata-20110405/].
11
+
12
+ == Installation
13
+ With Ruby and Rubygems:
14
+ gem install mida
15
+
16
+ === Requirements:
17
+
18
+ * +Nokogiri+
19
+
20
+ == Usage
21
+ The following examples assume that you have required +mida+ and
22
+ +open-uri+.
23
+
24
+ === Extracting Microdata from a page
25
+ All the Microdata is extracted from a page when a new <tt>Mida::Document</tt> instance
26
+ is created.
27
+
28
+ To extract all the Microdata from a webpage:
29
+ url = 'http://example.com'
30
+ doc = open(url) {|f| Mida::Document.new(f, url)}
31
+
32
+ The top-level +Items+ will be held in an array accessible via
33
+ <tt>doc.items</tt>.
34
+
35
+ To simply list all the top-level +Items+ that have been found:
36
+ puts doc.items
37
+
38
+ === Searching
39
+ If you want to search for an +Item+ that has a specific +itemtype+/vocabulary
40
+ this can be done with the +search+ method.
41
+
42
+ To return all the +Items+ that use one of Google's Review vocabularies:
43
+ doc.search(%r{http://data-vocabulary\.org.*?review.*?}i)
44
+
45
+ === Inspecting an +Item+
46
+ Each +Item+ is a <tt>Mida::Item</tt> instance and has three main methods of
47
+ interest, +type+, +properties+ and +id+.
48
+
49
+ To find out the +itemtype+ of the +Item+:
50
+ puts doc.items.first.type
51
+
52
+ To find out the +itemid+ of the +Item+:
53
+ puts doc.items.first.id
54
+
55
+ Properties are returned as a hash containing name/value pairs. The
56
+ values will be an array of either +String+ or <tt>Mida::Item</tt> instances.
57
+
58
+ To see the +properties+ of the +Item+:
59
+ puts doc.items.first.properties
60
+
61
+ == Bugs/Feature Requests
62
+ If you find a bug or want to make a feature request, please report it at the
63
+ Mida project's {issues tracker}[https://github.com/LawrenceWoodman/mida/issues]
64
+ on GitHub.
65
+
66
+ == License
67
+ Copyright (c) 2011 Lawrence Woodman.
68
+ This software is licensed under the MIT License. Please see the file, LICENSE.rdoc, for details.
@@ -0,0 +1,26 @@
1
# Rakefile for the mida gem: packages the gem and runs the specs.
#
# Gem::PackageTask (rubygems/package_task) is used rather than
# Rake::GemPackageTask (rake/gempackagetask), which current versions of Rake
# no longer provide.
require 'rubygems/package_task'
require 'rspec/core/rake_task'

# Running plain `rake` runs the specs.
task :default => :spec

# The gem's metadata. The long description is the full text of README.rdoc,
# read from the directory holding this Rakefile.
spec = Gem::Specification.new do |s|
  s.name = "mida"
  s.summary = "A Microdata parser"
  s.description = File.read(File.join(File.dirname(__FILE__), 'README.rdoc'))
  s.version = "0.0.0"
  s.author = "Lawrence Woodman"
  s.email = "lwoodman@vlifesystems.com"
  s.homepage = %q{http://github.com/LawrenceWoodman/mida}
  s.platform = Gem::Platform::RUBY
  s.required_ruby_version = '>=1.9'
  s.files = Dir['lib/**/*.rb'] + Dir['spec/**/*.rb'] + Dir['*.rdoc'] + Dir['Rakefile']
  # NOTE(review): s.has_rdoc = true was dropped; it is deprecated and
  # believed to be ignored by current RubyGems - confirm before release.
  s.extra_rdoc_files = ['README.rdoc', 'LICENSE.rdoc']
  s.rdoc_options << '--main' << 'README.rdoc'
  s.add_dependency('nokogiri')
  s.add_development_dependency('rspec')
end

# Defines the packaging tasks; they carry their own task descriptions.
Gem::PackageTask.new(spec).define

desc "Run Specs"
RSpec::Core::RakeTask.new(:spec)
@@ -0,0 +1,6 @@
1
+ = Todo List
2
+
3
+ * Support img rating in alt for google?
4
+ * Look further into the extra complications of microdata, e.g. alt attribute for img rating and different size ratings
5
+ http://www.google.com/support/webmasters/bin/answer.py?answer=172705
6
+ * Put nested itemscopes that are not a property of their parent into the parent's hash using [:nested]
@@ -0,0 +1,6 @@
1
# Entry point for the library: puts this file's directory on the load path
# and then requires every ruby file found in its mida/ subdirectory.
$LOAD_PATH.unshift File.dirname(__FILE__)

# The list is sorted because Dir[] does not promise any particular ordering
# on older rubies, and the load order should not vary between machines.
Dir[File.dirname(__FILE__) + '/mida/*.rb'].sort.each { |f| require f }

# Mida is a Microdata parser and extractor.
module Mida
end
@@ -0,0 +1,61 @@
1
+ require 'nokogiri'
2
+
3
+ module Mida
4
+
5
# Holds the Microdata extracted from a single document
class Document

  # An Array of the top-level Mida::Item objects, i.e. the itemscopes
  # that are not themselves a property of another Item
  attr_reader :items

  # Parse +target+ and extract all of its Microdata
  #
  # [target] The html that you want to parse. It is handed straight to
  #          Nokogiri(), so it may be a String or, as in the README, an
  #          open IO.
  # [page_url] The url of target, used to form absolute urls. This must
  #            include the filename, e.g. index.html.
  def initialize(target, page_url=nil)
    @page_url = page_url
    @doc = Nokogiri(target)
    @items = extract_items
  end

  # Returns an array of matching Mida::Item objects. Each matching Item is
  # followed by the matches found among its own property values.
  #
  # [vocabulary] A regexp to match the item types against
  # [items] The Items to search, defaulting to the top-level Items
  def search(vocabulary, items=@items)
    items.flat_map do |item|
      # An Item without a type is matched as though its type were the
      # empty string, so that such Items can still be searched for.
      subject = item.type.nil? ? "" : item.type
      own_match = (subject =~ vocabulary) ? [item] : []
      own_match + search_values(item.properties.values, vocabulary)
    end
  end

  private

  # Build an Item for every itemscope that is not also an itemprop
  def extract_items
    scopes = @doc.search('//*[@itemscope and not(@itemprop)]')
    scopes ? scopes.map { |scope| Item.new(scope, @page_url) } : nil
  end

  # Search the Items held, at any depth, within +values+.
  # Values that are neither an Item nor an Array contribute nothing.
  def search_values(values, vocabulary)
    values.flat_map do |value|
      case value
      when Mida::Item then search(vocabulary, [value])
      when Array then search_values(value, vocabulary)
      else []
      end
    end
  end

end
60
+
61
+ end
@@ -0,0 +1,100 @@
1
+ require 'nokogiri'
2
+
3
+ module Mida
4
+
5
# Class that holds each item/itemscope
class Item
  # The Type of the item (its itemtype attribute), or nil if it has none
  attr_reader :type

  # The Global Identifier of the item (its itemid attribute), or nil if it
  # has none
  attr_reader :id

  # A Hash representing the properties as name/values pairs.
  # The values will be an array containing either +String+
  # or <tt>Mida::Item</tt> instances
  attr_reader :properties

  # Create a new Item object
  #
  # [itemscope] The itemscope that you want to parse
  # [page_url] The url of target used to form absolute urls
  def initialize(itemscope, page_url=nil)
    @itemscope, @page_url = itemscope, page_url
    @type, @id = extract_attribute('itemtype'), extract_attribute('itemid')
    @properties = {}
    # Properties pulled in through itemref are added before the item's own
    add_itemref_properties
    traverse_elements(extract_elements(itemscope))
  end

  # Return a Hash representation
  # of the form {type: 'The item type', id: 'The item id',
  # properties: {'a name' => ['avalue'] }}
  def to_h
    {type: @type, id: @id, properties: properties_to_h(@properties)}
  end

  # The Hash representation rendered as a String
  def to_s
    to_h.to_s
  end

  # Two Items are equal when their type, id and properties are all equal.
  # Anything that is not an Item is simply unequal, rather than raising
  # because it lacks the readers.
  def ==(other)
    other.is_a?(Item) &&
      @type == other.type && @id == other.id && @properties == other.properties
  end

  private

  # The value of the named attribute of the itemscope, or nil if absent
  def extract_attribute(attribute)
    node = @itemscope.attribute(attribute)
    node ? node.value : nil
  end

  # The direct child elements of +itemscope+
  def extract_elements(itemscope)
    itemscope.search('./*')
  end

  # Find an element with a matching id
  def find_with_id(id)
    @itemscope.search("//*[@id=#{xpath_literal(id)}]")
  end

  # Quote +text+ as an XPath string literal. XPath has no escape character,
  # so text holding both kinds of quote has to be built with concat().
  def xpath_literal(text)
    return "'#{text}'" unless text.include?("'")
    return %("#{text}") unless text.include?('"')
    pieces = text.split("'", -1).map { |piece| "'#{piece}'" }
    "concat(#{pieces.join(%q{, "'", })})"
  end

  # The value as it should appear in to_h()
  def value_to_h(value)
    case value
    when Array then value.map { |element| value_to_h(element) }
    when Item then value.to_h
    else value
    end
  end

  # The properties Hash with every nested Item replaced by its own Hash
  def properties_to_h(properties)
    properties.each_with_object({}) do |(name, value), hash|
      hash[name] = value_to_h(value)
    end
  end

  # Add any properties referred to by 'itemref'
  #
  # NOTE(review): nothing here guards against an itemref that leads back to
  # an element which is itself an itemprop with an itemscope; that looks as
  # though it would recurse without end - confirm.
  def add_itemref_properties
    itemref = extract_attribute('itemref')
    return unless itemref
    itemref.split.each { |id| traverse_elements(find_with_id(id)) }
  end

  # Walk +elements+ and everything beneath them, adding each property found.
  #
  # Every element is offered to add_itemprop, which adds nothing for an
  # element without an itemprop attribute. This matters for an element such
  # as <p itemprop="x">some <b>bold</b> text</p>: it has child elements, and
  # used to be skipped in favour of them so that its own property was lost.
  # The walk does not descend into a nested itemscope, as the properties
  # inside it belong to that nested item.
  def traverse_elements(elements)
    elements.each do |element|
      add_itemprop(element)
      next if element.attribute('itemscope')
      traverse_elements(extract_elements(element))
    end
  end

  # Append each name/value parsed from +itemprop+ to the properties
  def add_itemprop(itemprop)
    Property.parse(itemprop, @page_url).each do |name, value|
      (@properties[name] ||= []) << value
    end
  end

end
99
+
100
+ end
@@ -0,0 +1,70 @@
1
+ require 'nokogiri'
2
+ require 'uri'
3
+
4
+ module Mida
5
+
6
# Module that parses itemprop elements
module Property

  # The elements whose property value is taken from an attribute rather than
  # from their text content, mapped to the name of that attribute
  NON_TEXTCONTENT_ELEMENTS = {
    'a' => 'href', 'area' => 'href',
    'audio' => 'src', 'embed' => 'src',
    'iframe' => 'src', 'img' => 'src',
    'link' => 'href', 'meta' => 'content',
    'object' => 'data', 'source' => 'src',
    'time' => 'datetime', 'track' => 'src',
    'video' => 'src'
  }.freeze

  # Those of the attributes above that hold a url, and so have to be made
  # absolute
  URL_ATTRIBUTES = ['data', 'href', 'src'].freeze

  # Returns a Hash representing the property.
  # Hash is of the form {'property name' => 'value'}, with one entry for
  # each name listed in the itemprop attribute, and is empty if the element
  # has no itemprop attribute.
  # [element] The itemprop element to be parsed
  # [page_url] The url of the page, including the filename, used to form absolute urls
  def self.parse(element, page_url=nil)
    extract_property_names(element).each_with_object({}) do |name, hash|
      hash[name] = extract_property(element, page_url)
    end
  end

  # Returns +url+ untouched if it is already absolute, otherwise +url+
  # resolved against +page_url+.
  #
  # This returns an empty string if can't form a valid
  # absolute url as per the Microdata spec. That covers a +url+ which will
  # not parse as well as a missing or unusable +page_url+; the parse of
  # +url+ used to sit outside of the rescue and so raised instead.
  def self.make_absolute_url(url, page_url)
    return url unless URI.parse(url).relative?
    return '' if page_url.nil?
    URI.parse(page_url).merge(url).to_s
  rescue URI::Error
    ''
  end

  # The property names listed in the itemprop attribute, which may hold
  # several separated by whitespace. Empty if there is no such attribute.
  def self.extract_property_names(itemprop)
    itemprop_attr = itemprop.attribute('itemprop')
    itemprop_attr ? itemprop_attr.value.split : []
  end

  # The value of an itemprop element that is not itself an item: the text
  # content for most elements, or the relevant attribute for those listed
  # in NON_TEXTCONTENT_ELEMENTS, made absolute if it is a url.
  def self.extract_property_value(itemprop, page_url)
    attribute = NON_TEXTCONTENT_ELEMENTS[itemprop.name]
    return itemprop.inner_text unless attribute

    attribute_node = itemprop.attribute(attribute)
    if attribute_node.nil?
      # The attribute is missing, which used to raise. Following the
      # Microdata spec, a time element without a datetime falls back to its
      # text content and every other element here gives the empty string.
      return attribute == 'datetime' ? itemprop.inner_text : ''
    end

    value = attribute_node.value
    URL_ATTRIBUTES.include?(attribute) ? make_absolute_url(value, page_url) : value
  end

  # The property held by +itemprop+: a Mida::Item if the element is also an
  # itemscope, otherwise its plain value.
  def self.extract_property(itemprop, page_url)
    if itemprop.attribute('itemscope')
      Mida::Item.new(itemprop, page_url)
    else
      extract_property_value(itemprop, page_url)
    end
  end

end
69
+
70
+ end
@@ -0,0 +1,684 @@
1
+ require_relative 'spec_helper'
2
+ require_relative '../lib/mida'
3
+
4
# Search +md+ for +vocabulary+ and check the items found against
# +expected_results+, position by position. Items found beyond the last
# expected result are not looked at.
def test_parsing(md, vocabulary, expected_results)
  found = md.search(vocabulary)
  expected_results.each_with_index do |expected, position|
    candidate = found[position]
    test_to_h(candidate, expected)
    test_properties(candidate, expected)
  end
end
12
+
13
# Check that the Hash representation of +item+ equals +expected_result+
def test_to_h(item, expected_result)
  actual = item.to_h
  actual.should == expected_result
end
16
+
17
# Check every property of +item+ against the values listed under the same
# name in expected_result[:properties]
def test_properties(item, expected_result)
  item.properties.each_pair do |name, values|
    match_array(values, expected_result[:properties][name])
  end
end
22
+
23
# Check each element of +value_array+ against the element at the same
# position in +expected_results+. A Mida::Item has its properties checked,
# anything else must simply be equal.
def match_array(value_array, expected_results)
  value_array.each_with_index do |element, position|
    case element
    when Mida::Item then test_properties(element, expected_results[position])
    else element.should == expected_results[position]
    end
  end
end
32
+
33
+ shared_examples_for 'one root itemscope' do
34
+ it 'should not match itemscopes with different names' do
35
+ @md.search(%r{nothing}).size.should == 0
36
+ end
37
+
38
+ it 'should find the correct number of itemscopes' do
39
+ @md.items.size.should == 1
40
+ end
41
+ end
42
+
43
+ describe Mida::Document, 'when run with a document containing textContent and non textContent itemprops' do
44
+ before do
45
+ @html = '
46
+ <html>
47
+ <head itemscope>
48
+ <link itemprop="link_field" rel="stylesheet" type="text/css" href="stylesheet.css" />
49
+ </head>
50
+ <body>
51
+ There is some text here
52
+ <div>
53
+ and also some here
54
+ <div itemscope>
55
+ <span itemprop="span_field">Some span content</span>
56
+ <time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
57
+ <meta itemprop="meta_field" content="Some meta content">
58
+ <a itemprop="a_field1" href="http://example.com">non content</a>
59
+ <a itemprop="a_field2" href="welcome/index.html">non content</a>
60
+ <a itemprop="a_field3" href="/intro">non content</a>
61
+ <a itemprop="a_field4" href="/intro/index.html">non content</a>
62
+ <map name="somemap">
63
+ <area shape="rect" coords="0,0,50,120" href="left.html" />
64
+ <area itemprop="area_right" shape="rect" coords="51,0,120,120" href="right.html" />
65
+ </map>
66
+ <audio itemprop="audio_field" src="asound.ogg" controls="controls">
67
+ Audio tag not supported by your browser.
68
+ </audio>
69
+
70
+ <embed itemprop="embed_field" src="helloworld.swf" />
71
+ <iframe itemprop="iframe_field" src="http://www.example.com/iframe_test"></iframe>
72
+ <img itemprop="img_field" src="animage.png" width="120" height="120" usemap="#planetmap" />
73
+ <object itemprop="object_field" data="object.png" type="image/png" />
74
+ <audio controls="controls">
75
+ <source itemprop="source_field" src="song.ogg" type="audio/ogg" />
76
+ <track itemprop="track_field" src="atrack.ogg" />
77
+ Audio tag not supported by your browser.
78
+ </audio>
79
+ <video itemprop="video_field" src="movie.ogg" controls="controls">
80
+ Video tag not supported by your browser.
81
+ </video>
82
+ </div>
83
+ </div>
84
+ </body>
85
+ </html>
86
+ '
87
+ end
88
+
89
+
90
+ context 'when not given a page_url' do
91
+ before do
92
+ @md = Mida::Document.new(@html)
93
+ end
94
+
95
+ it 'should return all the properties and types with the correct values' do
96
+ expected_results = [
97
+ { type: nil, id: nil, properties: {'link_field' => ['']} },
98
+ { type: nil,
99
+ id: nil,
100
+ properties: {
101
+ 'span_field' => ['Some span content'],
102
+ 'dtreviewed' => ['2009-01-06'],
103
+ 'meta_field' => ['Some meta content'],
104
+ 'a_field1' => ['http://example.com'],
105
+ 'a_field2' => [''],
106
+ 'a_field3' => [''],
107
+ 'a_field4' => [''],
108
+ 'area_right' => [''],
109
+ 'audio_field' => [''],
110
+ 'embed_field' => [''],
111
+ 'iframe_field' => ['http://www.example.com/iframe_test'],
112
+ 'img_field' => [''],
113
+ 'object_field' => [''],
114
+ 'source_field' => [''],
115
+ 'track_field' => [''],
116
+ 'video_field' => ['']
117
+ }
118
+ }
119
+ ]
120
+
121
+ test_parsing(@md, %r{}, expected_results)
122
+
123
+ end
124
+ end
125
+
126
+ context 'when given a page_url' do
127
+ before do
128
+ @md = Mida::Document.new(@html, 'http://example.com/start/')
129
+ end
130
+
131
+ it 'should return all the properties and types with the correct values' do
132
+ expected_results = [
133
+ { type: nil, id: nil, properties: {
134
+ 'link_field' => ['http://example.com/start/stylesheet.css']
135
+ }
136
+ },
137
+ { type: nil,
138
+ id: nil,
139
+ properties: {
140
+ 'span_field' => ['Some span content'],
141
+ 'dtreviewed' => ['2009-01-06'],
142
+ 'meta_field' => ['Some meta content'],
143
+ 'a_field1' => ['http://example.com'],
144
+ 'a_field2' => ['http://example.com/start/welcome/index.html'],
145
+ 'a_field3' => ['http://example.com/intro'],
146
+ 'a_field4' => ['http://example.com/intro/index.html'],
147
+ 'area_right' => ['http://example.com/start/right.html'],
148
+ 'audio_field' => ['http://example.com/start/asound.ogg'],
149
+ 'embed_field' => ['http://example.com/start/helloworld.swf'],
150
+ 'iframe_field' => ['http://www.example.com/iframe_test'],
151
+ 'img_field' => ['http://example.com/start/animage.png'],
152
+ 'object_field' => ['http://example.com/start/object.png'],
153
+ 'source_field' => ['http://example.com/start/song.ogg'],
154
+ 'track_field' => ['http://example.com/start/atrack.ogg'],
155
+ 'video_field' => ['http://example.com/start/movie.ogg']
156
+ }
157
+ }
158
+ ]
159
+
160
+ test_parsing(@md, %r{}, expected_results)
161
+ end
162
+ end
163
+
164
+ end
165
+
166
+ describe Mida::Document, 'when run against a full html document containing one itemscope with no itemtype' do
167
+
168
+ before do
169
+ html = '
170
+ <html><body>
171
+ There is some text here
172
+ <div>
173
+ and also some here
174
+ <div itemscope>
175
+ <span itemprop="itemreviewed">Romeo Pizza</span>
176
+ Reviewed by <span itemprop="reviewer">Ulysses Grant</span> on
177
+ <time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
178
+ <meta itemprop="fielda" content="a5482">
179
+
180
+ <span itemprop="summary">Delicious, tasty pizza in Eastlake!</span>
181
+ <span itemprop="description">This is a very nice pizza place.</span>
182
+ Rating: <span itemprop="rating">4.5</span>
183
+ </div>
184
+ </div>
185
+ </body></html>
186
+ '
187
+ @md = Mida::Document.new(html)
188
+
189
+ end
190
+
191
+ it_should_behave_like 'one root itemscope'
192
+
193
+ it 'should return all the properties and types with the correct values' do
194
+ expected_results = [{
195
+ type: nil,
196
+ id: nil,
197
+ properties: {
198
+ 'itemreviewed' => ['Romeo Pizza'],
199
+ 'reviewer' => ['Ulysses Grant'],
200
+ 'dtreviewed' => ['2009-01-06'],
201
+ 'fielda' => ['a5482'],
202
+ 'summary' => ['Delicious, tasty pizza in Eastlake!'],
203
+ 'description' => ['This is a very nice pizza place.'],
204
+ 'rating' => ['4.5']
205
+ }
206
+ }]
207
+
208
+ test_parsing(@md, %r{}, expected_results)
209
+ end
210
+
211
+ end
212
+
213
+ describe Mida::Document, 'when run against a full html document containing one itemscope nested within another' do
214
+
215
+ before do
216
+ html = '
217
+ <html><body>
218
+ There is some text here
219
+ <div>
220
+ and also some here
221
+ <div itemscope>
222
+ <span itemprop="itemreviewed">Romeo Pizza</span>
223
+ <div itemprop="address" itemscope>
224
+ <span itemprop="firstline">237 Italian Way</span>
225
+ <span itemprop="country">United Kingdom</span>
226
+ </div>
227
+ Rating: <span itemprop="rating">4.5</span>
228
+ </div>
229
+ </div>
230
+ </body></html>
231
+ '
232
+
233
+ @md = Mida::Document.new(html)
234
+
235
+ end
236
+
237
+ it_should_behave_like 'one root itemscope'
238
+
239
+ it 'should return all the properties and types with the correct values' do
240
+ expected_results = [{
241
+ type: nil,
242
+ id: nil,
243
+ properties: {
244
+ 'itemreviewed' => ['Romeo Pizza'],
245
+ 'address' => [{
246
+ type: nil, id: nil, properties: {
247
+ 'firstline' => ['237 Italian Way'],
248
+ 'country' => ['United Kingdom']
249
+ }
250
+ }],
251
+ 'rating' => ['4.5']
252
+ }
253
+ }]
254
+
255
+ test_parsing(@md, %r{}, expected_results)
256
+ end
257
+
258
+ end
259
+
260
+ describe Mida::Document, 'when run against a full html document containing one itemscope nested within another within another' do
261
+
262
+ before do
263
+ html = '
264
+ <html><body>
265
+ There is some text here
266
+ <div>
267
+ and also some here
268
+ <div itemscope>
269
+ <span itemprop="itemreviewed">Romeo Pizza</span>
270
+ <div itemprop="address" itemscope>
271
+ <div itemprop="firstline" itemscope>
272
+ <span itemprop="number">237</span>
273
+ <span itemprop="road">Italian Way</span>
274
+ </div>
275
+ <span itemprop="country">United Kingdom</span>
276
+ </div>
277
+ Rating: <span itemprop="rating">4.5</span>
278
+ </div>
279
+ </div>
280
+ </body></html>
281
+ '
282
+
283
+ @md = Mida::Document.new(html)
284
+ end
285
+
286
+ it_should_behave_like 'one root itemscope'
287
+
288
+ it 'should return all the properties and types with the correct values' do
289
+ expected_results = [{
290
+ type: nil,
291
+ id: nil,
292
+ properties: {
293
+ 'itemreviewed' => ['Romeo Pizza'],
294
+ 'address' => [{
295
+ type: nil,
296
+ id: nil,
297
+ properties: {
298
+ 'firstline' => [{
299
+ type: nil,
300
+ id: nil,
301
+ properties: {
302
+ 'number' => ['237'],
303
+ 'road' => ['Italian Way']
304
+ },
305
+ }],
306
+ 'country' => ['United Kingdom']
307
+ },
308
+ }],
309
+ 'rating' => ['4.5']
310
+ }
311
+ }]
312
+
313
+ test_parsing(@md, %r{^$}, expected_results)
314
+ end
315
+
316
+ end
317
+
318
+ describe Mida::Document, 'when run against a full html document containing one itemscope with an itemtype' do
319
+
320
+ before do
321
+ html = '
322
+ <html><body>
323
+ There is some text here
324
+ <div>
325
+ and also some here
326
+ <div itemscope itemtype="http://data-vocabulary.org/Review">
327
+ <span itemprop="itemreviewed">Romeo Pizza</span>
328
+ Reviewed by <span itemprop="reviewer">Ulysses Grant</span> on
329
+ <time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
330
+ <span itemprop="summary">Delicious, tasty pizza in Eastlake!</span>
331
+ <span itemprop="description">This is a very nice pizza place.</span>
332
+ Rating: <span itemprop="rating">4.5</span>
333
+ </div>
334
+ </div>
335
+ </body></html>
336
+ '
337
+
338
+ @md = Mida::Document.new(html)
339
+
340
+ end
341
+
342
+ it_should_behave_like 'one root itemscope'
343
+
344
+ it 'should find the correct number of itemscopes if outer specified' do
345
+ @md.search(%r{http://data-vocabulary.org/Review}).size.should == 1
346
+ end
347
+
348
+ it 'should specify the correct type' do
349
+ @md.search(%r{http://data-vocabulary.org/Review}).first.type.should == 'http://data-vocabulary.org/Review'
350
+ end
351
+
352
+ it 'should return all the properties and types with the correct values' do
353
+ expected_results = [{
354
+ type: 'http://data-vocabulary.org/Review',
355
+ id: nil,
356
+ properties: {
357
+ 'itemreviewed' => ['Romeo Pizza'],
358
+ 'reviewer' => ['Ulysses Grant'],
359
+ 'dtreviewed' => ['2009-01-06'],
360
+ 'summary' => ['Delicious, tasty pizza in Eastlake!'],
361
+ 'description' => ['This is a very nice pizza place.'],
362
+ 'rating' => ['4.5']
363
+ }
364
+ }]
365
+ test_parsing(@md, %r{http://data-vocabulary.org/Review}, expected_results)
366
+ end
367
+
368
+ end
369
+
370
+ describe Mida::Document, 'when run against a full html document containing two non-nested itemscopes with itemtypes' do
371
+
372
+ before do
373
+ html = '
374
+ <html><body>
375
+ There is some text here
376
+ <div>
377
+ and also some here
378
+ <div itemscope itemtype="http://data-vocabulary.org/Review">
379
+ <span itemprop="itemreviewed">Romeo Pizza</span>
380
+ Rating: <span itemprop="rating">4.5</span>
381
+ </div>
382
+ <div itemscope itemtype="http://data-vocabulary.org/Organization">
383
+ <span itemprop="name">An org name</span>
384
+ <span itemprop="url">http://example.com</span>
385
+ </div>
386
+ </div>
387
+ </body></html>
388
+ '
389
+
390
+ @md = Mida::Document.new(html)
391
+
392
+ end
393
+
394
+ it 'should return all the itemscopes' do
395
+ @md.items.size.should == 2
396
+ end
397
+
398
+ it 'should give the type of each itemscope if none specified' do
399
+ itemscope_names = {
400
+ 'http://data-vocabulary.org/Review' => 0,
401
+ 'http://data-vocabulary.org/Organization' => 0
402
+ }
403
+
404
+ @md.items.each do |item|
405
+ itemscope_names[item.type] += 1
406
+ end
407
+
408
+ itemscope_names.size.should eq 2
409
+ itemscope_names.each { |name, num| num.should == 1 }
410
+ end
411
+
412
+
413
+ it 'should return all the properties and types with the correct values for 1st itemscope' do
414
+ expected_results = [{
415
+ type: 'http://data-vocabulary.org/Review',
416
+ id: nil,
417
+ properties: {
418
+ 'itemreviewed' => ['Romeo Pizza'],
419
+ 'rating' => ['4.5']
420
+ }
421
+ }]
422
+ test_parsing(@md, %r{http://data-vocabulary.org/Review}, expected_results)
423
+ end
424
+
425
+ it 'should return all the properties from the text for 2nd itemscope' do
426
+ expected_results = [{
427
+ type: 'http://data-vocabulary.org/Organization',
428
+ id: nil,
429
+ properties: {
430
+ 'name' => ['An org name'],
431
+ 'url' => ['http://example.com']
432
+ }
433
+ }]
434
+ test_parsing(@md, %r{http://data-vocabulary.org/Organization}, expected_results)
435
+ end
436
+
437
+ end
438
+
439
+ describe Mida::Document, 'when run against a full html document containing one
440
+ itemscope nested within another and the inner block is
441
+ surrounded with another non itemscope block' do
442
+
443
+ before do
444
+ html = '
445
+ <html><body>
446
+ <div itemscope itemtype="http://data-vocabulary.org/Product">
447
+ <ul class="reviews">
448
+ <li id="model" itemprop="name">DC07</li>
449
+ <li id="make" itemprop="brand">Dyson</li>
450
+ <li itemprop="review" itemscope itemtype="http://data-vocabulary.org/Review-aggregate">
451
+ <span class="ratingDetails">
452
+ <span itemprop="count">1</span> Review,
453
+ Average: <span itemprop="rating">5.0</span>
454
+ </span>
455
+ </li>
456
+ </ul>
457
+ </div>
458
+ </body></html>
459
+ '
460
+
461
+ @md = Mida::Document.new(html)
462
+ end
463
+
464
+ it_should_behave_like 'one root itemscope'
465
+
466
+ it 'should return the correct number of itemscopes' do
467
+ vocabularies = [
468
+ %r{http://data-vocabulary.org/Product},
469
+ %r{http://data-vocabulary.org/Review-aggregate}
470
+ ]
471
+ vocabularies.each {|vocabulary| @md.search(vocabulary).size.should == 1}
472
+ end
473
+
474
+ context "when looking at the outer vocabulary" do
475
+ it 'should return all the properties from the text with the correct values' do
476
+ expected_results = [{
477
+ type: 'http://data-vocabulary.org/Product',
478
+ id: nil,
479
+ properties: {
480
+ 'name' => ['DC07'],
481
+ 'brand' => ['Dyson'],
482
+ 'review' => [{
483
+ type: 'http://data-vocabulary.org/Review-aggregate',
484
+ id: nil,
485
+ properties: {
486
+ 'count' => ['1'],
487
+ 'rating' => ['5.0']
488
+ }
489
+ }]
490
+ }
491
+ }]
492
+
493
+ test_parsing(@md, %r{http://data-vocabulary.org/Product}, expected_results)
494
+ end
495
+ end
496
+
497
+ end
498
+
499
+ describe Mida::Document, 'when run against a document containing an itemscope
500
+ that contains another non-linked itemscope' do
501
+
502
+ before do
503
+ html = '
504
+ <html><body>
505
+ <div itemscope itemtype="http://data-vocabulary.org/Product">
506
+ <ul class="reviews">
507
+ <li id="model" itemprop="name">DC07</li>
508
+ <li id="make" itemprop="brand">Dyson</li>
509
+ <li itemscope itemtype="http://data-vocabulary.org/Review-aggregate">
510
+ <span class="ratingDetails">
511
+ <span itemprop="count">1</span> Review,
512
+ Average: <span itemprop="rating">5.0</span>
513
+ </span>
514
+ </li>
515
+ </ul>
516
+ </div>
517
+ </body></html>
518
+ '
519
+
520
+ @md = Mida::Document.new(html)
521
+ end
522
+
523
+ it 'should return the correct number of itemscopes when search used' do
524
+ vocabularies = {
525
+ %r{} => 2,
526
+ %r{http://data-vocabulary.org/Product} => 1,
527
+ %r{http://data-vocabulary.org/Review-aggregate} => 1
528
+ }
529
+ vocabularies.each {|vocabulary, num| @md.search(vocabulary).size.should == num}
530
+ end
531
+
532
+ it 'should return the correct number of items' do
533
+ @md.items.size.should == 2
534
+ end
535
+
536
+ context "when no vocabulary specified or looking at the outer vocabulary" do
537
+ it 'should return all the properties from the text with the correct values' do
538
+ pending("get the contains: feature working")
539
+ expected_result = {
540
+ type: 'http://data-vocabulary.org/Product',
541
+ id: nil,
542
+ properties: {
543
+ 'name' => 'DC07',
544
+ 'brand' => 'Dyson'
545
+ },
546
+ contains: {
547
+ type: 'http://data-vocabulary.org/Review-aggregate',
548
+ id: nil,
549
+ properties: {
550
+ 'count' => '1',
551
+ 'rating' => '5.0'
552
+ }
553
+ }
554
+ }
555
+
556
+ @md.search('http://data-vocabulary.org/Product').first.should == expected_result
557
+ end
558
+ end
559
+ end
560
+
561
+ describe Mida::Document, 'when run against a document using itemrefs' do
562
+
563
+ before do
564
+ html = '
565
+ <html><body>
566
+ <div itemscope id="amanda" itemref="a b">
567
+ <span itemprop="age">30</span>
568
+ </div>
569
+ <p id="a">Name: <span itemprop="name">Amanda</span></p>
570
+ <div id="b" itemprop="band" itemscope itemref="c"></div>
571
+ <div id="c">
572
+ <p>Band: <span itemprop="name">Jazz Band</span></p>
573
+ <p>Size: <span itemprop="size">12</span> players</p>
574
+ </div>
575
+ </body></html>
576
+ '
577
+
578
+ @md = Mida::Document.new(html)
579
+ end
580
+
581
+ it 'should return all the properties from the text with the correct values' do
582
+ expected_results = [{
583
+ type: nil,
584
+ id: nil,
585
+ properties: {
586
+ 'name' => ['Amanda'],
587
+ 'band' => [{
588
+ type: nil,
589
+ id: nil,
590
+ properties: {
591
+ 'name' => ['Jazz Band'],
592
+ 'size' => ['12']
593
+ }
594
+ }],
595
+ 'age' => ['30']
596
+ }
597
+ }]
598
+
599
+ test_parsing(@md, %r{}, expected_results)
600
+ end
601
+ end
602
+
603
+ describe Mida::Document, 'when run against a document using multiple itemprops with the same name' do
604
+
605
+ before do
606
+ html = '
607
+ <html><body>
608
+ <div itemscope itemtype="icecreams">
609
+ <p>Flavours in my favourite ice cream:</p>
610
+ <ul>
611
+ <li itemprop="flavour">Lemon sorbet</li>
612
+ <li itemprop="flavour">Apricot sorbet</li>
613
+ <li itemprop="flavour" itemscope itemtype="icecream-type">
614
+ <span itemprop="fruit">Strawberry</span>
615
+ <span itemprop="style">Homemade</span>
616
+ </li>
617
+ </ul>
618
+ </div>
619
+ </body></html>
620
+ '
621
+
622
+ @md = Mida::Document.new(html)
623
+ end
624
+
625
+ it_should_behave_like 'one root itemscope'
626
+
627
+ it 'should return the correct number of itemscopes' do
628
+ vocabularies = [
629
+ %r{icecreams},
630
+ %r{icecream-type}
631
+ ]
632
+ vocabularies.each {|vocabulary| @md.search(vocabulary).size.should == 1}
633
+ end
634
+
635
+ it 'should return all the properties from the text with the correct values' do
636
+ expected_results = [{
637
+ type: 'icecreams',
638
+ id: nil,
639
+ properties: {
640
+ 'flavour' => [
641
+ 'Lemon sorbet',
642
+ 'Apricot sorbet',
643
+ { type: 'icecream-type',
644
+ id: nil,
645
+ properties: {
646
+ 'fruit' => ['Strawberry'],
647
+ 'style' => ['Homemade']
648
+ }
649
+ }
650
+ ]
651
+ }
652
+ }]
653
+
654
+ test_parsing(@md, %r{icecreams}, expected_results)
655
+ end
656
+ end
657
+
658
+ describe Mida::Document, 'when run against a document using an itemprop with multiple properties' do
659
+
660
+ before do
661
+ html = '
662
+ <html><body>
663
+ <div itemscope>
664
+ <span itemprop="favourite-colour favourite-fruit">orange</span>
665
+ </div>
666
+ </body></html>
667
+ '
668
+
669
+ @md = Mida::Document.new(html)
670
+ end
671
+
672
+ it 'should return all the properties from the text with the correct values' do
673
+ expected_results = [{
674
+ type: nil,
675
+ id: nil,
676
+ properties: {
677
+ 'favourite-colour' => ['orange'],
678
+ 'favourite-fruit' => ['orange']
679
+ }
680
+ }]
681
+
682
+ test_parsing(@md, %r{}, expected_results)
683
+ end
684
+ end