mida 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,21 @@
1
+ = The MIT License
2
+
3
+ Copyright (c) 2011 Lawrence Woodman
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,68 @@
1
+ = Mida
2
+
3
+ * {Mida Project Page}[https://github.com/LawrenceWoodman/mida]
4
+ * {Mida Bug Tracker}[https://github.com/LawrenceWoodman/mida/issues]
5
+
6
+ == Description
7
+ A Microdata[http://en.wikipedia.org/wiki/Microdata_(HTML5)] parser and
8
+ extractor library for Ruby.
9
+ This is based on the latest published version of the Microdata Specification
10
+ dated {5th April 2011}[http://www.w3.org/TR/2011/WD-microdata-20110405/].
11
+
12
+ == Installation
13
+ With Ruby and Rubygems:
14
+ gem install mida
15
+
16
+ === Requirements:
17
+
18
+ * +Nokogiri+
19
+
20
+ == Usage
21
+ The following examples assume that you have required +mida+ and
22
+ +open-uri+.
23
+
24
+ === Extracting Microdata from a page
25
+ All the Microdata is extracted from a page when a new <tt>Mida::Document</tt> instance
26
+ is created.
27
+
28
+ To extract all the Microdata from a webpage:
29
+ url = 'http://example.com'
30
+ doc = open(url) {|f| Mida::Document.new(f, url)}
31
+
32
+ The top-level +Items+ will be held in an array accessible via
33
+ <tt>doc.items</tt>.
34
+
35
+ To simply list all the top-level +Items+ that have been found:
36
+ puts doc.items
37
+
38
+ === Searching
39
+ If you want to search for an +Item+ that has a specific +itemtype+/vocabulary
40
+ this can be done with the +search+ method.
41
+
42
+ To return all the +Items+ that use one of Google's Review vocabularies:
43
+ doc.search(%r{http://data-vocabulary\.org.*?review.*?}i)
44
+
45
+ === Inspecting an +Item+
46
+ Each +Item+ is a <tt>Mida::Item</tt> instance and has three main methods of
47
+ interest, +type+, +properties+ and +id+.
48
+
49
+ To find out the +itemtype+ of the +Item+:
50
+ puts doc.items.first.type
51
+
52
+ To find out the +itemid+ of the +Item+:
53
+ puts doc.items.first.id
54
+
55
+ Properties are returned as a hash containing name/value pairs. The
56
+ values will be an array of either +String+ or <tt>Mida::Item</tt> instances.
57
+
58
+ To see the +properties+ of the +Item+:
59
+ puts doc.items.first.properties
60
+
61
+ == Bugs/Feature Requests
62
+ If you find a bug or want to make a feature request, please report it at the
63
+ Mida project's {issues tracker}[https://github.com/LawrenceWoodman/mida/issues]
64
+ on GitHub.
65
+
66
+ == License
67
+ Copyright (c) 2011 Lawrence Woodman.
68
+ This software is licensed under the MIT License. Please see the file, LICENSE.rdoc, for details.
@@ -0,0 +1,26 @@
1
# Rakefile for the mida gem: packages the gem and runs the specs.
#
# Gem::PackageTask (rubygems/package_task) replaces the deprecated
# Rake::GemPackageTask (rake/gempackagetask), which is no longer shipped
# with current versions of Rake.
require 'rubygems/package_task'
require 'rspec/core/rake_task'

# Running plain `rake` runs the specs.
task :default => :spec

spec = Gem::Specification.new do |s|
  s.name = "mida"
  s.summary = "A Microdata parser"
  # The whole README is used as the gem description.
  s.description = File.read(File.join(File.dirname(__FILE__), 'README.rdoc'))
  s.version = "0.0.0"
  s.author = "Lawrence Woodman"
  s.email = "lwoodman@vlifesystems.com"
  s.homepage = %q{http://github.com/LawrenceWoodman/mida}
  s.platform = Gem::Platform::RUBY
  s.required_ruby_version = '>=1.9'
  s.files = Dir['lib/**/*.rb'] + Dir['spec/**/*.rb'] + Dir['*.rdoc'] + Dir['Rakefile']
  # has_rdoc= is deprecated in RubyGems; only set it where the setter still exists.
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)
  s.extra_rdoc_files = ['README.rdoc', 'LICENSE.rdoc']
  s.rdoc_options << '--main' << 'README.rdoc'
  s.add_dependency('nokogiri')
  s.add_development_dependency('rspec')
end

desc "Create Gem"
# No block is given, so the tasks have to be defined explicitly.
Gem::PackageTask.new(spec).define

desc "Run Specs"
RSpec::Core::RakeTask.new(:spec)
@@ -0,0 +1,6 @@
1
+ = Todo List
2
+
3
+ * Support img rating in alt for google?
4
+ * Look further in extra complications of microdata, e.g. alt tag for img rating and different size ratings
5
+ http://www.google.com/support/webmasters/bin/answer.py?answer=172705
6
+ * Put nested itemscopes that are not a property of their parent into the parent's hash using [:nested]
@@ -0,0 +1,6 @@
1
# Make this directory requirable, then load every file under mida/.
lib_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift lib_dir
Dir[lib_dir + '/mida/*.rb'].each do |source_file|
  require source_file
end

# Mida is a Microdata parser and extractor.
module Mida
end
@@ -0,0 +1,61 @@
1
+ require 'nokogiri'
2
+
3
+ module Mida
4
+
5
# Class that holds the extracted Microdata
class Document

  # An Array of Mida::Item objects. These are all top-level
  # and hence not properties of other Items
  attr_reader :items

  # Create a new Document, extracting all of the Microdata items from +target+
  #
  # [target] The html that you want to parse (passed straight to Nokogiri)
  # [page_url] The url of target, used to form absolute urls. Relative urls
  #            are resolved against it, so it should include the filename
  #            (e.g. index.html) or end in a slash.
  def initialize(target, page_url=nil)
    @doc = Nokogiri(target)
    @page_url = page_url
    @items = extract_items
  end

  # Returns an array of matching Mida::Item objects. The search recurses
  # into items that are held as property values of other items.
  #
  # [vocabulary] A regexp to match the item types against
  # [items] The items to search, defaults to the top-level items
  def search(vocabulary, items=@items)
    items.each_with_object([]) do |item, found_items|
      # A nil type is matched as the empty string, so that items without
      # an itemtype can still be found (e.g. with %r{} or %r{^$})
      found_items << item if item.type.to_s =~ vocabulary
      found_items.concat(search_values(item.properties.values, vocabulary))
    end
  end

  private

  # Builds a Mida::Item for every itemscope that is not itself an itemprop.
  # Always returns an Array (empty when the document has no such itemscope).
  def extract_items
    @doc.search('//*[@itemscope and not(@itemprop)]').map do |item_doc|
      Item.new(item_doc, @page_url)
    end
  end

  # Searches property values, which may be nested arrays, for matching items.
  # Values that are neither a Mida::Item nor an Array are ignored.
  def search_values(values, vocabulary)
    values.each_with_object([]) do |value, found_items|
      case value
      when Mida::Item then found_items.concat(search(vocabulary, [value]))
      when Array then found_items.concat(search_values(value, vocabulary))
      end
    end
  end

end
60
+
61
+ end
@@ -0,0 +1,100 @@
1
+ require 'nokogiri'
2
+
3
+ module Mida
4
+
5
# Class that holds each item/itemscope
class Item
  # The Type of the item
  attr_reader :type

  # The Global Identifier of the item
  attr_reader :id

  # A Hash representing the properties as name/values pairs.
  # Each value is an Array containing either +String+
  # or <tt>Mida::Item</tt> instances
  attr_reader :properties

  # Create a new Item object
  #
  # [itemscope] The itemscope that you want to parse
  # [page_url] The url of target used to form absolute urls
  def initialize(itemscope, page_url=nil)
    @itemscope, @page_url = itemscope, page_url
    @type, @id = extract_attribute('itemtype'), extract_attribute('itemid')
    @properties = {}
    # Properties pulled in through itemref are added before the item's own
    add_itemref_properties
    traverse_elements(extract_elements(itemscope))
  end

  # Return a Hash representation
  # of the form {type: 'The item type', id: 'The item id', properties: {'a name' => ['avalue'] }}
  def to_h
    {type: @type, id: @id, properties: properties_to_h(@properties)}
  end

  # The String form of the Hash representation
  def to_s
    to_h.to_s
  end

  # Two items are equal when their type, id and properties are equal.
  # Anything that is not an Item (nil, a Hash, ...) is simply not equal,
  # rather than raising NoMethodError.
  def ==(other)
    other.is_a?(Item) &&
      @type == other.type &&
      @id == other.id &&
      @properties == other.properties
  end

  private

  # The value of the named attribute on the itemscope, or nil if it is absent
  def extract_attribute(attribute)
    node = @itemscope.attribute(attribute)
    node ? node.value : nil
  end

  # The direct child elements of +itemscope+
  def extract_elements(itemscope)
    itemscope.search('./*')
  end

  # Find an element with a matching id
  # NOTE(review): id is interpolated into the XPath unescaped, so an id
  # containing a single quote would produce an invalid expression.
  def find_with_id(id)
    @itemscope.search("//*[@id='#{id}']")
  end

  # The value as it should appear in to_h()
  def value_to_h(value)
    case value
    when Array then value.collect { |element| value_to_h(element) }
    when Item then value.to_h
    else value
    end
  end

  def properties_to_h(properties)
    properties.each_with_object({}) do |(name, value), hash|
      hash[name] = value_to_h(value)
    end
  end

  # Add any properties referred to by 'itemref'
  def add_itemref_properties
    itemref = extract_attribute('itemref')
    return unless itemref
    itemref.split.each { |id| traverse_elements(find_with_id(id)) }
  end

  # Walks +elements+ and their descendants collecting itemprops.
  #
  # An element carrying itemprop is recorded even when it contains child
  # markup (e.g. <p itemprop="description">Some <b>bold</b> text</p>).
  # Descent stops at any element with itemscope, as its contents belong
  # to that nested item rather than to this one.
  def traverse_elements(elements)
    elements.each do |element|
      add_itemprop(element) if element.attribute('itemprop')
      next if element.attribute('itemscope')
      traverse_elements(extract_elements(element))
    end
  end

  # Parses the itemprop element and appends its value under each of its names
  def add_itemprop(itemprop)
    properties = Property.parse(itemprop, @page_url)
    properties.each { |name, value| (@properties[name] ||= []) << value }
  end

end
99
+
100
+ end
@@ -0,0 +1,70 @@
1
+ require 'nokogiri'
2
+ require 'uri'
3
+
4
+ module Mida
5
+
6
# Module that parses itemprop elements
module Property

  # Elements whose property value is taken from the named attribute
  # instead of from their text content
  NON_TEXTCONTENT_ELEMENTS = {
    'a' => 'href', 'area' => 'href',
    'audio' => 'src', 'embed' => 'src',
    'iframe' => 'src', 'img' => 'src',
    'link' => 'href', 'meta' => 'content',
    'object' => 'data', 'source' => 'src',
    'time' => 'datetime', 'track' => 'src',
    'video' => 'src'
  }.freeze

  # Attributes whose values are urls and so are made absolute
  URL_ATTRIBUTES = ['data', 'href', 'src'].freeze

  # Returns a Hash representing the property.
  # Hash is of the form {'property name' => 'value'}, with one entry for
  # each name listed in the element's itemprop attribute.
  # [element] The itemprop element to be parsed
  # [page_url] The url of the page, including the filename, used to form absolute urls
  def self.parse(element, page_url=nil)
    extract_property_names(element).each_with_object({}) do |name, hash|
      hash[name] = extract_property(element, page_url)
    end
  end

  # Returns +url+ unchanged if it is already absolute, otherwise +url+
  # resolved against +page_url+.
  # This returns an empty string if can't form a valid
  # absolute url as per the Microdata spec. That covers an unparsable
  # +url+ as well as a missing or unusable +page_url+.
  def self.make_absolute_url(url, page_url)
    return url unless URI.parse(url).relative?
    URI.parse(page_url).merge(url).to_s
  rescue URI::Error
    ''
  end

  # The property names listed in the itemprop attribute, or [] if there is none
  def self.extract_property_names(itemprop)
    itemprop_attr = itemprop.attribute('itemprop')
    itemprop_attr ? itemprop_attr.value.split : []
  end

  # The value of a property that is not itself an item
  def self.extract_property_value(itemprop, page_url)
    attribute = NON_TEXTCONTENT_ELEMENTS[itemprop.name]
    return itemprop.inner_text unless attribute

    attribute_node = itemprop.attribute(attribute)
    # The value is the empty string when the element lacks the attribute
    # (e.g. <a itemprop="link"> with no href)
    return '' unless attribute_node

    value = attribute_node.value
    URL_ATTRIBUTES.include?(attribute) ? make_absolute_url(value, page_url) : value
  end

  # A Mida::Item if the element is an itemscope, otherwise its plain value
  def self.extract_property(itemprop, page_url)
    if itemprop.attribute('itemscope')
      Mida::Item.new(itemprop, page_url)
    else
      extract_property_value(itemprop, page_url)
    end
  end

end
69
+
70
+ end
@@ -0,0 +1,684 @@
1
+ require_relative 'spec_helper'
2
+ require_relative '../lib/mida'
3
+
4
# Searches +md+ for +vocabulary+ and checks the found items, in order,
# against +expected_results+. Each expected result is compared with the
# found item at the same position.
def test_parsing(md, vocabulary, expected_results)
  found = md.search(vocabulary)
  expected_results.zip(found).each do |expected, item|
    test_to_h(item, expected)
    test_properties(item, expected)
  end
end
12
+
13
# Expects the Hash form of +item+ to equal +expected_result+
def test_to_h(item, expected_result)
  actual = item.to_h
  actual.should == expected_result
end
16
+
17
# Checks every property of +item+ against the values listed under the
# same name in expected_result[:properties]
def test_properties(item, expected_result)
  expected_properties = expected_result[:properties]
  item.properties.each_pair do |name, values|
    match_array(values, expected_properties[name])
  end
end
22
+
23
# Compares each element of +value_array+ with the entry at the same index
# of +expected_results+. Mida::Item elements have their properties checked
# recursively; anything else must equal the expected entry.
def match_array(value_array, expected_results)
  value_array.each_with_index do |element, index|
    expected = expected_results[index]
    case element
    when Mida::Item then test_properties(element, expected)
    else element.should == expected
    end
  end
end
32
+
33
+ shared_examples_for 'one root itemscope' do
34
+ it 'should not match itemscopes with different names' do
35
+ @md.search(%r{nothing}).size.should == 0
36
+ end
37
+
38
+ it 'should find the correct number of itemscopes' do
39
+ @md.items.size.should == 1
40
+ end
41
+ end
42
+
43
+ describe Mida::Document, 'when run with a document containing textContent and non textContent itemprops' do
44
+ before do
45
+ @html = '
46
+ <html>
47
+ <head itemscope>
48
+ <link itemprop="link_field" rel="stylesheet" type="text/css" href="stylesheet.css" />
49
+ </head>
50
+ <body>
51
+ There is some text here
52
+ <div>
53
+ and also some here
54
+ <div itemscope>
55
+ <span itemprop="span_field">Some span content</span>
56
+ <time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
57
+ <meta itemprop="meta_field" content="Some meta content">
58
+ <a itemprop="a_field1" href="http://example.com">non content</a>
59
+ <a itemprop="a_field2" href="welcome/index.html">non content</a>
60
+ <a itemprop="a_field3" href="/intro">non content</a>
61
+ <a itemprop="a_field4" href="/intro/index.html">non content</a>
62
+ <map name="somemap">
63
+ <area shape="rect" coords="0,0,50,120" href="left.html" />
64
+ <area itemprop="area_right" shape="rect" coords="51,0,120,120" href="right.html" />
65
+ </map>
66
+ <audio itemprop="audio_field" src="asound.ogg" controls="controls">
67
+ Audio tag not supported by your browser.
68
+ </audio>
69
+
70
+ <embed itemprop="embed_field" src="helloworld.swf" />
71
+ <iframe itemprop="iframe_field" src="http://www.example.com/iframe_test"></iframe>
72
+ <img itemprop="img_field" src="animage.png" width="120" height="120" usemap="#planetmap" />
73
+ <object itemprop="object_field" data="object.png" type="image/png" />
74
+ <audio controls="controls">
75
+ <source itemprop="source_field" src="song.ogg" type="audio/ogg" />
76
+ <track itemprop="track_field" src="atrack.ogg" />
77
+ Audio tag not supported by your browser.
78
+ </audio>
79
+ <video itemprop="video_field" src="movie.ogg" controls="controls">
80
+ Video tag not supported by your browser.
81
+ </video>
82
+ </div>
83
+ </div>
84
+ </body>
85
+ </html>
86
+ '
87
+ end
88
+
89
+
90
+ context 'when not given a page_url' do
91
+ before do
92
+ @md = Mida::Document.new(@html)
93
+ end
94
+
95
+ it 'should return all the properties and types with the correct values' do
96
+ expected_results = [
97
+ { type: nil, id: nil, properties: {'link_field' => ['']} },
98
+ { type: nil,
99
+ id: nil,
100
+ properties: {
101
+ 'span_field' => ['Some span content'],
102
+ 'dtreviewed' => ['2009-01-06'],
103
+ 'meta_field' => ['Some meta content'],
104
+ 'a_field1' => ['http://example.com'],
105
+ 'a_field2' => [''],
106
+ 'a_field3' => [''],
107
+ 'a_field4' => [''],
108
+ 'area_right' => [''],
109
+ 'audio_field' => [''],
110
+ 'embed_field' => [''],
111
+ 'iframe_field' => ['http://www.example.com/iframe_test'],
112
+ 'img_field' => [''],
113
+ 'object_field' => [''],
114
+ 'source_field' => [''],
115
+ 'track_field' => [''],
116
+ 'video_field' => ['']
117
+ }
118
+ }
119
+ ]
120
+
121
+ test_parsing(@md, %r{}, expected_results)
122
+
123
+ end
124
+ end
125
+
126
+ context 'when given a page_url' do
127
+ before do
128
+ @md = Mida::Document.new(@html, 'http://example.com/start/')
129
+ end
130
+
131
+ it 'should return all the properties and types with the correct values' do
132
+ expected_results = [
133
+ { type: nil, id: nil, properties: {
134
+ 'link_field' => ['http://example.com/start/stylesheet.css']
135
+ }
136
+ },
137
+ { type: nil,
138
+ id: nil,
139
+ properties: {
140
+ 'span_field' => ['Some span content'],
141
+ 'dtreviewed' => ['2009-01-06'],
142
+ 'meta_field' => ['Some meta content'],
143
+ 'a_field1' => ['http://example.com'],
144
+ 'a_field2' => ['http://example.com/start/welcome/index.html'],
145
+ 'a_field3' => ['http://example.com/intro'],
146
+ 'a_field4' => ['http://example.com/intro/index.html'],
147
+ 'area_right' => ['http://example.com/start/right.html'],
148
+ 'audio_field' => ['http://example.com/start/asound.ogg'],
149
+ 'embed_field' => ['http://example.com/start/helloworld.swf'],
150
+ 'iframe_field' => ['http://www.example.com/iframe_test'],
151
+ 'img_field' => ['http://example.com/start/animage.png'],
152
+ 'object_field' => ['http://example.com/start/object.png'],
153
+ 'source_field' => ['http://example.com/start/song.ogg'],
154
+ 'track_field' => ['http://example.com/start/atrack.ogg'],
155
+ 'video_field' => ['http://example.com/start/movie.ogg']
156
+ }
157
+ }
158
+ ]
159
+
160
+ test_parsing(@md, %r{}, expected_results)
161
+ end
162
+ end
163
+
164
+ end
165
+
166
+ describe Mida::Document, 'when run against a full html document containing one itemscope with no itemtype' do
167
+
168
+ before do
169
+ html = '
170
+ <html><body>
171
+ There is some text here
172
+ <div>
173
+ and also some here
174
+ <div itemscope>
175
+ <span itemprop="itemreviewed">Romeo Pizza</span>
176
+ Reviewed by <span itemprop="reviewer">Ulysses Grant</span> on
177
+ <time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
178
+ <meta itemprop="fielda" content="a5482">
179
+
180
+ <span itemprop="summary">Delicious, tasty pizza in Eastlake!</span>
181
+ <span itemprop="description">This is a very nice pizza place.</span>
182
+ Rating: <span itemprop="rating">4.5</span>
183
+ </div>
184
+ </div>
185
+ </body></html>
186
+ '
187
+ @md = Mida::Document.new(html)
188
+
189
+ end
190
+
191
+ it_should_behave_like 'one root itemscope'
192
+
193
+ it 'should return all the properties and types with the correct values' do
194
+ expected_results = [{
195
+ type: nil,
196
+ id: nil,
197
+ properties: {
198
+ 'itemreviewed' => ['Romeo Pizza'],
199
+ 'reviewer' => ['Ulysses Grant'],
200
+ 'dtreviewed' => ['2009-01-06'],
201
+ 'fielda' => ['a5482'],
202
+ 'summary' => ['Delicious, tasty pizza in Eastlake!'],
203
+ 'description' => ['This is a very nice pizza place.'],
204
+ 'rating' => ['4.5']
205
+ }
206
+ }]
207
+
208
+ test_parsing(@md, %r{}, expected_results)
209
+ end
210
+
211
+ end
212
+
213
+ describe Mida::Document, 'when run against a full html document containing one itemscope nested within another' do
214
+
215
+ before do
216
+ html = '
217
+ <html><body>
218
+ There is some text here
219
+ <div>
220
+ and also some here
221
+ <div itemscope>
222
+ <span itemprop="itemreviewed">Romeo Pizza</span>
223
+ <div itemprop="address" itemscope>
224
+ <span itemprop="firstline">237 Italian Way</span>
225
+ <span itemprop="country">United Kingdom</span>
226
+ </div>
227
+ Rating: <span itemprop="rating">4.5</span>
228
+ </div>
229
+ </div>
230
+ </body></html>
231
+ '
232
+
233
+ @md = Mida::Document.new(html)
234
+
235
+ end
236
+
237
+ it_should_behave_like 'one root itemscope'
238
+
239
+ it 'should return all the properties and types with the correct values' do
240
+ expected_results = [{
241
+ type: nil,
242
+ id: nil,
243
+ properties: {
244
+ 'itemreviewed' => ['Romeo Pizza'],
245
+ 'address' => [{
246
+ type: nil, id: nil, properties: {
247
+ 'firstline' => ['237 Italian Way'],
248
+ 'country' => ['United Kingdom']
249
+ }
250
+ }],
251
+ 'rating' => ['4.5']
252
+ }
253
+ }]
254
+
255
+ test_parsing(@md, %r{}, expected_results)
256
+ end
257
+
258
+ end
259
+
260
+ describe Mida::Document, 'when run against a full html document containing one itemscope nested within another within another' do
261
+
262
+ before do
263
+ html = '
264
+ <html><body>
265
+ There is some text here
266
+ <div>
267
+ and also some here
268
+ <div itemscope>
269
+ <span itemprop="itemreviewed">Romeo Pizza</span>
270
+ <div itemprop="address" itemscope>
271
+ <div itemprop="firstline" itemscope>
272
+ <span itemprop="number">237</span>
273
+ <span itemprop="road">Italian Way</span>
274
+ </div>
275
+ <span itemprop="country">United Kingdom</span>
276
+ </div>
277
+ Rating: <span itemprop="rating">4.5</span>
278
+ </div>
279
+ </div>
280
+ </body></html>
281
+ '
282
+
283
+ @md = Mida::Document.new(html)
284
+ end
285
+
286
+ it_should_behave_like 'one root itemscope'
287
+
288
+ it 'should return all the properties and types with the correct values' do
289
+ expected_results = [{
290
+ type: nil,
291
+ id: nil,
292
+ properties: {
293
+ 'itemreviewed' => ['Romeo Pizza'],
294
+ 'address' => [{
295
+ type: nil,
296
+ id: nil,
297
+ properties: {
298
+ 'firstline' => [{
299
+ type: nil,
300
+ id: nil,
301
+ properties: {
302
+ 'number' => ['237'],
303
+ 'road' => ['Italian Way']
304
+ },
305
+ }],
306
+ 'country' => ['United Kingdom']
307
+ },
308
+ }],
309
+ 'rating' => ['4.5']
310
+ }
311
+ }]
312
+
313
+ test_parsing(@md, %r{^$}, expected_results)
314
+ end
315
+
316
+ end
317
+
318
+ describe Mida::Document, 'when run against a full html document containing one itemscope with an itemtype' do
319
+
320
+ before do
321
+ html = '
322
+ <html><body>
323
+ There is some text here
324
+ <div>
325
+ and also some here
326
+ <div itemscope itemtype="http://data-vocabulary.org/Review">
327
+ <span itemprop="itemreviewed">Romeo Pizza</span>
328
+ Reviewed by <span itemprop="reviewer">Ulysses Grant</span> on
329
+ <time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
330
+ <span itemprop="summary">Delicious, tasty pizza in Eastlake!</span>
331
+ <span itemprop="description">This is a very nice pizza place.</span>
332
+ Rating: <span itemprop="rating">4.5</span>
333
+ </div>
334
+ </div>
335
+ </body></html>
336
+ '
337
+
338
+ @md = Mida::Document.new(html)
339
+
340
+ end
341
+
342
+ it_should_behave_like 'one root itemscope'
343
+
344
+ it 'should find the correct number of itemscopes if outer specified' do
345
+ @md.search(%r{http://data-vocabulary.org/Review}).size.should == 1
346
+ end
347
+
348
+ it 'should specify the correct type' do
349
+ @md.search(%r{http://data-vocabulary.org/Review}).first.type.should == 'http://data-vocabulary.org/Review'
350
+ end
351
+
352
+ it 'should return all the properties and types with the correct values' do
353
+ expected_results = [{
354
+ type: 'http://data-vocabulary.org/Review',
355
+ id: nil,
356
+ properties: {
357
+ 'itemreviewed' => ['Romeo Pizza'],
358
+ 'reviewer' => ['Ulysses Grant'],
359
+ 'dtreviewed' => ['2009-01-06'],
360
+ 'summary' => ['Delicious, tasty pizza in Eastlake!'],
361
+ 'description' => ['This is a very nice pizza place.'],
362
+ 'rating' => ['4.5']
363
+ }
364
+ }]
365
+ test_parsing(@md, %r{http://data-vocabulary.org/Review}, expected_results)
366
+ end
367
+
368
+ end
369
+
370
+ describe Mida::Document, 'when run against a full html document containing two non-nested itemscopes with itemtypes' do
371
+
372
+ before do
373
+ html = '
374
+ <html><body>
375
+ There is some text here
376
+ <div>
377
+ and also some here
378
+ <div itemscope itemtype="http://data-vocabulary.org/Review">
379
+ <span itemprop="itemreviewed">Romeo Pizza</span>
380
+ Rating: <span itemprop="rating">4.5</span>
381
+ </div>
382
+ <div itemscope itemtype="http://data-vocabulary.org/Organization">
383
+ <span itemprop="name">An org name</span>
384
+ <span itemprop="url">http://example.com</span>
385
+ </div>
386
+ </div>
387
+ </body></html>
388
+ '
389
+
390
+ @md = Mida::Document.new(html)
391
+
392
+ end
393
+
394
+ it 'should return all the itemscopes' do
395
+ @md.items.size.should == 2
396
+ end
397
+
398
+ it 'should give the type of each itemscope if none specified' do
399
+ itemscope_names = {
400
+ 'http://data-vocabulary.org/Review' => 0,
401
+ 'http://data-vocabulary.org/Organization' => 0
402
+ }
403
+
404
+ @md.items.each do |item|
405
+ itemscope_names[item.type] += 1
406
+ end
407
+
408
+ itemscope_names.size.should eq 2
409
+ itemscope_names.each { |name, num| num.should == 1 }
410
+ end
411
+
412
+
413
+ it 'should return all the properties and types with the correct values for 1st itemscope' do
414
+ expected_results = [{
415
+ type: 'http://data-vocabulary.org/Review',
416
+ id: nil,
417
+ properties: {
418
+ 'itemreviewed' => ['Romeo Pizza'],
419
+ 'rating' => ['4.5']
420
+ }
421
+ }]
422
+ test_parsing(@md, %r{http://data-vocabulary.org/Review}, expected_results)
423
+ end
424
+
425
+ it 'should return all the properties from the text for 2nd itemscope' do
426
+ expected_results = [{
427
+ type: 'http://data-vocabulary.org/Organization',
428
+ id: nil,
429
+ properties: {
430
+ 'name' => ['An org name'],
431
+ 'url' => ['http://example.com']
432
+ }
433
+ }]
434
+ test_parsing(@md, %r{http://data-vocabulary.org/Organization}, expected_results)
435
+ end
436
+
437
+ end
438
+
439
+ describe Mida::Document, 'when run against a full html document containing one
440
+ itemscope nested within another and the inner block is
441
+ surrounded with another non itemscope block' do
442
+
443
+ before do
444
+ html = '
445
+ <html><body>
446
+ <div itemscope itemtype="http://data-vocabulary.org/Product">
447
+ <ul class="reviews">
448
+ <li id="model" itemprop="name">DC07</li>
449
+ <li id="make" itemprop="brand">Dyson</li>
450
+ <li itemprop="review" itemscope itemtype="http://data-vocabulary.org/Review-aggregate">
451
+ <span class="ratingDetails">
452
+ <span itemprop="count">1</span> Review,
453
+ Average: <span itemprop="rating">5.0</span>
454
+ </span>
455
+ </li>
456
+ </ul>
457
+ </div>
458
+ </body></html>
459
+ '
460
+
461
+ @md = Mida::Document.new(html)
462
+ end
463
+
464
+ it_should_behave_like 'one root itemscope'
465
+
466
+ it 'should return the correct number of itemscopes' do
467
+ vocabularies = [
468
+ %r{http://data-vocabulary.org/Product},
469
+ %r{http://data-vocabulary.org/Review-aggregate}
470
+ ]
471
+ vocabularies.each {|vocabulary| @md.search(vocabulary).size.should == 1}
472
+ end
473
+
474
+ context "when looking at the outer vocabulary" do
475
+ it 'should return all the properties from the text with the correct values' do
476
+ expected_results = [{
477
+ type: 'http://data-vocabulary.org/Product',
478
+ id: nil,
479
+ properties: {
480
+ 'name' => ['DC07'],
481
+ 'brand' => ['Dyson'],
482
+ 'review' => [{
483
+ type: 'http://data-vocabulary.org/Review-aggregate',
484
+ id: nil,
485
+ properties: {
486
+ 'count' => ['1'],
487
+ 'rating' => ['5.0']
488
+ }
489
+ }]
490
+ }
491
+ }]
492
+
493
+ test_parsing(@md, %r{http://data-vocabulary.org/Product}, expected_results)
494
+ end
495
+ end
496
+
497
+ end
498
+
499
+ describe Mida::Document, 'when run against a document containing an itemscope
500
+ that contains another non-linked itemscope' do
501
+
502
+ before do
503
+ html = '
504
+ <html><body>
505
+ <div itemscope itemtype="http://data-vocabulary.org/Product">
506
+ <ul class="reviews">
507
+ <li id="model" itemprop="name">DC07</li>
508
+ <li id="make" itemprop="brand">Dyson</li>
509
+ <li itemscope itemtype="http://data-vocabulary.org/Review-aggregate">
510
+ <span class="ratingDetails">
511
+ <span itemprop="count">1</span> Review,
512
+ Average: <span itemprop="rating">5.0</span>
513
+ </span>
514
+ </li>
515
+ </ul>
516
+ </div>
517
+ </body></html>
518
+ '
519
+
520
+ @md = Mida::Document.new(html)
521
+ end
522
+
523
+ it 'should return the correct number of itemscopes when search used' do
524
+ vocabularies = {
525
+ %r{} => 2,
526
+ %r{http://data-vocabulary.org/Product} => 1,
527
+ %r{http://data-vocabulary.org/Review-aggregate} => 1
528
+ }
529
+ vocabularies.each {|vocabulary, num| @md.search(vocabulary).size.should == num}
530
+ end
531
+
532
+ it 'should return the correct number of items' do
533
+ @md.items.size.should == 2
534
+ end
535
+
536
+ context "when no vocabulary specified or looking at the outer vocabulary" do
537
+ it 'should return all the properties from the text with the correct values' do
538
+ pending("get the contains: feature working")
539
+ expected_result = {
540
+ type: 'http://data-vocabulary.org/Product',
541
+ id: nil,
542
+ properties: {
543
+ 'name' => 'DC07',
544
+ 'brand' => 'Dyson'
545
+ },
546
+ contains: {
547
+ type: 'http://data-vocabulary.org/Review-aggregate',
548
+ id: nil,
549
+ properties: {
550
+ 'count' => '1',
551
+ 'rating' => '5.0'
552
+ }
553
+ }
554
+ }
555
+
556
+ @md.search('http://data-vocabulary.org/Product').first.should == expected_result
557
+ end
558
+ end
559
+ end
560
+
561
+ describe Mida::Document, 'when run against a document using itemrefs' do
562
+
563
+ before do
564
+ html = '
565
+ <html><body>
566
+ <div itemscope id="amanda" itemref="a b">
567
+ <span itemprop="age">30</span>
568
+ </div>
569
+ <p id="a">Name: <span itemprop="name">Amanda</span></p>
570
+ <div id="b" itemprop="band" itemscope itemref="c"></div>
571
+ <div id="c">
572
+ <p>Band: <span itemprop="name">Jazz Band</span></p>
573
+ <p>Size: <span itemprop="size">12</span> players</p>
574
+ </div>
575
+ </body></html>
576
+ '
577
+
578
+ @md = Mida::Document.new(html)
579
+ end
580
+
581
+ it 'should return all the properties from the text with the correct values' do
582
+ expected_results = [{
583
+ type: nil,
584
+ id: nil,
585
+ properties: {
586
+ 'name' => ['Amanda'],
587
+ 'band' => [{
588
+ type: nil,
589
+ id: nil,
590
+ properties: {
591
+ 'name' => ['Jazz Band'],
592
+ 'size' => ['12']
593
+ }
594
+ }],
595
+ 'age' => ['30']
596
+ }
597
+ }]
598
+
599
+ test_parsing(@md, %r{}, expected_results)
600
+ end
601
+ end
602
+
603
+ describe Mida::Document, 'when run against a document using multiple itemprops with the same name' do
604
+
605
+ before do
606
+ html = '
607
+ <html><body>
608
+ <div itemscope itemtype="icecreams">
609
+ <p>Flavours in my favourite ice cream:</p>
610
+ <ul>
611
+ <li itemprop="flavour">Lemon sorbet</li>
612
+ <li itemprop="flavour">Apricot sorbet</li>
613
+ <li itemprop="flavour" itemscope itemtype="icecream-type">
614
+ <span itemprop="fruit">Strawberry</span>
615
+ <span itemprop="style">Homemade</span>
616
+ </li>
617
+ </ul>
618
+ </div>
619
+ </body></html>
620
+ '
621
+
622
+ @md = Mida::Document.new(html)
623
+ end
624
+
625
+ it_should_behave_like 'one root itemscope'
626
+
627
+ it 'should return the correct number of itemscopes' do
628
+ vocabularies = [
629
+ %r{icecreams},
630
+ %r{icecream-type}
631
+ ]
632
+ vocabularies.each {|vocabulary| @md.search(vocabulary).size.should == 1}
633
+ end
634
+
635
+ it 'should return all the properties from the text with the correct values' do
636
+ expected_results = [{
637
+ type: 'icecreams',
638
+ id: nil,
639
+ properties: {
640
+ 'flavour' => [
641
+ 'Lemon sorbet',
642
+ 'Apricot sorbet',
643
+ { type: 'icecream-type',
644
+ id: nil,
645
+ properties: {
646
+ 'fruit' => ['Strawberry'],
647
+ 'style' => ['Homemade']
648
+ }
649
+ }
650
+ ]
651
+ }
652
+ }]
653
+
654
+ test_parsing(@md, %r{icecreams}, expected_results)
655
+ end
656
+ end
657
+
658
+ describe Mida::Document, 'when run against a document using an itemprop with multiple properties' do
659
+
660
+ before do
661
+ html = '
662
+ <html><body>
663
+ <div itemscope>
664
+ <span itemprop="favourite-colour favourite-fruit">orange</span>
665
+ </div>
666
+ </body></html>
667
+ '
668
+
669
+ @md = Mida::Document.new(html)
670
+ end
671
+
672
+ it 'should return all the properties from the text with the correct values' do
673
+ expected_results = [{
674
+ type: nil,
675
+ id: nil,
676
+ properties: {
677
+ 'favourite-colour' => ['orange'],
678
+ 'favourite-fruit' => ['orange']
679
+ }
680
+ }]
681
+
682
+ test_parsing(@md, %r{}, expected_results)
683
+ end
684
+ end