mida 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.rdoc +21 -0
- data/README.rdoc +68 -0
- data/Rakefile +26 -0
- data/TODO.rdoc +6 -0
- data/lib/mida.rb +6 -0
- data/lib/mida/document.rb +61 -0
- data/lib/mida/item.rb +100 -0
- data/lib/mida/property.rb +70 -0
- data/spec/document_spec.rb +684 -0
- data/spec/item_spec.rb +393 -0
- data/spec/property_spec.rb +152 -0
- data/spec/spec_helper.rb +41 -0
- metadata +172 -0
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
= The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2011 Lawrence Woodman
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
= Mida
|
2
|
+
|
3
|
+
* {Mida Project Page}[https://github.com/LawrenceWoodman/mida]
|
4
|
+
* {Mida Bug Tracker}[https://github.com/LawrenceWoodman/mida/issues]
|
5
|
+
|
6
|
+
== Description
|
7
|
+
A Microdata[http://en.wikipedia.org/wiki/Microdata_(HTML5)] parser and
|
8
|
+
extractor library for Ruby.
|
9
|
+
This is based on the latest Published version of the Microdata Specification
|
10
|
+
dated {5th April 2011}[http://www.w3.org/TR/2011/WD-microdata-20110405/].
|
11
|
+
|
12
|
+
== Installation
|
13
|
+
With Ruby and Rubygems:
|
14
|
+
gem install mida
|
15
|
+
|
16
|
+
=== Requirements:
|
17
|
+
|
18
|
+
* +Nokogiri+
|
19
|
+
|
20
|
+
== Usage
|
21
|
+
The following examples assume that you have required +mida+ and
|
22
|
+
+open-uri+.
|
23
|
+
|
24
|
+
=== Extracting Microdata from a page
|
25
|
+
All the Microdata is extracted from a page when a new <tt>Mida::Document</tt> instance
|
26
|
+
is created.
|
27
|
+
|
28
|
+
To extract all the Microdata from a webpage:
|
29
|
+
url = 'http://example.com'
|
30
|
+
open(url) {|f| doc = Mida::Document.new(f, url)}
|
31
|
+
|
32
|
+
The top-level +Items+ will be held in an array accessible via
|
33
|
+
<tt>doc.items</tt>.
|
34
|
+
|
35
|
+
To simply list all the top-level +Items+ that have been found:
|
36
|
+
puts doc.items
|
37
|
+
|
38
|
+
=== Searching
|
39
|
+
If you want to search for an +Item+ that has a specific +itemtype+/vocabulary
|
40
|
+
this can be done with the +search+ method.
|
41
|
+
|
42
|
+
To return all the +Items+ that use one of Google's Review vocabularies:
|
43
|
+
doc.search(%r{http://data-vocabulary\.org.*?review.*?}i)
|
44
|
+
|
45
|
+
=== Inspecting an +Item+
|
46
|
+
Each +Item+ is a <tt>Mida::Item</tt> instance and has three main methods of
|
47
|
+
interest, +type+, +properties+ and +id+.
|
48
|
+
|
49
|
+
To find out the +itemtype+ of the +Item+:
|
50
|
+
puts doc.items.first.type
|
51
|
+
|
52
|
+
To find out the +itemid+ of the +Item+:
|
53
|
+
puts doc.items.first.id
|
54
|
+
|
55
|
+
Properties are returned as a hash containing name/values pairs. The
|
56
|
+
values will be an array of either +String+ or <tt>Mida::Item</tt> instances.
|
57
|
+
|
58
|
+
To see the +properties+ of the +Item+:
|
59
|
+
puts doc.items.first.properties
|
60
|
+
|
61
|
+
== Bugs/Feature Requests
|
62
|
+
If you find a bug or want to make a feature request, please report it at the
|
63
|
+
Mida project's {issues tracker}[https://github.com/LawrenceWoodman/mida/issues]
|
64
|
+
on GitHub.
|
65
|
+
|
66
|
+
== License
|
67
|
+
Copyright (c) 2011 Lawrence Woodman.
|
68
|
+
This software is licensed under the MIT License. Please see the file, LICENSE.rdoc, for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Running plain `rake` runs the specs.
task :default => :spec

desc "Create Gem"
# Gem::PackageTask (from rubygems/package_task) is the RubyGems-provided
# replacement for the deprecated Rake::GemPackageTask.
require 'rubygems/package_task'
spec = Gem::Specification.new do |s|
  s.name = "mida"
  s.summary = "A Microdata parser"
  # The whole README is used as the long description of the gem.
  s.description = File.read(File.join(File.dirname(__FILE__), 'README.rdoc'))
  s.version = "0.0.0"
  s.author = "Lawrence Woodman"
  s.email = "lwoodman@vlifesystems.com"
  s.homepage = %q{http://github.com/LawrenceWoodman/mida}
  s.platform = Gem::Platform::RUBY
  s.required_ruby_version = '>=1.9'
  s.files = Dir['lib/**/*.rb'] + Dir['spec/**/*.rb'] + Dir['*.rdoc'] + Dir['Rakefile']
  # has_rdoc= is deprecated in RubyGems; only call it where the setter
  # is still defined so that building the gem does not fail without it.
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)
  s.extra_rdoc_files = ['README.rdoc', 'LICENSE.rdoc']
  s.rdoc_options << '--main' << 'README.rdoc'
  s.add_dependency('nokogiri')
  s.add_development_dependency('rspec')
end
# No block is passed to new, so the tasks are defined explicitly.
Gem::PackageTask.new(spec).define

desc "Run Specs"
require 'rspec/core/rake_task'
RSpec::Core::RakeTask.new(:spec)
|
data/TODO.rdoc
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
= Todo List
|
2
|
+
|
3
|
+
* Support img rating in alt for google?
|
4
|
+
* Look further into the extra complications of Microdata, e.g. the alt attribute for img ratings and different size ratings
|
5
|
+
http://www.google.com/support/webmasters/bin/answer.py?answer=172705
|
6
|
+
* Put nested itemscopes that are not a property of their parent into the parent's hash using [:nested]
|
data/lib/mida.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Mida

  # Class that holds the extracted Microdata
  class Document

    # An Array of Mida::Item objects.  These are all top-level
    # and hence not properties of other Items
    attr_reader :items

    # Create a new Document and extract its top-level Items
    #
    # [target] The html that you want to parse (handed to Nokogiri())
    # [page_url] The url of target, used to form absolute urls. This must
    #            include the filename, e.g. index.html.
    def initialize(target, page_url=nil)
      @doc = Nokogiri(target)
      @page_url = page_url
      @items = extract_items
    end

    # Returns an array of matching Mida::Item objects, including Items
    # nested inside the properties of other Items
    #
    # [vocabulary] A Regexp to match the item types against, or a String
    #              that the item type must equal
    # [items] The Items to search, defaults to the top-level Items
    def search(vocabulary, items=@items)
      items.each_with_object([]) do |item, found_items|
        # item.type can be nil; to_s turns that into an empty string so
        # that a vocabulary matching the empty string finds untyped Items.
        # === lets both Regexp and String vocabularies be used.
        found_items << item if vocabulary === item.type.to_s
        found_items.concat(search_values(item.properties.values, vocabulary))
      end
    end

  private

    # Builds a Mida::Item for every itemscope element that is not itself
    # an itemprop. Always returns an Array so that search can iterate it.
    def extract_items
      items_doc = @doc.search('//*[@itemscope and not(@itemprop)]')
      return [] unless items_doc

      items_doc.map { |item_doc| Item.new(item_doc, @page_url) }
    end

    # Searches property values, which may be Items, nested Arrays or
    # plain values, and returns the Items that match vocabulary
    def search_values(values, vocabulary)
      values.each_with_object([]) do |value, found_items|
        case value
        when Mida::Item then found_items.concat(search(vocabulary, [value]))
        when Array then found_items.concat(search_values(value, vocabulary))
        end
      end
    end

  end

end
|
data/lib/mida/item.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Mida

  # Class that holds each item/itemscope
  class Item
    # The Type of the item (its itemtype attribute), or nil if it has none
    attr_reader :type

    # The Global Identifier of the item (its itemid attribute), or nil
    attr_reader :id

    # A Hash representing the properties as name/values pairs
    # The values will be an array containing either +String+
    # or <tt>Mida::Item</tt> instances
    attr_reader :properties

    # Create a new Item object
    #
    # [itemscope] The itemscope that you want to parse
    # [page_url] The url of target, used to form absolute urls
    def initialize(itemscope, page_url=nil)
      @itemscope, @page_url = itemscope, page_url
      @type, @id = extract_attribute('itemtype'), extract_attribute('itemid')
      @properties = {}
      # Properties referred to by itemref are added before the item's own
      add_itemref_properties
      traverse_elements(extract_elements(itemscope))
    end

    # Return a Hash representation
    # of the form {type: 'The item type', id: 'The item id',
    #              properties: {'a name' => ['avalue'] }}
    def to_h
      {type: @type, id: @id, properties: properties_to_h(@properties)}
    end

    def to_s
      to_h.to_s
    end

    # Two Items are equal when their type, id and properties are equal.
    # Anything that is not an Item is never equal, rather than raising.
    def ==(other)
      other.is_a?(Item) &&
        @type == other.type &&
        @id == other.id &&
        @properties == other.properties
    end

  private

    # The value of the named attribute of the itemscope, or nil
    def extract_attribute(attribute)
      node = @itemscope.attribute(attribute)
      node ? node.value : nil
    end

    # The direct child elements of the given element
    def extract_elements(itemscope)
      itemscope.search('./*')
    end

    # Quote text as an XPath string literal. XPath has no escape
    # character, so text containing both kinds of quote is built
    # with concat().
    def xpath_literal(text)
      return "'#{text}'" unless text.include?("'")
      return %("#{text}") unless text.include?('"')

      pieces = text.split("'", -1).map { |piece| "'#{piece}'" }
      "concat(#{pieces.join(%q{, "'", })})"
    end

    # Find an element with a matching id
    def find_with_id(id)
      # The id comes from the parsed page, so it is quoted rather than
      # interpolated raw to stop a quote in it changing the query
      @itemscope.search("//*[@id=#{xpath_literal(id)}]")
    end

    # The value as it should appear in to_h()
    def value_to_h(value)
      case value
      when Array then value.map { |element| value_to_h(element) }
      when Item then value.to_h
      else value
      end
    end

    # The properties Hash with every value converted by value_to_h
    def properties_to_h(properties)
      properties.each_with_object({}) do |(name, value), hash|
        hash[name] = value_to_h(value)
      end
    end

    # Add any properties referred to by 'itemref'
    def add_itemref_properties
      itemref = extract_attribute('itemref')
      return unless itemref

      itemref.split.each { |id| traverse_elements(find_with_id(id)) }
    end

    # Walk the elements adding every itemprop found to the properties.
    # An element with itemprop is added even if it has child elements.
    # The walk does not descend into an element with itemscope, as its
    # contents belong to that nested item.
    def traverse_elements(elements)
      elements.each do |element|
        add_itemprop(element) if element.attribute('itemprop')
        next if element.attribute('itemscope')

        traverse_elements(extract_elements(element))
      end
    end

    # Parse the itemprop element and append its value to each property
    # name that it declares
    def add_itemprop(itemprop)
      properties = Property.parse(itemprop, @page_url)
      properties.each { |name, value| (@properties[name] ||= []) << value }
    end

  end

end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module Mida

  # Module that parses itemprop elements
  module Property

    # Returns a Hash representing the property.
    # Hash is of the form {'property name' => 'value'}
    # There is one entry for each name in the itemprop attribute, and the
    # Hash is empty if the element has no itemprop attribute.
    # [element] The itemprop element to be parsed
    # [page_url] The url of the page, including the filename, used to form absolute urls
    def self.parse(element, page_url=nil)
      extract_property_names(element).each_with_object({}) do |name, hash|
        hash[name] = extract_property(element, page_url)
      end
    end

    # Elements whose value is held in an attribute rather than in their
    # text content, mapped to the name of that attribute
    NON_TEXTCONTENT_ELEMENTS = {
      'a' => 'href',        'area' => 'href',
      'audio' => 'src',     'embed' => 'src',
      'iframe' => 'src',    'img' => 'src',
      'link' => 'href',     'meta' => 'content',
      'object' => 'data',   'source' => 'src',
      'time' => 'datetime', 'track' => 'src',
      'video' => 'src'
    }.freeze

    # The attributes whose values are urls and so are made absolute
    URL_ATTRIBUTES = ['data', 'href', 'src'].freeze

    # This returns an empty string if can't form a valid
    # absolute url as per the Microdata spec.
    # [url] The url found in the page, returned unchanged if already absolute
    # [page_url] The url of the page that a relative url is resolved against
    def self.make_absolute_url(url, page_url)
      return url unless URI.parse(url).relative?
      return '' if page_url.nil?

      URI.parse(page_url).merge(url).to_s
    rescue URI::Error
      # Covers an unparsable url as well as an unparsable or relative page_url
      ''
    end

    # The names in the itemprop attribute, or an empty Array if the
    # element doesn't have one
    def self.extract_property_names(itemprop)
      itemprop_attr = itemprop.attribute('itemprop')
      itemprop_attr ? itemprop_attr.value.split : []
    end

    # The value of a property that isn't itself an item: either the
    # relevant attribute for the elements in NON_TEXTCONTENT_ELEMENTS,
    # or the text content of the element.
    def self.extract_property_value(itemprop, page_url)
      attribute = NON_TEXTCONTENT_ELEMENTS[itemprop.name]
      return itemprop.inner_text unless attribute

      node = itemprop.attribute(attribute)
      if node.nil?
        # A time element without datetime uses its text content; any
        # other element missing its attribute has the empty string as
        # its value.
        return attribute == 'datetime' ? itemprop.inner_text : ''
      end

      value = node.value
      URL_ATTRIBUTES.include?(attribute) ? make_absolute_url(value, page_url) : value
    end

    # The property as a Mida::Item if the element is an itemscope,
    # otherwise its value
    def self.extract_property(itemprop, page_url)
      if itemprop.attribute('itemscope')
        Mida::Item.new(itemprop, page_url)
      else
        extract_property_value(itemprop, page_url)
      end
    end

  end

end
|
@@ -0,0 +1,684 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
require_relative '../lib/mida'
|
3
|
+
|
4
|
+
# Searches md for vocabulary and checks each expected result, in order,
# against the found item at the same position: first its to_h output and
# then its individual properties.
def test_parsing(md, vocabulary, expected_results)
  found = md.search(vocabulary)
  expected_results.zip(found).each do |expected, candidate|
    test_to_h(candidate, expected)
    test_properties(candidate, expected)
  end
end
|
12
|
+
|
13
|
+
# Expects the Hash form of item to equal expected_result.
def test_to_h(item, expected_result)
  actual = item.to_h
  actual.should == expected_result
end
|
16
|
+
|
17
|
+
# Checks the values of every property of item against the values held
# under the same name in expected_result[:properties].
def test_properties(item, expected_result)
  item.properties.each_pair do |property_name, values|
    match_array(values, expected_result[:properties][property_name])
  end
end
|
22
|
+
|
23
|
+
# Compares value_array with expected_results position by position.
# A Mida::Item has its properties checked recursively; any other value
# must equal the expected value.
def match_array(value_array, expected_results)
  value_array.each_with_index do |actual, position|
    expected = expected_results[position]
    case actual
    when Mida::Item then test_properties(actual, expected)
    else actual.should == expected
    end
  end
end
|
32
|
+
|
33
|
+
# Examples shared by the documents that contain exactly one top-level
# itemscope. They read the document under test from @md.
shared_examples_for 'one root itemscope' do
  it 'should not match itemscopes with different names' do
    matches = @md.search(%r{nothing})
    matches.size.should == 0
  end

  it 'should find the correct number of itemscopes' do
    top_level = @md.items
    top_level.size.should == 1
  end
end
|
42
|
+
|
43
|
+
describe Mida::Document, 'when run with a document containing textContent and non textContent itemprops' do
  # One itemscope on <head> and one on a <div>, the latter holding an
  # itemprop on each kind of element that is parsed.
  before do
    @html = '
<html>
<head itemscope>
<link itemprop="link_field" rel="stylesheet" type="text/css" href="stylesheet.css" />
</head>
<body>
There is some text here
<div>
and also some here
<div itemscope>
<span itemprop="span_field">Some span content</span>
<time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
<meta itemprop="meta_field" content="Some meta content">
<a itemprop="a_field1" href="http://example.com">non content</a>
<a itemprop="a_field2" href="welcome/index.html">non content</a>
<a itemprop="a_field3" href="/intro">non content</a>
<a itemprop="a_field4" href="/intro/index.html">non content</a>
<map name="somemap">
<area shape="rect" coords="0,0,50,120" href="left.html" />
<area itemprop="area_right" shape="rect" coords="51,0,120,120" href="right.html" />
</map>
<audio itemprop="audio_field" src="asound.ogg" controls="controls">
Audio tag not supported by your browser.
</audio>

<embed itemprop="embed_field" src="helloworld.swf" />
<iframe itemprop="iframe_field" src="http://www.example.com/iframe_test"></iframe>
<img itemprop="img_field" src="animage.png" width="120" height="120" usemap="#planetmap" />
<object itemprop="object_field" data="object.png" type="image/png" />
<audio controls="controls">
<source itemprop="source_field" src="song.ogg" type="audio/ogg" />
<track itemprop="track_field" src="atrack.ogg" />
Audio tag not supported by your browser.
</audio>
<video itemprop="video_field" src="movie.ogg" controls="controls">
Video tag not supported by your browser.
</video>
</div>
</div>
</body>
</html>
'
  end

  context 'when not given a page_url' do
    before do
      @md = Mida::Document.new(@html)
    end

    it 'should return all the properties and types with the correct values' do
      # Relative urls are expected to come back as empty strings here
      head_item = { type: nil, id: nil, properties: {'link_field' => ['']} }
      body_item = {
        type: nil,
        id: nil,
        properties: {
          'span_field' => ['Some span content'],
          'dtreviewed' => ['2009-01-06'],
          'meta_field' => ['Some meta content'],
          'a_field1' => ['http://example.com'],
          'a_field2' => [''],
          'a_field3' => [''],
          'a_field4' => [''],
          'area_right' => [''],
          'audio_field' => [''],
          'embed_field' => [''],
          'iframe_field' => ['http://www.example.com/iframe_test'],
          'img_field' => [''],
          'object_field' => [''],
          'source_field' => [''],
          'track_field' => [''],
          'video_field' => ['']
        }
      }

      test_parsing(@md, %r{}, [head_item, body_item])
    end
  end

  context 'when given a page_url' do
    before do
      @md = Mida::Document.new(@html, 'http://example.com/start/')
    end

    it 'should return all the properties and types with the correct values' do
      head_item = {
        type: nil,
        id: nil,
        properties: {
          'link_field' => ['http://example.com/start/stylesheet.css']
        }
      }
      body_item = {
        type: nil,
        id: nil,
        properties: {
          'span_field' => ['Some span content'],
          'dtreviewed' => ['2009-01-06'],
          'meta_field' => ['Some meta content'],
          'a_field1' => ['http://example.com'],
          'a_field2' => ['http://example.com/start/welcome/index.html'],
          'a_field3' => ['http://example.com/intro'],
          'a_field4' => ['http://example.com/intro/index.html'],
          'area_right' => ['http://example.com/start/right.html'],
          'audio_field' => ['http://example.com/start/asound.ogg'],
          'embed_field' => ['http://example.com/start/helloworld.swf'],
          'iframe_field' => ['http://www.example.com/iframe_test'],
          'img_field' => ['http://example.com/start/animage.png'],
          'object_field' => ['http://example.com/start/object.png'],
          'source_field' => ['http://example.com/start/song.ogg'],
          'track_field' => ['http://example.com/start/atrack.ogg'],
          'video_field' => ['http://example.com/start/movie.ogg']
        }
      }

      test_parsing(@md, %r{}, [head_item, body_item])
    end
  end

end
|
165
|
+
|
166
|
+
describe Mida::Document, 'when run against a full html document containing one itemscope with no itemtype' do

  # A single untyped itemscope nested inside a plain <div>
  before do
    html = '
<html><body>
There is some text here
<div>
and also some here
<div itemscope>
<span itemprop="itemreviewed">Romeo Pizza</span>
Reviewed by <span itemprop="reviewer">Ulysses Grant</span> on
<time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
<meta itemprop="fielda" content="a5482">

<span itemprop="summary">Delicious, tasty pizza in Eastlake!</span>
<span itemprop="description">This is a very nice pizza place.</span>
Rating: <span itemprop="rating">4.5</span>
</div>
</div>
</body></html>
'
    @md = Mida::Document.new(html)
  end

  it_should_behave_like 'one root itemscope'

  it 'should return all the properties and types with the correct values' do
    review = {
      type: nil,
      id: nil,
      properties: {
        'itemreviewed' => ['Romeo Pizza'],
        'reviewer' => ['Ulysses Grant'],
        'dtreviewed' => ['2009-01-06'],
        'fielda' => ['a5482'],
        'summary' => ['Delicious, tasty pizza in Eastlake!'],
        'description' => ['This is a very nice pizza place.'],
        'rating' => ['4.5']
      }
    }

    test_parsing(@md, %r{}, [review])
  end

end
|
212
|
+
|
213
|
+
describe Mida::Document, 'when run against a full html document containing one itemscope nested within another' do

  # The address itemscope is an itemprop of the outer itemscope
  before do
    html = '
<html><body>
There is some text here
<div>
and also some here
<div itemscope>
<span itemprop="itemreviewed">Romeo Pizza</span>
<div itemprop="address" itemscope>
<span itemprop="firstline">237 Italian Way</span>
<span itemprop="country">United Kingdom</span>
</div>
Rating: <span itemprop="rating">4.5</span>
</div>
</div>
</body></html>
'

    @md = Mida::Document.new(html)
  end

  it_should_behave_like 'one root itemscope'

  it 'should return all the properties and types with the correct values' do
    address = {
      type: nil, id: nil, properties: {
        'firstline' => ['237 Italian Way'],
        'country' => ['United Kingdom']
      }
    }
    review = {
      type: nil,
      id: nil,
      properties: {
        'itemreviewed' => ['Romeo Pizza'],
        'address' => [address],
        'rating' => ['4.5']
      }
    }

    test_parsing(@md, %r{}, [review])
  end

end
|
259
|
+
|
260
|
+
describe Mida::Document, 'when run against a full html document containing one itemscope nested within another within another' do

  # Three levels: the review holds an address, which holds a firstline
  before do
    html = '
<html><body>
There is some text here
<div>
and also some here
<div itemscope>
<span itemprop="itemreviewed">Romeo Pizza</span>
<div itemprop="address" itemscope>
<div itemprop="firstline" itemscope>
<span itemprop="number">237</span>
<span itemprop="road">Italian Way</span>
</div>
<span itemprop="country">United Kingdom</span>
</div>
Rating: <span itemprop="rating">4.5</span>
</div>
</div>
</body></html>
'

    @md = Mida::Document.new(html)
  end

  it_should_behave_like 'one root itemscope'

  it 'should return all the properties and types with the correct values' do
    firstline = {
      type: nil,
      id: nil,
      properties: {
        'number' => ['237'],
        'road' => ['Italian Way']
      }
    }
    address = {
      type: nil,
      id: nil,
      properties: {
        'firstline' => [firstline],
        'country' => ['United Kingdom']
      }
    }
    review = {
      type: nil,
      id: nil,
      properties: {
        'itemreviewed' => ['Romeo Pizza'],
        'address' => [address],
        'rating' => ['4.5']
      }
    }

    test_parsing(@md, %r{^$}, [review])
  end

end
|
317
|
+
|
318
|
+
describe Mida::Document, 'when run against a full html document containing one itemscope with an itemtype' do

  # A single itemscope typed as a data-vocabulary.org Review
  before do
    html = '
<html><body>
There is some text here
<div>
and also some here
<div itemscope itemtype="http://data-vocabulary.org/Review">
<span itemprop="itemreviewed">Romeo Pizza</span>
Reviewed by <span itemprop="reviewer">Ulysses Grant</span> on
<time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
<span itemprop="summary">Delicious, tasty pizza in Eastlake!</span>
<span itemprop="description">This is a very nice pizza place.</span>
Rating: <span itemprop="rating">4.5</span>
</div>
</div>
</body></html>
'

    @md = Mida::Document.new(html)
  end

  it_should_behave_like 'one root itemscope'

  it 'should find the correct number of itemscopes if outer specified' do
    reviews = @md.search(%r{http://data-vocabulary.org/Review})
    reviews.size.should == 1
  end

  it 'should specify the correct type' do
    reviews = @md.search(%r{http://data-vocabulary.org/Review})
    reviews.first.type.should == 'http://data-vocabulary.org/Review'
  end

  it 'should return all the properties and types with the correct values' do
    review = {
      type: 'http://data-vocabulary.org/Review',
      id: nil,
      properties: {
        'itemreviewed' => ['Romeo Pizza'],
        'reviewer' => ['Ulysses Grant'],
        'dtreviewed' => ['2009-01-06'],
        'summary' => ['Delicious, tasty pizza in Eastlake!'],
        'description' => ['This is a very nice pizza place.'],
        'rating' => ['4.5']
      }
    }
    test_parsing(@md, %r{http://data-vocabulary.org/Review}, [review])
  end

end
|
369
|
+
|
370
|
+
describe Mida::Document, 'when run against a full html document containing two non-nested itemscopes with itemtypes' do

  # Two sibling itemscopes: a Review followed by an Organization
  before do
    html = '
<html><body>
There is some text here
<div>
and also some here
<div itemscope itemtype="http://data-vocabulary.org/Review">
<span itemprop="itemreviewed">Romeo Pizza</span>
Rating: <span itemprop="rating">4.5</span>
</div>
<div itemscope itemtype="http://data-vocabulary.org/Organization">
<span itemprop="name">An org name</span>
<span itemprop="url">http://example.com</span>
</div>
</div>
</body></html>
'

    @md = Mida::Document.new(html)
  end

  it 'should return all the itemscopes' do
    top_level = @md.items
    top_level.size.should == 2
  end

  it 'should give the type of each itemscope if none specified' do
    # Tally of how often each expected type is seen among the items
    type_counts = {
      'http://data-vocabulary.org/Review' => 0,
      'http://data-vocabulary.org/Organization' => 0
    }

    @md.items.each { |item| type_counts[item.type] += 1 }

    type_counts.size.should eq 2
    type_counts.each do |name, num|
      num.should == 1
    end
  end

  it 'should return all the properties and types with the correct values for 1st itemscope' do
    review = {
      type: 'http://data-vocabulary.org/Review',
      id: nil,
      properties: {
        'itemreviewed' => ['Romeo Pizza'],
        'rating' => ['4.5']
      }
    }
    test_parsing(@md, %r{http://data-vocabulary.org/Review}, [review])
  end

  it 'should return all the properties from the text for 2nd itemscope' do
    organization = {
      type: 'http://data-vocabulary.org/Organization',
      id: nil,
      properties: {
        'name' => ['An org name'],
        'url' => ['http://example.com']
      }
    }
    test_parsing(@md, %r{http://data-vocabulary.org/Organization}, [organization])
  end

end
|
438
|
+
|
439
|
+
describe Mida::Document, 'when run against a full html document containing one
itemscope nested within another and the inner block is
surrounded with another non itemscope block' do

  # The review itemscope is an itemprop of the Product, but sits inside
  # a <ul> that is neither an itemscope nor an itemprop
  before do
    html = '
<html><body>
<div itemscope itemtype="http://data-vocabulary.org/Product">
<ul class="reviews">
<li id="model" itemprop="name">DC07</li>
<li id="make" itemprop="brand">Dyson</li>
<li itemprop="review" itemscope itemtype="http://data-vocabulary.org/Review-aggregate">
<span class="ratingDetails">
<span itemprop="count">1</span> Review,
Average: <span itemprop="rating">5.0</span>
</span>
</li>
</ul>
</div>
</body></html>
'

    @md = Mida::Document.new(html)
  end

  it_should_behave_like 'one root itemscope'

  it 'should return the correct number of itemscopes' do
    vocabularies = [
      %r{http://data-vocabulary.org/Product},
      %r{http://data-vocabulary.org/Review-aggregate}
    ]
    vocabularies.each do |vocabulary|
      @md.search(vocabulary).size.should == 1
    end
  end

  context "when looking at the outer vocabulary" do
    it 'should return all the properties from the text with the correct values' do
      review = {
        type: 'http://data-vocabulary.org/Review-aggregate',
        id: nil,
        properties: {
          'count' => ['1'],
          'rating' => ['5.0']
        }
      }
      product = {
        type: 'http://data-vocabulary.org/Product',
        id: nil,
        properties: {
          'name' => ['DC07'],
          'brand' => ['Dyson'],
          'review' => [review]
        }
      }

      test_parsing(@md, %r{http://data-vocabulary.org/Product}, [product])
    end
  end

end
|
498
|
+
|
499
|
+
describe Mida::Document, 'when run against a document containing an itemscope
that contains another non-linked itemscope' do

  # The inner Review-aggregate itemscope has no itemprop, so it is not
  # a property of the Product that surrounds it
  before do
    html = '
<html><body>
<div itemscope itemtype="http://data-vocabulary.org/Product">
<ul class="reviews">
<li id="model" itemprop="name">DC07</li>
<li id="make" itemprop="brand">Dyson</li>
<li itemscope itemtype="http://data-vocabulary.org/Review-aggregate">
<span class="ratingDetails">
<span itemprop="count">1</span> Review,
Average: <span itemprop="rating">5.0</span>
</span>
</li>
</ul>
</div>
</body></html>
'

    @md = Mida::Document.new(html)
  end

  it 'should return the correct number of itemscopes when search used' do
    # Each vocabulary mapped to the number of items it should find
    expected_counts = {
      %r{} => 2,
      %r{http://data-vocabulary.org/Product} => 1,
      %r{http://data-vocabulary.org/Review-aggregate} => 1
    }
    expected_counts.each do |vocabulary, num|
      @md.search(vocabulary).size.should == num
    end
  end

  it 'should return the correct number of items' do
    top_level = @md.items
    top_level.size.should == 2
  end

  context "when no vocabulary specified or looking at the outer vocabulary" do
    it 'should return all the properties from the text with the correct values' do
      pending("get the contains: feature working")
      contained = {
        type: 'http://data-vocabulary.org/Review-aggregate',
        id: nil,
        properties: {
          'count' => '1',
          'rating' => '5.0'
        }
      }
      expected_result = {
        type: 'http://data-vocabulary.org/Product',
        id: nil,
        properties: {
          'name' => 'DC07',
          'brand' => 'Dyson'
        },
        contains: contained
      }

      @md.search('http://data-vocabulary.org/Product').first.should == expected_result
    end
  end
end
|
560
|
+
|
561
|
+
# Fixture: a top-level itemscope that references elements "a" and "b" through
# itemref. Element "b" is itself an itemscope (property 'band') which in turn
# references element "c". The example expects one top-level item carrying
# 'name', a nested 'band' item and 'age'.
describe Mida::Document, 'when run against a document using itemrefs' do

  before do
    @md = Mida::Document.new('
      <html><body>
        <div itemscope id="amanda" itemref="a b">
          <span itemprop="age">30</span>
        </div>
        <p id="a">Name: <span itemprop="name">Amanda</span></p>
        <div id="b" itemprop="band" itemscope itemref="c"></div>
        <div id="c">
          <p>Band: <span itemprop="name">Jazz Band</span></p>
          <p>Size: <span itemprop="size">12</span> players</p>
        </div>
      </body></html>
    ')
  end

  it 'should return all the properties from the text with the correct values' do
    # Item expected under the 'band' property; its values come from element "c".
    band = {
      type: nil,
      id: nil,
      properties: {
        'name' => ['Jazz Band'],
        'size' => ['12']
      }
    }

    amanda = {
      type: nil,
      id: nil,
      properties: {
        'name' => ['Amanda'],
        'band' => [band],
        'age' => ['30']
      }
    }

    test_parsing(@md, %r{}, [amanda])
  end
end
|
602
|
+
|
603
|
+
# Fixture: an "icecreams" itemscope in which the 'flavour' itemprop appears
# three times - twice as plain text and once as a nested "icecream-type"
# itemscope. The examples expect all three values, in document order, under
# the single 'flavour' key.
describe Mida::Document, 'when run against a document using multiple itemprops with the same name' do

  before do
    @md = Mida::Document.new('
      <html><body>
        <div itemscope itemtype="icecreams">
          <p>Flavours in my favourite ice cream:</p>
          <ul>
            <li itemprop="flavour">Lemon sorbet</li>
            <li itemprop="flavour">Apricot sorbet</li>
            <li itemprop="flavour" itemscope itemtype="icecream-type">
              <span itemprop="fruit">Strawberry</span>
              <span itemprop="style">Homemade</span>
            </li>
          </ul>
        </div>
      </body></html>
    ')
  end

  it_should_behave_like 'one root itemscope'

  it 'should return the correct number of itemscopes' do
    # Each pattern is expected to match exactly one item.
    [%r{icecreams}, %r{icecream-type}].each do |vocabulary|
      @md.search(vocabulary).size.should == 1
    end
  end

  it 'should return all the properties from the text with the correct values' do
    # The third 'flavour' value is an item rather than a string.
    homemade_strawberry = {
      type: 'icecream-type',
      id: nil,
      properties: {
        'fruit' => ['Strawberry'],
        'style' => ['Homemade']
      }
    }

    icecreams = {
      type: 'icecreams',
      id: nil,
      properties: {
        'flavour' => ['Lemon sorbet', 'Apricot sorbet', homemade_strawberry]
      }
    }

    test_parsing(@md, %r{icecreams}, [icecreams])
  end
end
|
657
|
+
|
658
|
+
# Fixture: one element whose itemprop attribute lists two names separated by
# a space. The example expects the element's text to be recorded under both
# property names.
describe Mida::Document, 'when run against a document using an itemprop with multiple properties' do

  before do
    @md = Mida::Document.new('
      <html><body>
        <div itemscope>
          <span itemprop="favourite-colour favourite-fruit">orange</span>
        </div>
      </body></html>
    ')
  end

  it 'should return all the properties from the text with the correct values' do
    favourites = {
      type: nil,
      id: nil,
      properties: {
        'favourite-colour' => ['orange'],
        'favourite-fruit' => ['orange']
      }
    }

    test_parsing(@md, %r{}, [favourites])
  end
end
|