mida 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.rdoc +21 -0
- data/README.rdoc +68 -0
- data/Rakefile +26 -0
- data/TODO.rdoc +6 -0
- data/lib/mida.rb +6 -0
- data/lib/mida/document.rb +61 -0
- data/lib/mida/item.rb +100 -0
- data/lib/mida/property.rb +70 -0
- data/spec/document_spec.rb +684 -0
- data/spec/item_spec.rb +393 -0
- data/spec/property_spec.rb +152 -0
- data/spec/spec_helper.rb +41 -0
- metadata +172 -0
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
= The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2011 Lawrence Woodman
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
= Mida
|
2
|
+
|
3
|
+
* {Mida Project Page}[https://github.com/LawrenceWoodman/mida]
|
4
|
+
* {Mida Bug Tracker}[https://github.com/LawrenceWoodman/mida/issues]
|
5
|
+
|
6
|
+
== Description
|
7
|
+
A Microdata[http://en.wikipedia.org/wiki/Microdata_(HTML5)] parser and
|
8
|
+
extractor library for ruby.
|
9
|
+
This is based on the latest Published version of the Microdata Specification
|
10
|
+
dated {5th April 2011}[http://www.w3.org/TR/2011/WD-microdata-20110405/].
|
11
|
+
|
12
|
+
== Installation
|
13
|
+
With Ruby and Rubygems:
|
14
|
+
gem install mida
|
15
|
+
|
16
|
+
=== Requirements:
|
17
|
+
|
18
|
+
* +Nokogiri+
|
19
|
+
|
20
|
+
== Usage
|
21
|
+
The following examples assume that you have required +mida+ and
|
22
|
+
+open-uri+.
|
23
|
+
|
24
|
+
=== Extracting Microdata from a page
|
25
|
+
All the Microdata is extracted from a page when a new <tt>Mida::Document</tt> instance
|
26
|
+
is created.
|
27
|
+
|
28
|
+
To extract all the Microdata from a webpage:
|
29
|
+
url = 'http://example.com'
|
30
|
+
doc = open(url) {|f| Mida::Document.new(f, url)}
|
31
|
+
|
32
|
+
The top-level +Items+ will be held in an array accessible via
|
33
|
+
<tt>doc.items</tt>.
|
34
|
+
|
35
|
+
To simply list all the top-level +Items+ that have been found:
|
36
|
+
puts doc.items
|
37
|
+
|
38
|
+
=== Searching
|
39
|
+
If you want to search for an +Item+ that has a specific +itemtype+/vocabulary
|
40
|
+
this can be done with the +search+ method.
|
41
|
+
|
42
|
+
To return all the +Items+ that use one of Google's Review vocabularies:
|
43
|
+
doc.search(%r{http://data-vocabulary\.org.*?review.*?}i)
|
44
|
+
|
45
|
+
=== Inspecting an +Item+
|
46
|
+
Each +Item+ is a <tt>Mida::Item</tt> instance and has three main methods of
|
47
|
+
interest, +type+, +properties+ and +id+.
|
48
|
+
|
49
|
+
To find out the +itemtype+ of the +Item+:
|
50
|
+
puts doc.items.first.type
|
51
|
+
|
52
|
+
To find out the +itemid+ of the +Item+:
|
53
|
+
puts doc.items.first.id
|
54
|
+
|
55
|
+
Properties are returned as a hash containing name/values pairs. The
|
56
|
+
values will be an array of either +String+ or <tt>Mida::Item</tt> instances.
|
57
|
+
|
58
|
+
To see the +properties+ of the +Item+:
|
59
|
+
puts doc.items.first.properties
|
60
|
+
|
61
|
+
== Bugs/Feature Requests
|
62
|
+
If you find a bug or want to make a feature request, please report it at the
|
63
|
+
Mida project's {issues tracker}[https://github.com/LawrenceWoodman/mida/issues]
|
64
|
+
on github.
|
65
|
+
|
66
|
+
== License
|
67
|
+
Copyright (c) 2011 Lawrence Woodman.
|
68
|
+
This software is licensed under the MIT License. Please see the file, LICENSE.rdoc, for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# Rakefile for the mida gem: builds the gem package and runs the specs.

# Gem::PackageTask replaced Rake::GemPackageTask; fall back to the legacy
# task only where the RubyGems version is too old to provide the new one.
begin
  require 'rubygems/package_task'
  package_task_class = Gem::PackageTask
rescue LoadError
  require 'rake/gempackagetask'
  package_task_class = Rake::GemPackageTask
end
require 'rspec/core/rake_task'

task :default => :spec

spec = Gem::Specification.new do |s|
  s.name = "mida"
  s.summary = "A Microdata parser"
  # The whole README is used as the gem description
  s.description = File.read(File.join(File.dirname(__FILE__), 'README.rdoc'))
  s.version = "0.0.0"
  s.author = "Lawrence Woodman"
  s.email = "lwoodman@vlifesystems.com"
  s.homepage = %q{http://github.com/LawrenceWoodman/mida}
  s.platform = Gem::Platform::RUBY
  s.required_ruby_version = '>=1.9'
  s.files = Dir['lib/**/*.rb'] + Dir['spec/**/*.rb'] + Dir['*.rdoc'] + Dir['Rakefile']
  # has_rdoc= is deprecated in RubyGems; only set it where it still exists
  s.has_rdoc = true if s.respond_to?(:has_rdoc=)
  s.extra_rdoc_files = ['README.rdoc', 'LICENSE.rdoc']
  s.rdoc_options << '--main' << 'README.rdoc'
  s.add_dependency('nokogiri')
  s.add_development_dependency('rspec')
end

desc "Create Gem"
package_task_class.new(spec).define

desc "Run Specs"
RSpec::Core::RakeTask.new(:spec)
|
data/TODO.rdoc
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
= Todo List
|
2
|
+
|
3
|
+
* Support img rating in alt for google?
|
4
|
+
* Look further into the extra complications of microdata, e.g. alt tag for img rating and different size ratings
|
5
|
+
http://www.google.com/support/webmasters/bin/answer.py?answer=172705
|
6
|
+
* Put nested itemscopes that are not properties of their parent into the parent's hash using [:nested]
|
data/lib/mida.rb
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Mida

  # Class that holds the extracted Microdata
  class Document

    # An Array of Mida::Item objects.  These are all top-level
    # and hence not properties of other Items
    attr_reader :items

    # Create a new Microdata object
    #
    # [target] The html that you want to parse; it is passed straight
    #          to +Nokogiri()+
    # [page_url] The url of target, used to form absolute urls.  This must
    #            include the filename, e.g. index.html.
    def initialize(target, page_url=nil)
      @doc = Nokogiri(target)
      @page_url = page_url
      @items = extract_items
    end

    # Returns an array of matching Mida::Item objects.  Items held as
    # property values of other Items are searched as well.
    #
    # [vocabulary] A regexp to match the item types against.  A +String+
    #              is also accepted and is matched literally anywhere
    #              within the item type.
    # [items] The Items to search, defaults to the top-level Items
    def search(vocabulary, items=@items)
      pattern = vocabulary_pattern(vocabulary)
      found_items = []
      items.each do |item|
        # An Item without an itemtype is matched as if its type were the
        # empty string, otherwise it could never be matched
        found_items << item if (item.type || '') =~ pattern
        found_items.concat(search_values(item.properties.values, pattern))
      end
      found_items
    end

  private

    # Build the top-level Items: every itemscope element that is not
    # itself an itemprop.  Always returns an Array so that #search can
    # iterate over it.
    def extract_items
      items_doc = @doc.search('//*[@itemscope and not(@itemprop)]')
      return [] unless items_doc

      items_doc.collect do |item_doc|
        Item.new(item_doc, @page_url)
      end
    end

    # Turn a String vocabulary into a regexp that matches it literally;
    # anything else is used as given.
    def vocabulary_pattern(vocabulary)
      vocabulary.is_a?(String) ? Regexp.new(Regexp.escape(vocabulary)) : vocabulary
    end

    # Search property values, which may be nested Arrays, for Items
    # matching vocabulary.  Non-Item, non-Array values are ignored.
    def search_values(values, vocabulary)
      items = []
      values.each do |value|
        case value
        when Mida::Item then items.concat(search(vocabulary, [value]))
        when Array then items.concat(search_values(value, vocabulary))
        end
      end
      items
    end

  end

end
|
data/lib/mida/item.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
module Mida

  # Class that holds each item/itemscope
  class Item
    # The Type of the item
    attr_reader :type

    # The Global Identifier of the item
    attr_reader :id

    # A Hash representing the properties as name/values pairs
    # The values will be an array containing either +String+
    # or <tt>Mida::Item</tt> instances
    attr_reader :properties

    # Create a new Item object
    #
    # [itemscope] The itemscope that you want to parse
    # [page_url] The url of target, used to form absolute urls
    def initialize(itemscope, page_url=nil)
      @itemscope, @page_url = itemscope, page_url
      @type, @id = extract_attribute('itemtype'), extract_attribute('itemid')
      @properties = {}
      # Properties pulled in through itemref are added before the
      # itemscope's own descendants
      add_itemref_properties
      traverse_elements(extract_elements(itemscope))
    end

    # Return a Hash representation
    # of the form {type: 'The item type', id: 'The item id',
    # properties: {'a name' => ['avalue'] }}
    # Nested Items are converted to Hashes of the same form.
    def to_h
      {type: @type, id: @id, properties: properties_to_h(@properties)}
    end

    # The String form of #to_h
    def to_s
      to_h.to_s
    end

    # Two Items are equal when their type, id and properties are equal.
    # Comparing against an object that does not offer those three readers
    # returns false rather than raising.
    def ==(other)
      return false unless [:type, :id, :properties].all? { |reader| other.respond_to?(reader) }
      @type == other.type && @id == other.id && @properties == other.properties
    end

  private

    # The value of the named attribute on the itemscope, or nil if absent
    def extract_attribute(attribute)
      node = @itemscope.attribute(attribute)
      node && node.value
    end

    # The direct child elements of itemscope
    def extract_elements(itemscope)
      itemscope.search('./*')
    end

    # Find an element with a matching id
    def find_with_id(id)
      @itemscope.search("//*[@id=#{xpath_literal(id)}]")
    end

    # Quote string as an XPath string literal.  XPath has no escape
    # character, so a string holding both quote kinds is built with concat().
    def xpath_literal(string)
      return "'#{string}'" unless string.include?("'")
      return %Q{"#{string}"} unless string.include?('"')
      pieces = string.split("'", -1).collect { |piece| "'#{piece}'" }
      "concat(#{pieces.join(%q{, "'", })})"
    end

    # The value as it should appear in to_h()
    def value_to_h(value)
      case value
      when Array then value.collect { |element| value_to_h(element) }
      when Item then value.to_h
      else value
      end
    end

    # The properties Hash with every value converted by value_to_h
    def properties_to_h(properties)
      hash = {}
      properties.each { |name, value| hash[name] = value_to_h(value) }
      hash
    end

    # Add any properties referred to by 'itemref'
    #
    # NOTE(review): an itemprop/itemscope element whose itemref leads back
    # to itself looks like it would recurse without bound here - confirm
    # and guard if such documents must be supported.
    def add_itemref_properties
      itemref = extract_attribute('itemref')
      return unless itemref
      itemref.split.each { |id| traverse_elements(find_with_id(id)) }
    end

    # Walk elements collecting the properties that belong to this Item.
    #
    # Every element is offered to add_itemprop, which ignores elements
    # without an itemprop attribute.  The children of an element are then
    # walked too, unless the element is an itemscope: its descendants
    # belong to that nested Item, not to this one.  This means that an
    # itemprop containing further markup, e.g.
    # <span itemprop="name">A <b>B</b></span>, is still recorded.
    def traverse_elements(elements)
      elements.each do |element|
        add_itemprop(element)
        next if element.attribute('itemscope')
        traverse_elements(extract_elements(element))
      end
    end

    # Append the value of itemprop to each property name it declares
    def add_itemprop(itemprop)
      properties = Property.parse(itemprop, @page_url)
      properties.each { |name, value| (@properties[name] ||= []) << value }
    end

  end

end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module Mida

  # Module that parses itemprop elements
  module Property

    # Elements whose property value is taken from the named attribute
    # rather than from their text content
    NON_TEXTCONTENT_ELEMENTS = {
      'a' => 'href',        'area' => 'href',
      'audio' => 'src',     'embed' => 'src',
      'iframe' => 'src',    'img' => 'src',
      'link' => 'href',     'meta' => 'content',
      'object' => 'data',   'source' => 'src',
      'time' => 'datetime', 'track' => 'src',
      'video' => 'src'
    }.freeze

    # Attributes whose values are urls and so are made absolute
    URL_ATTRIBUTES = ['data', 'href', 'src'].freeze

    # Returns a Hash representing the property.
    # Hash is of the form {'property name' => 'value'}, with one entry
    # for each name listed in the itemprop attribute.
    # [element] The itemprop element to be parsed
    # [page_url] The url of the page, including the filename, used to form absolute urls
    def self.parse(element, page_url=nil)
      hash = {}
      extract_property_names(element).each do |name|
        hash[name] = extract_property(element, page_url)
      end
      hash
    end

    # Returns url unchanged if it is already absolute, otherwise
    # resolves it against page_url.
    # This returns an empty string if can't form a valid
    # absolute url as per the Microdata spec, which includes url or
    # page_url being malformed or page_url being nil.
    def self.make_absolute_url(url, page_url)
      return url unless URI.parse(url).relative?
      URI.parse(page_url).merge(url).to_s
    rescue URI::Error
      ''
    end

    # The property names listed in the itemprop attribute, or an empty
    # Array if the element has no itemprop attribute
    def self.extract_property_names(itemprop)
      itemprop_attr = itemprop.attribute('itemprop')
      itemprop_attr ? itemprop_attr.value.split() : []
    end

    # The value of a property that is not itself an Item: the relevant
    # attribute for the elements in NON_TEXTCONTENT_ELEMENTS, the text
    # content for everything else.
    def self.extract_property_value(itemprop, page_url)
      attribute = NON_TEXTCONTENT_ELEMENTS[itemprop.name]
      return itemprop.inner_text unless attribute

      attribute_node = itemprop.attribute(attribute)
      if attribute_node.nil?
        # A missing attribute gives the empty string, except for time
        # which falls back to its text content when it has no datetime
        return attribute == 'datetime' ? itemprop.inner_text : ''
      end

      value = attribute_node.value
      URL_ATTRIBUTES.include?(attribute) ? make_absolute_url(value, page_url) : value
    end

    # The property as a Mida::Item if the element is an itemscope,
    # otherwise as the value from extract_property_value
    def self.extract_property(itemprop, page_url)
      if itemprop.attribute('itemscope')
        Mida::Item.new(itemprop, page_url)
      else
        extract_property_value(itemprop, page_url)
      end
    end

  end

end
|
@@ -0,0 +1,684 @@
|
|
1
|
+
require_relative 'spec_helper'
|
2
|
+
require_relative '../lib/mida'
|
3
|
+
|
4
|
+
def test_parsing(md, vocabulary, expected_results)
|
5
|
+
items = md.search(vocabulary)
|
6
|
+
expected_results.each_with_index do |expected_result,i|
|
7
|
+
item = items[i]
|
8
|
+
test_to_h(item, expected_result)
|
9
|
+
test_properties(item, expected_result)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_to_h(item, expected_result)
|
14
|
+
item.to_h.should == expected_result
|
15
|
+
end
|
16
|
+
|
17
|
+
def test_properties(item, expected_result)
|
18
|
+
item.properties.each do |name, value|
|
19
|
+
match_array(value, expected_result[:properties][name])
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
def match_array(value_array, expected_results)
|
24
|
+
value_array.each_with_index do |element, i|
|
25
|
+
if element.is_a?(Mida::Item)
|
26
|
+
test_properties(element, expected_results[i])
|
27
|
+
else
|
28
|
+
element.should == expected_results[i]
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
shared_examples_for 'one root itemscope' do
|
34
|
+
it 'should not match itemscopes with different names' do
|
35
|
+
@md.search(%r{nothing}).size.should == 0
|
36
|
+
end
|
37
|
+
|
38
|
+
it 'should find the correct number of itemscopes' do
|
39
|
+
@md.items.size.should == 1
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
describe Mida::Document, 'when run with a document containing textContent and non textContent itemprops' do
|
44
|
+
before do
|
45
|
+
@html = '
|
46
|
+
<html>
|
47
|
+
<head itemscope>
|
48
|
+
<link itemprop="link_field" rel="stylesheet" type="text/css" href="stylesheet.css" />
|
49
|
+
</head>
|
50
|
+
<body>
|
51
|
+
There is some text here
|
52
|
+
<div>
|
53
|
+
and also some here
|
54
|
+
<div itemscope>
|
55
|
+
<span itemprop="span_field">Some span content</span>
|
56
|
+
<time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
|
57
|
+
<meta itemprop="meta_field" content="Some meta content">
|
58
|
+
<a itemprop="a_field1" href="http://example.com">non content</a>
|
59
|
+
<a itemprop="a_field2" href="welcome/index.html">non content</a>
|
60
|
+
<a itemprop="a_field3" href="/intro">non content</a>
|
61
|
+
<a itemprop="a_field4" href="/intro/index.html">non content</a>
|
62
|
+
<map name="somemap">
|
63
|
+
<area shape="rect" coords="0,0,50,120" href="left.html" />
|
64
|
+
<area itemprop="area_right" shape="rect" coords="51,0,120,120" href="right.html" />
|
65
|
+
</map>
|
66
|
+
<audio itemprop="audio_field" src="asound.ogg" controls="controls">
|
67
|
+
Audio tag not supported by your browser.
|
68
|
+
</audio>
|
69
|
+
|
70
|
+
<embed itemprop="embed_field" src="helloworld.swf" />
|
71
|
+
<iframe itemprop="iframe_field" src="http://www.example.com/iframe_test"></iframe>
|
72
|
+
<img itemprop="img_field" src="animage.png" width="120" height="120" usemap="#planetmap" />
|
73
|
+
<object itemprop="object_field" data="object.png" type="image/png" />
|
74
|
+
<audio controls="controls">
|
75
|
+
<source itemprop="source_field" src="song.ogg" type="audio/ogg" />
|
76
|
+
<track itemprop="track_field" src="atrack.ogg" />
|
77
|
+
Audio tag not supported by your browser.
|
78
|
+
</audio>
|
79
|
+
<video itemprop="video_field" src="movie.ogg" controls="controls">
|
80
|
+
Video tag not supported by your browser.
|
81
|
+
</video>
|
82
|
+
</div>
|
83
|
+
</div>
|
84
|
+
</body>
|
85
|
+
</html>
|
86
|
+
'
|
87
|
+
end
|
88
|
+
|
89
|
+
|
90
|
+
context 'when not given a page_url' do
|
91
|
+
before do
|
92
|
+
@md = Mida::Document.new(@html)
|
93
|
+
end
|
94
|
+
|
95
|
+
it 'should return all the properties and types with the correct values' do
|
96
|
+
expected_results = [
|
97
|
+
{ type: nil, id: nil, properties: {'link_field' => ['']} },
|
98
|
+
{ type: nil,
|
99
|
+
id: nil,
|
100
|
+
properties: {
|
101
|
+
'span_field' => ['Some span content'],
|
102
|
+
'dtreviewed' => ['2009-01-06'],
|
103
|
+
'meta_field' => ['Some meta content'],
|
104
|
+
'a_field1' => ['http://example.com'],
|
105
|
+
'a_field2' => [''],
|
106
|
+
'a_field3' => [''],
|
107
|
+
'a_field4' => [''],
|
108
|
+
'area_right' => [''],
|
109
|
+
'audio_field' => [''],
|
110
|
+
'embed_field' => [''],
|
111
|
+
'iframe_field' => ['http://www.example.com/iframe_test'],
|
112
|
+
'img_field' => [''],
|
113
|
+
'object_field' => [''],
|
114
|
+
'source_field' => [''],
|
115
|
+
'track_field' => [''],
|
116
|
+
'video_field' => ['']
|
117
|
+
}
|
118
|
+
}
|
119
|
+
]
|
120
|
+
|
121
|
+
test_parsing(@md, %r{}, expected_results)
|
122
|
+
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
context 'when given a page_url' do
|
127
|
+
before do
|
128
|
+
@md = Mida::Document.new(@html, 'http://example.com/start/')
|
129
|
+
end
|
130
|
+
|
131
|
+
it 'should return all the properties and types with the correct values' do
|
132
|
+
expected_results = [
|
133
|
+
{ type: nil, id: nil, properties: {
|
134
|
+
'link_field' => ['http://example.com/start/stylesheet.css']
|
135
|
+
}
|
136
|
+
},
|
137
|
+
{ type: nil,
|
138
|
+
id: nil,
|
139
|
+
properties: {
|
140
|
+
'span_field' => ['Some span content'],
|
141
|
+
'dtreviewed' => ['2009-01-06'],
|
142
|
+
'meta_field' => ['Some meta content'],
|
143
|
+
'a_field1' => ['http://example.com'],
|
144
|
+
'a_field2' => ['http://example.com/start/welcome/index.html'],
|
145
|
+
'a_field3' => ['http://example.com/intro'],
|
146
|
+
'a_field4' => ['http://example.com/intro/index.html'],
|
147
|
+
'area_right' => ['http://example.com/start/right.html'],
|
148
|
+
'audio_field' => ['http://example.com/start/asound.ogg'],
|
149
|
+
'embed_field' => ['http://example.com/start/helloworld.swf'],
|
150
|
+
'iframe_field' => ['http://www.example.com/iframe_test'],
|
151
|
+
'img_field' => ['http://example.com/start/animage.png'],
|
152
|
+
'object_field' => ['http://example.com/start/object.png'],
|
153
|
+
'source_field' => ['http://example.com/start/song.ogg'],
|
154
|
+
'track_field' => ['http://example.com/start/atrack.ogg'],
|
155
|
+
'video_field' => ['http://example.com/start/movie.ogg']
|
156
|
+
}
|
157
|
+
}
|
158
|
+
]
|
159
|
+
|
160
|
+
test_parsing(@md, %r{}, expected_results)
|
161
|
+
end
|
162
|
+
end
|
163
|
+
|
164
|
+
end
|
165
|
+
|
166
|
+
describe Mida::Document, 'when run against a full html document containing one itemscope with no itemtype' do
|
167
|
+
|
168
|
+
before do
|
169
|
+
html = '
|
170
|
+
<html><body>
|
171
|
+
There is some text here
|
172
|
+
<div>
|
173
|
+
and also some here
|
174
|
+
<div itemscope>
|
175
|
+
<span itemprop="itemreviewed">Romeo Pizza</span>
|
176
|
+
Reviewed by <span itemprop="reviewer">Ulysses Grant</span> on
|
177
|
+
<time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
|
178
|
+
<meta itemprop="fielda" content="a5482">
|
179
|
+
|
180
|
+
<span itemprop="summary">Delicious, tasty pizza in Eastlake!</span>
|
181
|
+
<span itemprop="description">This is a very nice pizza place.</span>
|
182
|
+
Rating: <span itemprop="rating">4.5</span>
|
183
|
+
</div>
|
184
|
+
</div>
|
185
|
+
</body></html>
|
186
|
+
'
|
187
|
+
@md = Mida::Document.new(html)
|
188
|
+
|
189
|
+
end
|
190
|
+
|
191
|
+
it_should_behave_like 'one root itemscope'
|
192
|
+
|
193
|
+
it 'should return all the properties and types with the correct values' do
|
194
|
+
expected_results = [{
|
195
|
+
type: nil,
|
196
|
+
id: nil,
|
197
|
+
properties: {
|
198
|
+
'itemreviewed' => ['Romeo Pizza'],
|
199
|
+
'reviewer' => ['Ulysses Grant'],
|
200
|
+
'dtreviewed' => ['2009-01-06'],
|
201
|
+
'fielda' => ['a5482'],
|
202
|
+
'summary' => ['Delicious, tasty pizza in Eastlake!'],
|
203
|
+
'description' => ['This is a very nice pizza place.'],
|
204
|
+
'rating' => ['4.5']
|
205
|
+
}
|
206
|
+
}]
|
207
|
+
|
208
|
+
test_parsing(@md, %r{}, expected_results)
|
209
|
+
end
|
210
|
+
|
211
|
+
end
|
212
|
+
|
213
|
+
describe Mida::Document, 'when run against a full html document containing one itemscope nested within another' do
|
214
|
+
|
215
|
+
before do
|
216
|
+
html = '
|
217
|
+
<html><body>
|
218
|
+
There is some text here
|
219
|
+
<div>
|
220
|
+
and also some here
|
221
|
+
<div itemscope>
|
222
|
+
<span itemprop="itemreviewed">Romeo Pizza</span>
|
223
|
+
<div itemprop="address" itemscope>
|
224
|
+
<span itemprop="firstline">237 Italian Way</span>
|
225
|
+
<span itemprop="country">United Kingdom</span>
|
226
|
+
</div>
|
227
|
+
Rating: <span itemprop="rating">4.5</span>
|
228
|
+
</div>
|
229
|
+
</div>
|
230
|
+
</body></html>
|
231
|
+
'
|
232
|
+
|
233
|
+
@md = Mida::Document.new(html)
|
234
|
+
|
235
|
+
end
|
236
|
+
|
237
|
+
it_should_behave_like 'one root itemscope'
|
238
|
+
|
239
|
+
it 'should return all the properties and types with the correct values' do
|
240
|
+
expected_results = [{
|
241
|
+
type: nil,
|
242
|
+
id: nil,
|
243
|
+
properties: {
|
244
|
+
'itemreviewed' => ['Romeo Pizza'],
|
245
|
+
'address' => [{
|
246
|
+
type: nil, id: nil, properties: {
|
247
|
+
'firstline' => ['237 Italian Way'],
|
248
|
+
'country' => ['United Kingdom']
|
249
|
+
}
|
250
|
+
}],
|
251
|
+
'rating' => ['4.5']
|
252
|
+
}
|
253
|
+
}]
|
254
|
+
|
255
|
+
test_parsing(@md, %r{}, expected_results)
|
256
|
+
end
|
257
|
+
|
258
|
+
end
|
259
|
+
|
260
|
+
describe Mida::Document, 'when run against a full html document containing one itemscope nested within another within another' do
|
261
|
+
|
262
|
+
before do
|
263
|
+
html = '
|
264
|
+
<html><body>
|
265
|
+
There is some text here
|
266
|
+
<div>
|
267
|
+
and also some here
|
268
|
+
<div itemscope>
|
269
|
+
<span itemprop="itemreviewed">Romeo Pizza</span>
|
270
|
+
<div itemprop="address" itemscope>
|
271
|
+
<div itemprop="firstline" itemscope>
|
272
|
+
<span itemprop="number">237</span>
|
273
|
+
<span itemprop="road">Italian Way</span>
|
274
|
+
</div>
|
275
|
+
<span itemprop="country">United Kingdom</span>
|
276
|
+
</div>
|
277
|
+
Rating: <span itemprop="rating">4.5</span>
|
278
|
+
</div>
|
279
|
+
</div>
|
280
|
+
</body></html>
|
281
|
+
'
|
282
|
+
|
283
|
+
@md = Mida::Document.new(html)
|
284
|
+
end
|
285
|
+
|
286
|
+
it_should_behave_like 'one root itemscope'
|
287
|
+
|
288
|
+
it 'should return all the properties and types with the correct values' do
|
289
|
+
expected_results = [{
|
290
|
+
type: nil,
|
291
|
+
id: nil,
|
292
|
+
properties: {
|
293
|
+
'itemreviewed' => ['Romeo Pizza'],
|
294
|
+
'address' => [{
|
295
|
+
type: nil,
|
296
|
+
id: nil,
|
297
|
+
properties: {
|
298
|
+
'firstline' => [{
|
299
|
+
type: nil,
|
300
|
+
id: nil,
|
301
|
+
properties: {
|
302
|
+
'number' => ['237'],
|
303
|
+
'road' => ['Italian Way']
|
304
|
+
},
|
305
|
+
}],
|
306
|
+
'country' => ['United Kingdom']
|
307
|
+
},
|
308
|
+
}],
|
309
|
+
'rating' => ['4.5']
|
310
|
+
}
|
311
|
+
}]
|
312
|
+
|
313
|
+
test_parsing(@md, %r{^$}, expected_results)
|
314
|
+
end
|
315
|
+
|
316
|
+
end
|
317
|
+
|
318
|
+
describe Mida::Document, 'when run against a full html document containing one itemscope with an itemtype' do
|
319
|
+
|
320
|
+
before do
|
321
|
+
html = '
|
322
|
+
<html><body>
|
323
|
+
There is some text here
|
324
|
+
<div>
|
325
|
+
and also some here
|
326
|
+
<div itemscope itemtype="http://data-vocabulary.org/Review">
|
327
|
+
<span itemprop="itemreviewed">Romeo Pizza</span>
|
328
|
+
Reviewed by <span itemprop="reviewer">Ulysses Grant</span> on
|
329
|
+
<time itemprop="dtreviewed" datetime="2009-01-06">Jan 6</time>.
|
330
|
+
<span itemprop="summary">Delicious, tasty pizza in Eastlake!</span>
|
331
|
+
<span itemprop="description">This is a very nice pizza place.</span>
|
332
|
+
Rating: <span itemprop="rating">4.5</span>
|
333
|
+
</div>
|
334
|
+
</div>
|
335
|
+
</body></html>
|
336
|
+
'
|
337
|
+
|
338
|
+
@md = Mida::Document.new(html)
|
339
|
+
|
340
|
+
end
|
341
|
+
|
342
|
+
it_should_behave_like 'one root itemscope'
|
343
|
+
|
344
|
+
it 'should find the correct number of itemscopes if outer specified' do
|
345
|
+
@md.search(%r{http://data-vocabulary.org/Review}).size.should == 1
|
346
|
+
end
|
347
|
+
|
348
|
+
it 'should specify the correct type' do
|
349
|
+
@md.search(%r{http://data-vocabulary.org/Review}).first.type.should == 'http://data-vocabulary.org/Review'
|
350
|
+
end
|
351
|
+
|
352
|
+
it 'should return all the properties and types with the correct values' do
|
353
|
+
expected_results = [{
|
354
|
+
type: 'http://data-vocabulary.org/Review',
|
355
|
+
id: nil,
|
356
|
+
properties: {
|
357
|
+
'itemreviewed' => ['Romeo Pizza'],
|
358
|
+
'reviewer' => ['Ulysses Grant'],
|
359
|
+
'dtreviewed' => ['2009-01-06'],
|
360
|
+
'summary' => ['Delicious, tasty pizza in Eastlake!'],
|
361
|
+
'description' => ['This is a very nice pizza place.'],
|
362
|
+
'rating' => ['4.5']
|
363
|
+
}
|
364
|
+
}]
|
365
|
+
test_parsing(@md, %r{http://data-vocabulary.org/Review}, expected_results)
|
366
|
+
end
|
367
|
+
|
368
|
+
end
|
369
|
+
|
370
|
+
describe Mida::Document, 'when run against a full html document containing two non-nested itemscopes with itemtypes' do
|
371
|
+
|
372
|
+
before do
|
373
|
+
html = '
|
374
|
+
<html><body>
|
375
|
+
There is some text here
|
376
|
+
<div>
|
377
|
+
and also some here
|
378
|
+
<div itemscope itemtype="http://data-vocabulary.org/Review">
|
379
|
+
<span itemprop="itemreviewed">Romeo Pizza</span>
|
380
|
+
Rating: <span itemprop="rating">4.5</span>
|
381
|
+
</div>
|
382
|
+
<div itemscope itemtype="http://data-vocabulary.org/Organization">
|
383
|
+
<span itemprop="name">An org name</span>
|
384
|
+
<span itemprop="url">http://example.com</span>
|
385
|
+
</div>
|
386
|
+
</div>
|
387
|
+
</body></html>
|
388
|
+
'
|
389
|
+
|
390
|
+
@md = Mida::Document.new(html)
|
391
|
+
|
392
|
+
end
|
393
|
+
|
394
|
+
it 'should return all the itemscopes' do
|
395
|
+
@md.items.size.should == 2
|
396
|
+
end
|
397
|
+
|
398
|
+
it 'should give the type of each itemscope if none specified' do
|
399
|
+
itemscope_names = {
|
400
|
+
'http://data-vocabulary.org/Review' => 0,
|
401
|
+
'http://data-vocabulary.org/Organization' => 0
|
402
|
+
}
|
403
|
+
|
404
|
+
@md.items.each do |item|
|
405
|
+
itemscope_names[item.type] += 1
|
406
|
+
end
|
407
|
+
|
408
|
+
itemscope_names.size.should eq 2
|
409
|
+
itemscope_names.each { |name, num| num.should == 1 }
|
410
|
+
end
|
411
|
+
|
412
|
+
|
413
|
+
it 'should return all the properties and types with the correct values for 1st itemscope' do
|
414
|
+
expected_results = [{
|
415
|
+
type: 'http://data-vocabulary.org/Review',
|
416
|
+
id: nil,
|
417
|
+
properties: {
|
418
|
+
'itemreviewed' => ['Romeo Pizza'],
|
419
|
+
'rating' => ['4.5']
|
420
|
+
}
|
421
|
+
}]
|
422
|
+
test_parsing(@md, %r{http://data-vocabulary.org/Review}, expected_results)
|
423
|
+
end
|
424
|
+
|
425
|
+
it 'should return all the properties from the text for 2nd itemscope' do
|
426
|
+
expected_results = [{
|
427
|
+
type: 'http://data-vocabulary.org/Organization',
|
428
|
+
id: nil,
|
429
|
+
properties: {
|
430
|
+
'name' => ['An org name'],
|
431
|
+
'url' => ['http://example.com']
|
432
|
+
}
|
433
|
+
}]
|
434
|
+
test_parsing(@md, %r{http://data-vocabulary.org/Organization}, expected_results)
|
435
|
+
end
|
436
|
+
|
437
|
+
end
|
438
|
+
|
439
|
+
describe Mida::Document, 'when run against a full html document containing one
|
440
|
+
itemscope nested within another and the inner block is
|
441
|
+
surrounded with another non itemscope block' do
|
442
|
+
|
443
|
+
before do
|
444
|
+
html = '
|
445
|
+
<html><body>
|
446
|
+
<div itemscope itemtype="http://data-vocabulary.org/Product">
|
447
|
+
<ul class="reviews">
|
448
|
+
<li id="model" itemprop="name">DC07</li>
|
449
|
+
<li id="make" itemprop="brand">Dyson</li>
|
450
|
+
<li itemprop="review" itemscope itemtype="http://data-vocabulary.org/Review-aggregate">
|
451
|
+
<span class="ratingDetails">
|
452
|
+
<span itemprop="count">1</span> Review,
|
453
|
+
Average: <span itemprop="rating">5.0</span>
|
454
|
+
</span>
|
455
|
+
</li>
|
456
|
+
</ul>
|
457
|
+
</div>
|
458
|
+
</body></html>
|
459
|
+
'
|
460
|
+
|
461
|
+
@md = Mida::Document.new(html)
|
462
|
+
end
|
463
|
+
|
464
|
+
it_should_behave_like 'one root itemscope'
|
465
|
+
|
466
|
+
it 'should return the correct number of itemscopes' do
|
467
|
+
vocabularies = [
|
468
|
+
%r{http://data-vocabulary.org/Product},
|
469
|
+
%r{http://data-vocabulary.org/Review-aggregate}
|
470
|
+
]
|
471
|
+
vocabularies.each {|vocabulary| @md.search(vocabulary).size.should == 1}
|
472
|
+
end
|
473
|
+
|
474
|
+
context "when looking at the outer vocabulary" do
|
475
|
+
it 'should return all the properties from the text with the correct values' do
|
476
|
+
expected_results = [{
|
477
|
+
type: 'http://data-vocabulary.org/Product',
|
478
|
+
id: nil,
|
479
|
+
properties: {
|
480
|
+
'name' => ['DC07'],
|
481
|
+
'brand' => ['Dyson'],
|
482
|
+
'review' => [{
|
483
|
+
type: 'http://data-vocabulary.org/Review-aggregate',
|
484
|
+
id: nil,
|
485
|
+
properties: {
|
486
|
+
'count' => ['1'],
|
487
|
+
'rating' => ['5.0']
|
488
|
+
}
|
489
|
+
}]
|
490
|
+
}
|
491
|
+
}]
|
492
|
+
|
493
|
+
test_parsing(@md, %r{http://data-vocabulary.org/Product}, expected_results)
|
494
|
+
end
|
495
|
+
end
|
496
|
+
|
497
|
+
end
|
498
|
+
|
499
|
+
# Fixture: a Product itemscope whose markup physically contains a
# Review-aggregate itemscope that carries no itemprop, i.e. it is nested in
# the DOM but not linked to the Product as a property.
describe Mida::Document, 'when run against a document containing an itemscope
  that contains another non-linked itemscope' do

  before do
    markup = '
      <html><body>
        <div itemscope itemtype="http://data-vocabulary.org/Product">
          <ul class="reviews">
            <li id="model" itemprop="name">DC07</li>
            <li id="make" itemprop="brand">Dyson</li>
            <li itemscope itemtype="http://data-vocabulary.org/Review-aggregate">
              <span class="ratingDetails">
                <span itemprop="count">1</span> Review,
                Average: <span itemprop="rating">5.0</span>
              </span>
            </li>
          </ul>
        </div>
      </body></html>
    '

    @md = Mida::Document.new(markup)
  end

  it 'should return the correct number of itemscopes when search used' do
    # Maps each search pattern to the number of items it should match.
    expected_counts = {
      %r{} => 2,
      %r{http://data-vocabulary.org/Product} => 1,
      %r{http://data-vocabulary.org/Review-aggregate} => 1
    }

    expected_counts.each do |vocabulary, count|
      @md.search(vocabulary).size.should == count
    end
  end

  it 'should return the correct number of items' do
    @md.items.size.should == 2
  end

  context "when no vocabulary specified or looking at the outer vocabulary" do
    it 'should return all the properties from the text with the correct values' do
      pending("get the contains: feature working")

      # Describes the intended shape once a contains: key is supported.
      # NOTE(review): this passes a String to search and compares against
      # scalar property values, unlike every other example in this file,
      # which uses a Regexp and array values - confirm before un-pending.
      contained_review = {
        type: 'http://data-vocabulary.org/Review-aggregate',
        id: nil,
        properties: {
          'count' => '1',
          'rating' => '5.0'
        }
      }

      expected_result = {
        type: 'http://data-vocabulary.org/Product',
        id: nil,
        properties: {
          'name' => 'DC07',
          'brand' => 'Dyson'
        },
        contains: contained_review
      }

      @md.search('http://data-vocabulary.org/Product').first.should == expected_result
    end
  end
end
|
560
|
+
|
561
|
+
# Fixture: the top-level item (#amanda) uses itemref to pull in properties
# from #a and #b; #b is itself an itemscope exposed as the 'band' property
# and in turn references #c for its own properties.
describe Mida::Document, 'when run against a document using itemrefs' do

  before do
    markup = '
      <html><body>
        <div itemscope id="amanda" itemref="a b">
          <span itemprop="age">30</span>
        </div>
        <p id="a">Name: <span itemprop="name">Amanda</span></p>
        <div id="b" itemprop="band" itemscope itemref="c"></div>
        <div id="c">
          <p>Band: <span itemprop="name">Jazz Band</span></p>
          <p>Size: <span itemprop="size">12</span> players</p>
        </div>
      </body></html>
    '

    @md = Mida::Document.new(markup)
  end

  it 'should return all the properties from the text with the correct values' do
    # The nested item reached through itemprop="band".
    band = {
      type: nil,
      id: nil,
      properties: {
        'name' => ['Jazz Band'],
        'size' => ['12']
      }
    }

    amanda = {
      type: nil,
      id: nil,
      properties: {
        'name' => ['Amanda'],
        'band' => [band],
        'age' => ['30']
      }
    }

    test_parsing(@md, %r{}, [amanda])
  end
end
|
602
|
+
|
603
|
+
# Fixture: one item declares the 'flavour' property three times - twice as
# plain text and once as a nested itemscope - so the property should collect
# all three values.
describe Mida::Document, 'when run against a document using multiple itemprops with the same name' do

  before do
    markup = '
      <html><body>
        <div itemscope itemtype="icecreams">
          <p>Flavours in my favourite ice cream:</p>
          <ul>
            <li itemprop="flavour">Lemon sorbet</li>
            <li itemprop="flavour">Apricot sorbet</li>
            <li itemprop="flavour" itemscope itemtype="icecream-type">
              <span itemprop="fruit">Strawberry</span>
              <span itemprop="style">Homemade</span>
            </li>
          </ul>
        </div>
      </body></html>
    '

    @md = Mida::Document.new(markup)
  end

  it_should_behave_like 'one root itemscope'

  it 'should return the correct number of itemscopes' do
    # Each type appears exactly once in the fixture.
    [%r{icecreams}, %r{icecream-type}].each do |vocabulary|
      @md.search(vocabulary).size.should == 1
    end
  end

  it 'should return all the properties from the text with the correct values' do
    # The third 'flavour' value is an item rather than a string.
    strawberry = {
      type: 'icecream-type',
      id: nil,
      properties: {
        'fruit' => ['Strawberry'],
        'style' => ['Homemade']
      }
    }

    icecreams = {
      type: 'icecreams',
      id: nil,
      properties: {
        'flavour' => ['Lemon sorbet', 'Apricot sorbet', strawberry]
      }
    }

    test_parsing(@md, %r{icecreams}, [icecreams])
  end
end
|
657
|
+
|
658
|
+
# Fixture: a single element whose itemprop attribute lists two property
# names; both properties are expected to receive the element's value.
describe Mida::Document, 'when run against a document using an itemprop with multiple properties' do

  before do
    markup = '
      <html><body>
        <div itemscope>
          <span itemprop="favourite-colour favourite-fruit">orange</span>
        </div>
      </body></html>
    '

    @md = Mida::Document.new(markup)
  end

  it 'should return all the properties from the text with the correct values' do
    untyped_item = {
      type: nil,
      id: nil,
      properties: {
        'favourite-colour' => ['orange'],
        'favourite-fruit' => ['orange']
      }
    }

    test_parsing(@md, %r{}, [untyped_item])
  end
end
|