hypermicrodata 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/README.md +14 -2
- data/hypermicrodata.gemspec +1 -0
- data/lib/hypermicrodata.rb +1 -1
- data/lib/hypermicrodata/document.rb +20 -8
- data/lib/hypermicrodata/item.rb +66 -14
- data/lib/hypermicrodata/itemprop_parser.rb +4 -4
- data/lib/hypermicrodata/submit_button.rb +6 -28
- data/lib/hypermicrodata/version.rb +1 -1
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bc45691f143d5cfcd20199463e9b549565c04b69
|
4
|
+
data.tar.gz: 2f0c44acf87fa98cc606bf3f292d3c940a2cbf57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2c3b43e8c4ca94990f23827ccd3cb4111e7f98143c5291bb166a4bb3c621dc65065184e826caac6d792ec5b504d0f141dbc5cf6cb9956a90314ad5deb625c824
|
7
|
+
data.tar.gz: 71df574ab4d757fed7e67e39f3d6e88afdf76a4f416955ebce4c4884d7ca271e4c9cd6f713d5072bad004c154121116034efa1514bed3f7236ce4f42eae02e2a
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -42,6 +42,14 @@ Supported formats are
|
|
42
42
|
|
43
43
|
When you use this in Rails, you don't need to extract data manually.
|
44
44
|
|
45
|
+
/config/mime_types.rb
|
46
|
+
|
47
|
+
```
|
48
|
+
Mime::Type.register 'application/vnd.amundsen-uber+json', :uberjson
|
49
|
+
# or if you want HAL
|
50
|
+
Mime::Type.register 'application/hal+json', :haljson
|
51
|
+
```
|
52
|
+
|
45
53
|
/app/controllers/people_controller.rb
|
46
54
|
|
47
55
|
```
|
@@ -55,8 +63,7 @@ end
|
|
55
63
|
/app/views/people/show.html.haml
|
56
64
|
|
57
65
|
```
|
58
|
-
.person{itemscope: true, itemtype: 'http://schema.org/Person',
|
59
|
-
itemid: person_url(@person), data: {main_item: true}}
|
66
|
+
%main.person{itemscope: true, itemtype: 'http://schema.org/Person', itemid: person_url(@person)}
|
60
67
|
.media
|
61
68
|
.media-image.pull-left
|
62
69
|
= image_tag @person.picture_path, alt: '', itemprop: 'image'
|
@@ -66,6 +73,11 @@ end
|
|
66
73
|
= link_to 'collection', people_path, rel: 'collection', itemprop: 'isPartOf'
|
67
74
|
```
|
68
75
|
|
76
|
+
`<main>` elements are considered the root nodes for extraction into JSON.
|
77
|
+
|
78
|
+
If you don't want to use `<main>`, you can use elements with a `data-main-item` attribute instead.
|
79
|
+
|
80
|
+
|
69
81
|
And you can serve the following JSON:
|
70
82
|
|
71
83
|
```
|
data/hypermicrodata.gemspec
CHANGED
@@ -22,6 +22,7 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.add_dependency "mechanize"
|
23
23
|
spec.add_dependency "halibut"
|
24
24
|
spec.add_dependency "multi_json"
|
25
|
+
spec.add_dependency "addressable"
|
25
26
|
|
26
27
|
spec.add_development_dependency "bundler", "~> 1.3"
|
27
28
|
spec.add_development_dependency "rake"
|
data/lib/hypermicrodata.rb
CHANGED
@@ -12,17 +12,29 @@ module Hypermicrodata
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def extract_items
|
15
|
-
itemscopes = []
|
16
|
-
if @filter_xpath_attr
|
17
|
-
itemscopes = @doc.xpath("//*[#{@filter_xpath_attr} and @itemscope]")
|
18
|
-
puts "XPath //*[#{@filter_xpath_attr}] is not found. root node is used." if itemscopes.empty?
|
19
|
-
end
|
20
|
-
itemscopes = @doc.xpath('self::*[@itemscope] | .//*[@itemscope and not(@itemprop)]') if itemscopes.empty?
|
21
|
-
|
22
15
|
itemscopes.collect do |itemscope|
|
23
|
-
Item.
|
16
|
+
Item.parse(itemscope, @page_url)
|
24
17
|
end
|
25
18
|
end
|
26
19
|
|
20
|
+
private
|
21
|
+
|
22
|
+
def itemscopes
|
23
|
+
items_xpath = 'self::*[@itemscope] | .//*[@itemscope and not(@itemprop)] | .//form[not(@itemprop)]'
|
24
|
+
if @filter_xpath_attr
|
25
|
+
filtered_doc = @doc.xpath("//*[#{@filter_xpath_attr}]")
|
26
|
+
unless filtered_doc.empty?
|
27
|
+
return filtered_doc.xpath(items_xpath)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
print "XPath //*[#{@filter_xpath_attr}] is not found. "
|
31
|
+
filtered_doc = @doc.xpath('//main')
|
32
|
+
unless filtered_doc.empty?
|
33
|
+
print "main node is used.\n"
|
34
|
+
return filtered_doc.xpath(items_xpath)
|
35
|
+
end
|
36
|
+
print "root node is used.\n"
|
37
|
+
@doc.xpath(items_xpath)
|
38
|
+
end
|
27
39
|
end
|
28
40
|
end
|
data/lib/hypermicrodata/item.rb
CHANGED
@@ -2,6 +2,14 @@ module Hypermicrodata
|
|
2
2
|
class Item
|
3
3
|
attr_reader :type, :properties, :links, :id
|
4
4
|
|
5
|
+
def self.parse(top_node, page_url)
|
6
|
+
if top_node.name == 'form'
|
7
|
+
FormItem.new(top_node, page_url)
|
8
|
+
else
|
9
|
+
Item.new(top_node, page_url)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
5
13
|
def initialize(top_node, page_url)
|
6
14
|
@top_node = top_node
|
7
15
|
@type = extract_itemtype
|
@@ -68,8 +76,7 @@ module Hypermicrodata
|
|
68
76
|
itemscope = element.attribute('itemscope')
|
69
77
|
itemprop = element.attribute('itemprop')
|
70
78
|
internal_elements = extract_elements(element)
|
71
|
-
add_itemprop(element) if
|
72
|
-
add_form(element) if element.name == 'form'
|
79
|
+
add_itemprop(element) if itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name)
|
73
80
|
parse_elements(internal_elements) if internal_elements && !itemscope
|
74
81
|
end
|
75
82
|
|
@@ -77,7 +84,10 @@ module Hypermicrodata
|
|
77
84
|
def add_itemprop(element)
|
78
85
|
property = ItempropParser.parse(element, @page_url)
|
79
86
|
if property.link? && property.names.empty? && property.rels.empty?
|
80
|
-
|
87
|
+
href = property.value.to_s.strip
|
88
|
+
unless href.empty? || href == '#' # href which doesn't work as link is ignored
|
89
|
+
(@links[element.name] ||= []) << property
|
90
|
+
end
|
81
91
|
else
|
82
92
|
property.names.each { |name| (@properties[name] ||= []) << property }
|
83
93
|
property.rels.each { |rel| (@links[rel] ||= []) << property }
|
@@ -92,22 +102,64 @@ module Hypermicrodata
|
|
92
102
|
end
|
93
103
|
end
|
94
104
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
105
|
+
# Find an element with a matching id
|
106
|
+
def find_with_id(id)
|
107
|
+
@top_node.search("//*[@id='#{id}']")
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
class FormItem < Item
|
112
|
+
attr_reader :submit_buttons
|
113
|
+
|
114
|
+
def initialize(top_node, page_url)
|
115
|
+
form = Mechanize::Form.new(top_node)
|
116
|
+
@submit_buttons = form.submits.map do |button|
|
117
|
+
SubmitButton.new(button, form)
|
118
|
+
end
|
119
|
+
super
|
120
|
+
end
|
121
|
+
|
122
|
+
private
|
123
|
+
|
124
|
+
def extract_itemtype
|
125
|
+
super || ['http://schema.org/Action']
|
126
|
+
end
|
127
|
+
|
128
|
+
# TODO: Make it DRY
|
129
|
+
def parse_element(element)
|
130
|
+
itemscope = element.attribute('itemscope')
|
131
|
+
itemprop = element.attribute('itemprop')
|
132
|
+
internal_elements = extract_elements(element)
|
133
|
+
add_itemprop(element) if itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name) || submit_button_include?(element)
|
134
|
+
parse_elements(internal_elements) if internal_elements && !itemscope
|
135
|
+
end
|
136
|
+
|
137
|
+
def add_itemprop(element)
|
138
|
+
return super unless submit_button_include?(element)
|
139
|
+
property = @submit_buttons.find {|b| b.node == element }
|
140
|
+
if property.names.empty? && property.rels.empty?
|
141
|
+
href = property.value.to_s.strip
|
142
|
+
unless href.empty? || href == '#' # href which doesn't work as link is ignored
|
143
|
+
(@links[element.name] ||= []) << property
|
103
144
|
end
|
145
|
+
else
|
146
|
+
property.names.each { |name| (@properties[name] ||= []) << property }
|
147
|
+
property.rels.each { |rel| (@links[rel] ||= []) << property }
|
104
148
|
end
|
105
149
|
end
|
106
150
|
|
107
|
-
|
108
|
-
|
109
|
-
@top_node.search("//*[@id='#{id}']")
|
151
|
+
def submit_button_include?(element)
|
152
|
+
@submit_buttons.any? {|b| b.node == element }
|
110
153
|
end
|
154
|
+
end
|
155
|
+
end
|
111
156
|
|
157
|
+
# Patch for bug
|
158
|
+
Mechanize::Form.class_eval do
|
159
|
+
# Returns all buttons of type Submit
|
160
|
+
def submits
|
161
|
+
@submits ||= buttons.select {|f|
|
162
|
+
f.class == Mechanize::Form::Submit || (f.class == Mechanize::Form::Button && (f.type.nil? || f.type == 'submit'))
|
163
|
+
}
|
112
164
|
end
|
113
165
|
end
|
@@ -65,10 +65,10 @@ module Hypermicrodata
|
|
65
65
|
# This returns an empty string if can't form a valid
|
66
66
|
# absolute url as per the Microdata spec.
|
67
67
|
def make_absolute_url(url)
|
68
|
-
return url unless URI.parse(url).relative?
|
68
|
+
return url unless Addressable::URI.parse(url).relative?
|
69
69
|
begin
|
70
|
-
URI.parse(@page_url).merge(url).to_s
|
71
|
-
rescue
|
70
|
+
Addressable::URI.parse(@page_url).merge(url).to_s
|
71
|
+
rescue
|
72
72
|
url
|
73
73
|
end
|
74
74
|
end
|
@@ -104,7 +104,7 @@ module Hypermicrodata
|
|
104
104
|
|
105
105
|
def extract_property
|
106
106
|
if @element.attribute('itemscope')
|
107
|
-
Item.
|
107
|
+
Item.parse(@element, @page_url)
|
108
108
|
else
|
109
109
|
extract_property_value
|
110
110
|
end
|
@@ -26,11 +26,11 @@ module Hypermicrodata
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def names
|
29
|
-
(
|
29
|
+
(node['itemprop'] || '').split(' ')
|
30
30
|
end
|
31
31
|
|
32
32
|
def rels
|
33
|
-
rel = (
|
33
|
+
rel = (node['rel'] || node['data-rel'] || @button.dom_class || '')
|
34
34
|
rel.split(' ')
|
35
35
|
end
|
36
36
|
|
@@ -46,6 +46,10 @@ module Hypermicrodata
|
|
46
46
|
true
|
47
47
|
end
|
48
48
|
|
49
|
+
def node
|
50
|
+
@button.node
|
51
|
+
end
|
52
|
+
|
49
53
|
private
|
50
54
|
def setup!
|
51
55
|
if method_field = @form.fields.find { |f| f.name == '_method' }
|
@@ -76,30 +80,4 @@ module Hypermicrodata
|
|
76
80
|
end.compact.join('&')
|
77
81
|
end
|
78
82
|
end
|
79
|
-
|
80
|
-
class FormParser
|
81
|
-
attr_reader :submit_buttons
|
82
|
-
|
83
|
-
def initialize(element, page_url = nil)
|
84
|
-
@element, @page_url = element, page_url
|
85
|
-
form = Mechanize::Form.new(element)
|
86
|
-
@submit_buttons = form.submits.map do |button|
|
87
|
-
SubmitButton.new(button, form)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def self.parse(element, page_url = nil)
|
92
|
-
self.new(element, page_url).submit_buttons
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
# Patch for bug
|
98
|
-
Mechanize::Form.class_eval do
|
99
|
-
# Returns all buttons of type Submit
|
100
|
-
def submits
|
101
|
-
@submits ||= buttons.select {|f|
|
102
|
-
f.class == Mechanize::Form::Submit || (f.class == Mechanize::Form::Button && (f.type.nil? || f.type == 'submit'))
|
103
|
-
}
|
104
|
-
end
|
105
83
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hypermicrodata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jason Ronallo
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2015-01-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -67,6 +67,20 @@ dependencies:
|
|
67
67
|
- - '>='
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '0'
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: addressable
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
type: :runtime
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - '>='
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
70
84
|
- !ruby/object:Gem::Dependency
|
71
85
|
name: bundler
|
72
86
|
requirement: !ruby/object:Gem::Requirement
|
@@ -154,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
168
|
version: '0'
|
155
169
|
requirements: []
|
156
170
|
rubyforge_project:
|
157
|
-
rubygems_version: 2.4.
|
171
|
+
rubygems_version: 2.4.3
|
158
172
|
signing_key:
|
159
173
|
specification_version: 4
|
160
174
|
summary: Ruby library for extracting HTML5 Microdata with Hypermedia
|