hypermicrodata 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -1
- data/README.md +14 -2
- data/hypermicrodata.gemspec +1 -0
- data/lib/hypermicrodata.rb +1 -1
- data/lib/hypermicrodata/document.rb +20 -8
- data/lib/hypermicrodata/item.rb +66 -14
- data/lib/hypermicrodata/itemprop_parser.rb +4 -4
- data/lib/hypermicrodata/submit_button.rb +6 -28
- data/lib/hypermicrodata/version.rb +1 -1
- metadata +17 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: bc45691f143d5cfcd20199463e9b549565c04b69
|
4
|
+
data.tar.gz: 2f0c44acf87fa98cc606bf3f292d3c940a2cbf57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2c3b43e8c4ca94990f23827ccd3cb4111e7f98143c5291bb166a4bb3c621dc65065184e826caac6d792ec5b504d0f141dbc5cf6cb9956a90314ad5deb625c824
|
7
|
+
data.tar.gz: 71df574ab4d757fed7e67e39f3d6e88afdf76a4f416955ebce4c4884d7ca271e4c9cd6f713d5072bad004c154121116034efa1514bed3f7236ce4f42eae02e2a
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -42,6 +42,14 @@ Supported formats are
|
|
42
42
|
|
43
43
|
When you use this in Rails, you don't need to extract data manually.
|
44
44
|
|
45
|
+
/config/mime_types.rb
|
46
|
+
|
47
|
+
```
|
48
|
+
Mime::Type.register 'application/vnd.amundsen-uber+json', :uberjson
|
49
|
+
# or if you want HAL
|
50
|
+
Mime::Type.register 'application/hal+json', :haljson
|
51
|
+
```
|
52
|
+
|
45
53
|
/app/controllers/people_controller.rb
|
46
54
|
|
47
55
|
```
|
@@ -55,8 +63,7 @@ end
|
|
55
63
|
/app/views/people/show.html.haml
|
56
64
|
|
57
65
|
```
|
58
|
-
.person{itemscope: true, itemtype: 'http://schema.org/Person',
|
59
|
-
itemid: person_url(@person), data: {main_item: true}}
|
66
|
+
%main.person{itemscope: true, itemtype: 'http://schema.org/Person', itemid: person_url(@person)}
|
60
67
|
.media
|
61
68
|
.media-image.pull-left
|
62
69
|
= image_tag @person.picture_path, alt: '', itemprop: 'image'
|
@@ -66,6 +73,11 @@ end
|
|
66
73
|
= link_to 'collection', people_path, rel: 'collection', itemprop: 'isPartOf'
|
67
74
|
```
|
68
75
|
|
76
|
+
`<main>` elements are considered the root nodes for the extraction into JSON.
|
77
|
+
|
78
|
+
If you don't want to use `<main>`, you can use elements with the `data-main-item` attribute instead.
|
79
|
+
|
80
|
+
|
69
81
|
And you can serve the following JSON:
|
70
82
|
|
71
83
|
```
|
data/hypermicrodata.gemspec
CHANGED
@@ -22,6 +22,7 @@ Gem::Specification.new do |spec|
|
|
22
22
|
spec.add_dependency "mechanize"
|
23
23
|
spec.add_dependency "halibut"
|
24
24
|
spec.add_dependency "multi_json"
|
25
|
+
spec.add_dependency "addressable"
|
25
26
|
|
26
27
|
spec.add_development_dependency "bundler", "~> 1.3"
|
27
28
|
spec.add_development_dependency "rake"
|
data/lib/hypermicrodata.rb
CHANGED
@@ -12,17 +12,29 @@ module Hypermicrodata
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def extract_items
|
15
|
-
itemscopes = []
|
16
|
-
if @filter_xpath_attr
|
17
|
-
itemscopes = @doc.xpath("//*[#{@filter_xpath_attr} and @itemscope]")
|
18
|
-
puts "XPath //*[#{@filter_xpath_attr}] is not found. root node is used." if itemscopes.empty?
|
19
|
-
end
|
20
|
-
itemscopes = @doc.xpath('self::*[@itemscope] | .//*[@itemscope and not(@itemprop)]') if itemscopes.empty?
|
21
|
-
|
22
15
|
itemscopes.collect do |itemscope|
|
23
|
-
Item.
|
16
|
+
Item.parse(itemscope, @page_url)
|
24
17
|
end
|
25
18
|
end
|
26
19
|
|
20
|
+
private
|
21
|
+
|
22
|
+
def itemscopes
|
23
|
+
items_xpath = 'self::*[@itemscope] | .//*[@itemscope and not(@itemprop)] | .//form[not(@itemprop)]'
|
24
|
+
if @filter_xpath_attr
|
25
|
+
filtered_doc = @doc.xpath("//*[#{@filter_xpath_attr}]")
|
26
|
+
unless filtered_doc.empty?
|
27
|
+
return filtered_doc.xpath(items_xpath)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
print "XPath //*[#{@filter_xpath_attr}] is not found. "
|
31
|
+
filtered_doc = @doc.xpath('//main')
|
32
|
+
unless filtered_doc.empty?
|
33
|
+
print "main node is used.\n"
|
34
|
+
return filtered_doc.xpath(items_xpath)
|
35
|
+
end
|
36
|
+
print "root node is used.\n"
|
37
|
+
@doc.xpath(items_xpath)
|
38
|
+
end
|
27
39
|
end
|
28
40
|
end
|
data/lib/hypermicrodata/item.rb
CHANGED
@@ -2,6 +2,14 @@ module Hypermicrodata
|
|
2
2
|
class Item
|
3
3
|
attr_reader :type, :properties, :links, :id
|
4
4
|
|
5
|
+
def self.parse(top_node, page_url)
|
6
|
+
if top_node.name == 'form'
|
7
|
+
FormItem.new(top_node, page_url)
|
8
|
+
else
|
9
|
+
Item.new(top_node, page_url)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
5
13
|
def initialize(top_node, page_url)
|
6
14
|
@top_node = top_node
|
7
15
|
@type = extract_itemtype
|
@@ -68,8 +76,7 @@ module Hypermicrodata
|
|
68
76
|
itemscope = element.attribute('itemscope')
|
69
77
|
itemprop = element.attribute('itemprop')
|
70
78
|
internal_elements = extract_elements(element)
|
71
|
-
add_itemprop(element) if
|
72
|
-
add_form(element) if element.name == 'form'
|
79
|
+
add_itemprop(element) if itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name)
|
73
80
|
parse_elements(internal_elements) if internal_elements && !itemscope
|
74
81
|
end
|
75
82
|
|
@@ -77,7 +84,10 @@ module Hypermicrodata
|
|
77
84
|
def add_itemprop(element)
|
78
85
|
property = ItempropParser.parse(element, @page_url)
|
79
86
|
if property.link? && property.names.empty? && property.rels.empty?
|
80
|
-
|
87
|
+
href = property.value.to_s.strip
|
88
|
+
unless href.empty? || href == '#' # href which doesn't work as link is ignored
|
89
|
+
(@links[element.name] ||= []) << property
|
90
|
+
end
|
81
91
|
else
|
82
92
|
property.names.each { |name| (@properties[name] ||= []) << property }
|
83
93
|
property.rels.each { |rel| (@links[rel] ||= []) << property }
|
@@ -92,22 +102,64 @@ module Hypermicrodata
|
|
92
102
|
end
|
93
103
|
end
|
94
104
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
105
|
+
# Find an element with a matching id
|
106
|
+
def find_with_id(id)
|
107
|
+
@top_node.search("//*[@id='#{id}']")
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
class FormItem < Item
|
112
|
+
attr_reader :submit_buttons
|
113
|
+
|
114
|
+
def initialize(top_node, page_url)
|
115
|
+
form = Mechanize::Form.new(top_node)
|
116
|
+
@submit_buttons = form.submits.map do |button|
|
117
|
+
SubmitButton.new(button, form)
|
118
|
+
end
|
119
|
+
super
|
120
|
+
end
|
121
|
+
|
122
|
+
private
|
123
|
+
|
124
|
+
def extract_itemtype
|
125
|
+
super || ['http://schema.org/Action']
|
126
|
+
end
|
127
|
+
|
128
|
+
# TODO: Make it DRY
|
129
|
+
def parse_element(element)
|
130
|
+
itemscope = element.attribute('itemscope')
|
131
|
+
itemprop = element.attribute('itemprop')
|
132
|
+
internal_elements = extract_elements(element)
|
133
|
+
add_itemprop(element) if itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name) || submit_button_include?(element)
|
134
|
+
parse_elements(internal_elements) if internal_elements && !itemscope
|
135
|
+
end
|
136
|
+
|
137
|
+
def add_itemprop(element)
|
138
|
+
return super unless submit_button_include?(element)
|
139
|
+
property = @submit_buttons.find {|b| b.node == element }
|
140
|
+
if property.names.empty? && property.rels.empty?
|
141
|
+
href = property.value.to_s.strip
|
142
|
+
unless href.empty? || href == '#' # href which doesn't work as link is ignored
|
143
|
+
(@links[element.name] ||= []) << property
|
103
144
|
end
|
145
|
+
else
|
146
|
+
property.names.each { |name| (@properties[name] ||= []) << property }
|
147
|
+
property.rels.each { |rel| (@links[rel] ||= []) << property }
|
104
148
|
end
|
105
149
|
end
|
106
150
|
|
107
|
-
|
108
|
-
|
109
|
-
@top_node.search("//*[@id='#{id}']")
|
151
|
+
def submit_button_include?(element)
|
152
|
+
@submit_buttons.any? {|b| b.node == element }
|
110
153
|
end
|
154
|
+
end
|
155
|
+
end
|
111
156
|
|
157
|
+
# Patch for bug
|
158
|
+
Mechanize::Form.class_eval do
|
159
|
+
# Returns all buttons of type Submit
|
160
|
+
def submits
|
161
|
+
@submits ||= buttons.select {|f|
|
162
|
+
f.class == Mechanize::Form::Submit || (f.class == Mechanize::Form::Button && (f.type.nil? || f.type == 'submit'))
|
163
|
+
}
|
112
164
|
end
|
113
165
|
end
|
@@ -65,10 +65,10 @@ module Hypermicrodata
|
|
65
65
|
# This returns an empty string if can't form a valid
|
66
66
|
# absolute url as per the Microdata spec.
|
67
67
|
def make_absolute_url(url)
|
68
|
-
return url unless URI.parse(url).relative?
|
68
|
+
return url unless Addressable::URI.parse(url).relative?
|
69
69
|
begin
|
70
|
-
URI.parse(@page_url).merge(url).to_s
|
71
|
-
rescue
|
70
|
+
Addressable::URI.parse(@page_url).merge(url).to_s
|
71
|
+
rescue
|
72
72
|
url
|
73
73
|
end
|
74
74
|
end
|
@@ -104,7 +104,7 @@ module Hypermicrodata
|
|
104
104
|
|
105
105
|
def extract_property
|
106
106
|
if @element.attribute('itemscope')
|
107
|
-
Item.
|
107
|
+
Item.parse(@element, @page_url)
|
108
108
|
else
|
109
109
|
extract_property_value
|
110
110
|
end
|
@@ -26,11 +26,11 @@ module Hypermicrodata
|
|
26
26
|
end
|
27
27
|
|
28
28
|
def names
|
29
|
-
(
|
29
|
+
(node['itemprop'] || '').split(' ')
|
30
30
|
end
|
31
31
|
|
32
32
|
def rels
|
33
|
-
rel = (
|
33
|
+
rel = (node['rel'] || node['data-rel'] || @button.dom_class || '')
|
34
34
|
rel.split(' ')
|
35
35
|
end
|
36
36
|
|
@@ -46,6 +46,10 @@ module Hypermicrodata
|
|
46
46
|
true
|
47
47
|
end
|
48
48
|
|
49
|
+
def node
|
50
|
+
@button.node
|
51
|
+
end
|
52
|
+
|
49
53
|
private
|
50
54
|
def setup!
|
51
55
|
if method_field = @form.fields.find { |f| f.name == '_method' }
|
@@ -76,30 +80,4 @@ module Hypermicrodata
|
|
76
80
|
end.compact.join('&')
|
77
81
|
end
|
78
82
|
end
|
79
|
-
|
80
|
-
class FormParser
|
81
|
-
attr_reader :submit_buttons
|
82
|
-
|
83
|
-
def initialize(element, page_url = nil)
|
84
|
-
@element, @page_url = element, page_url
|
85
|
-
form = Mechanize::Form.new(element)
|
86
|
-
@submit_buttons = form.submits.map do |button|
|
87
|
-
SubmitButton.new(button, form)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def self.parse(element, page_url = nil)
|
92
|
-
self.new(element, page_url).submit_buttons
|
93
|
-
end
|
94
|
-
end
|
95
|
-
end
|
96
|
-
|
97
|
-
# Patch for bug
|
98
|
-
Mechanize::Form.class_eval do
|
99
|
-
# Returns all buttons of type Submit
|
100
|
-
def submits
|
101
|
-
@submits ||= buttons.select {|f|
|
102
|
-
f.class == Mechanize::Form::Submit || (f.class == Mechanize::Form::Button && (f.type.nil? || f.type == 'submit'))
|
103
|
-
}
|
104
|
-
end
|
105
83
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hypermicrodata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Jason Ronallo
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2015-01-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -67,6 +67,20 @@ dependencies:
|
|
67
67
|
- - '>='
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '0'
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: addressable
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - '>='
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: '0'
|
77
|
+
type: :runtime
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - '>='
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: '0'
|
70
84
|
- !ruby/object:Gem::Dependency
|
71
85
|
name: bundler
|
72
86
|
requirement: !ruby/object:Gem::Requirement
|
@@ -154,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
154
168
|
version: '0'
|
155
169
|
requirements: []
|
156
170
|
rubyforge_project:
|
157
|
-
rubygems_version: 2.4.
|
171
|
+
rubygems_version: 2.4.3
|
158
172
|
signing_key:
|
159
173
|
specification_version: 4
|
160
174
|
summary: Ruby library for extracting HTML5 Microdata with Hypermedia
|