hypermicrodata 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 31c2827d5ef761811004b563f135127def94a625
4
- data.tar.gz: 229d52f8eee1e15347d72171d37f6ebda948df84
3
+ metadata.gz: bc45691f143d5cfcd20199463e9b549565c04b69
4
+ data.tar.gz: 2f0c44acf87fa98cc606bf3f292d3c940a2cbf57
5
5
  SHA512:
6
- metadata.gz: b1caaf52ab0fe4dc1c3285b04b9a1db8b3b1debe0d2966cc3c903d88a72e3900d3d2815521c95d5d98217e0c81cffe8f1d63fa9977e53d2ee1db361f8ea70de0
7
- data.tar.gz: 3f2531028ea38dc3d082e84a14aa426406d7ebee8667e3459611ccfd05c7857eb965daf73715c5c390ef272989566b761386512955e940d4b6032bb821332dfb
6
+ metadata.gz: 2c3b43e8c4ca94990f23827ccd3cb4111e7f98143c5291bb166a4bb3c621dc65065184e826caac6d792ec5b504d0f141dbc5cf6cb9956a90314ad5deb625c824
7
+ data.tar.gz: 71df574ab4d757fed7e67e39f3d6e88afdf76a4f416955ebce4c4884d7ca271e4c9cd6f713d5072bad004c154121116034efa1514bed3f7236ce4f42eae02e2a
data/.travis.yml CHANGED
@@ -2,7 +2,8 @@ language: ruby
2
2
  rvm:
3
3
  - 1.9.3
4
4
  - 2.0.0
5
- - 2.1.1
5
+ - 2.1
6
+ - 2.2
6
7
  - jruby-19mode # JRuby in 1.9 mode
7
8
  # uncomment this line if your project needs to run something other than `rake`:
8
9
  script: rake test
data/README.md CHANGED
@@ -42,6 +42,14 @@ Supported formats are
42
42
 
43
43
  When you use this in Rails, you don't need to extract data manually.
44
44
 
45
+ /config/mime_types.rb
46
+
47
+ ```
48
+ Mime::Type.register 'application/vnd.amundsen-uber+json', :uberjson
49
+ # or if you want HAL
50
+ Mime::Type.register 'application/hal+json', :haljson
51
+ ```
52
+
45
53
  /app/controllers/people_controller.rb
46
54
 
47
55
  ```
@@ -55,8 +63,7 @@ end
55
63
  /app/views/people/show.html.haml
56
64
 
57
65
  ```
58
- .person{itemscope: true, itemtype: 'http://schema.org/Person',
59
- itemid: person_url(@person), data: {main_item: true}}
66
+ %main.person{itemscope: true, itemtype: 'http://schema.org/Person', itemid: person_url(@person)}
60
67
  .media
61
68
  .media-image.pull-left
62
69
  = image_tag @person.picture_path, alt: '', itemprop: 'image'
@@ -66,6 +73,11 @@ end
66
73
  = link_to 'collection', people_path, rel: 'collection', itemprop: 'isPartOf'
67
74
  ```
68
75
 
76
+ `<main>` elements are considered the root nodes of the extraction into JSON.
77
+
78
+ If you don't want to use `<main>`, you can use elements with a `data-main-item` attribute instead.
79
+
80
+
69
81
  And you can serve the following JSON:
70
82
 
71
83
  ```
@@ -22,6 +22,7 @@ Gem::Specification.new do |spec|
22
22
  spec.add_dependency "mechanize"
23
23
  spec.add_dependency "halibut"
24
24
  spec.add_dependency "multi_json"
25
+ spec.add_dependency "addressable"
25
26
 
26
27
  spec.add_development_dependency "bundler", "~> 1.3"
27
28
  spec.add_development_dependency "rake"
@@ -15,7 +15,7 @@ require "hypermicrodata/extract"
15
15
  require "hypermicrodata/rails/html_based_json_renderer"
16
16
  require 'open-uri'
17
17
  require 'json'
18
- require 'uri'
18
+ require 'addressable/uri'
19
19
 
20
20
  module Hypermicrodata
21
21
 
@@ -12,17 +12,29 @@ module Hypermicrodata
12
12
  end
13
13
 
14
14
  def extract_items
15
- itemscopes = []
16
- if @filter_xpath_attr
17
- itemscopes = @doc.xpath("//*[#{@filter_xpath_attr} and @itemscope]")
18
- puts "XPath //*[#{@filter_xpath_attr}] is not found. root node is used." if itemscopes.empty?
19
- end
20
- itemscopes = @doc.xpath('self::*[@itemscope] | .//*[@itemscope and not(@itemprop)]') if itemscopes.empty?
21
-
22
15
  itemscopes.collect do |itemscope|
23
- Item.new(itemscope, @page_url)
16
+ Item.parse(itemscope, @page_url)
24
17
  end
25
18
  end
26
19
 
20
+ private
21
+
22
+ def itemscopes
23
+ items_xpath = 'self::*[@itemscope] | .//*[@itemscope and not(@itemprop)] | .//form[not(@itemprop)]'
24
+ if @filter_xpath_attr
25
+ filtered_doc = @doc.xpath("//*[#{@filter_xpath_attr}]")
26
+ unless filtered_doc.empty?
27
+ return filtered_doc.xpath(items_xpath)
28
+ end
29
+ end
30
+ print "XPath //*[#{@filter_xpath_attr}] is not found. "
31
+ filtered_doc = @doc.xpath('//main')
32
+ unless filtered_doc.empty?
33
+ print "main node is used.\n"
34
+ return filtered_doc.xpath(items_xpath)
35
+ end
36
+ print "root node is used.\n"
37
+ @doc.xpath(items_xpath)
38
+ end
27
39
  end
28
40
  end
@@ -2,6 +2,14 @@ module Hypermicrodata
2
2
  class Item
3
3
  attr_reader :type, :properties, :links, :id
4
4
 
5
+ def self.parse(top_node, page_url)
6
+ if top_node.name == 'form'
7
+ FormItem.new(top_node, page_url)
8
+ else
9
+ Item.new(top_node, page_url)
10
+ end
11
+ end
12
+
5
13
  def initialize(top_node, page_url)
6
14
  @top_node = top_node
7
15
  @type = extract_itemtype
@@ -68,8 +76,7 @@ module Hypermicrodata
68
76
  itemscope = element.attribute('itemscope')
69
77
  itemprop = element.attribute('itemprop')
70
78
  internal_elements = extract_elements(element)
71
- add_itemprop(element) if itemscope || itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name)
72
- add_form(element) if element.name == 'form'
79
+ add_itemprop(element) if itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name)
73
80
  parse_elements(internal_elements) if internal_elements && !itemscope
74
81
  end
75
82
 
@@ -77,7 +84,10 @@ module Hypermicrodata
77
84
  def add_itemprop(element)
78
85
  property = ItempropParser.parse(element, @page_url)
79
86
  if property.link? && property.names.empty? && property.rels.empty?
80
- (@links['link'] ||= []) << property
87
+ href = property.value.to_s.strip
88
+ unless href.empty? || href == '#' # href which doesn't work as link is ignored
89
+ (@links[element.name] ||= []) << property
90
+ end
81
91
  else
82
92
  property.names.each { |name| (@properties[name] ||= []) << property }
83
93
  property.rels.each { |rel| (@links[rel] ||= []) << property }
@@ -92,22 +102,64 @@ module Hypermicrodata
92
102
  end
93
103
  end
94
104
 
95
- def add_form(element)
96
- submit_buttons = FormParser.parse(element, @page_url)
97
- submit_buttons.each do |submit_button|
98
- submit_button.names.each { |name| (@properties[name] ||= []) << submit_button }
99
- if submit_button.rels.empty?
100
- (@links['submit'] ||= []) << submit_button
101
- else
102
- submit_button.rels.each { |rel| (@links[rel] ||= []) << submit_button }
105
+ # Find an element with a matching id
106
+ def find_with_id(id)
107
+ @top_node.search("//*[@id='#{id}']")
108
+ end
109
+ end
110
+
111
+ class FormItem < Item
112
+ attr_reader :submit_buttons
113
+
114
+ def initialize(top_node, page_url)
115
+ form = Mechanize::Form.new(top_node)
116
+ @submit_buttons = form.submits.map do |button|
117
+ SubmitButton.new(button, form)
118
+ end
119
+ super
120
+ end
121
+
122
+ private
123
+
124
+ def extract_itemtype
125
+ super || ['http://schema.org/Action']
126
+ end
127
+
128
+ # TODO: Make it DRY
129
+ def parse_element(element)
130
+ itemscope = element.attribute('itemscope')
131
+ itemprop = element.attribute('itemprop')
132
+ internal_elements = extract_elements(element)
133
+ add_itemprop(element) if itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name) || submit_button_include?(element)
134
+ parse_elements(internal_elements) if internal_elements && !itemscope
135
+ end
136
+
137
+ def add_itemprop(element)
138
+ return super unless submit_button_include?(element)
139
+ property = @submit_buttons.find {|b| b.node == element }
140
+ if property.names.empty? && property.rels.empty?
141
+ href = property.value.to_s.strip
142
+ unless href.empty? || href == '#' # href which doesn't work as link is ignored
143
+ (@links[element.name] ||= []) << property
103
144
  end
145
+ else
146
+ property.names.each { |name| (@properties[name] ||= []) << property }
147
+ property.rels.each { |rel| (@links[rel] ||= []) << property }
104
148
  end
105
149
  end
106
150
 
107
- # Find an element with a matching id
108
- def find_with_id(id)
109
- @top_node.search("//*[@id='#{id}']")
151
+ def submit_button_include?(element)
152
+ @submit_buttons.any? {|b| b.node == element }
110
153
  end
154
+ end
155
+ end
111
156
 
157
+ # Patch for bug
158
+ Mechanize::Form.class_eval do
159
+ # Returns all buttons of type Submit
160
+ def submits
161
+ @submits ||= buttons.select {|f|
162
+ f.class == Mechanize::Form::Submit || (f.class == Mechanize::Form::Button && (f.type.nil? || f.type == 'submit'))
163
+ }
112
164
  end
113
165
  end
@@ -65,10 +65,10 @@ module Hypermicrodata
65
65
  # This returns an empty string if can't form a valid
66
66
  # absolute url as per the Microdata spec.
67
67
  def make_absolute_url(url)
68
- return url unless URI.parse(url).relative?
68
+ return url unless Addressable::URI.parse(url).relative?
69
69
  begin
70
- URI.parse(@page_url).merge(url).to_s
71
- rescue URI::Error
70
+ Addressable::URI.parse(@page_url).merge(url).to_s
71
+ rescue
72
72
  url
73
73
  end
74
74
  end
@@ -104,7 +104,7 @@ module Hypermicrodata
104
104
 
105
105
  def extract_property
106
106
  if @element.attribute('itemscope')
107
- Item.new(@element, @page_url)
107
+ Item.parse(@element, @page_url)
108
108
  else
109
109
  extract_property_value
110
110
  end
@@ -26,11 +26,11 @@ module Hypermicrodata
26
26
  end
27
27
 
28
28
  def names
29
- (@button.node['itemprop'] || '').split(' ')
29
+ (node['itemprop'] || '').split(' ')
30
30
  end
31
31
 
32
32
  def rels
33
- rel = (@button.node['rel'] || @button.node['data-rel'] || @button.dom_class || '')
33
+ rel = (node['rel'] || node['data-rel'] || @button.dom_class || '')
34
34
  rel.split(' ')
35
35
  end
36
36
 
@@ -46,6 +46,10 @@ module Hypermicrodata
46
46
  true
47
47
  end
48
48
 
49
+ def node
50
+ @button.node
51
+ end
52
+
49
53
  private
50
54
  def setup!
51
55
  if method_field = @form.fields.find { |f| f.name == '_method' }
@@ -76,30 +80,4 @@ module Hypermicrodata
76
80
  end.compact.join('&')
77
81
  end
78
82
  end
79
-
80
- class FormParser
81
- attr_reader :submit_buttons
82
-
83
- def initialize(element, page_url = nil)
84
- @element, @page_url = element, page_url
85
- form = Mechanize::Form.new(element)
86
- @submit_buttons = form.submits.map do |button|
87
- SubmitButton.new(button, form)
88
- end
89
- end
90
-
91
- def self.parse(element, page_url = nil)
92
- self.new(element, page_url).submit_buttons
93
- end
94
- end
95
- end
96
-
97
- # Patch for bug
98
- Mechanize::Form.class_eval do
99
- # Returns all buttons of type Submit
100
- def submits
101
- @submits ||= buttons.select {|f|
102
- f.class == Mechanize::Form::Submit || (f.class == Mechanize::Form::Button && (f.type.nil? || f.type == 'submit'))
103
- }
104
- end
105
83
  end
@@ -1,3 +1,3 @@
1
1
  module Hypermicrodata
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hypermicrodata
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jason Ronallo
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-10-13 00:00:00.000000000 Z
12
+ date: 2015-01-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -67,6 +67,20 @@ dependencies:
67
67
  - - '>='
68
68
  - !ruby/object:Gem::Version
69
69
  version: '0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: addressable
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - '>='
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
70
84
  - !ruby/object:Gem::Dependency
71
85
  name: bundler
72
86
  requirement: !ruby/object:Gem::Requirement
@@ -154,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
168
  version: '0'
155
169
  requirements: []
156
170
  rubyforge_project:
157
- rubygems_version: 2.4.2
171
+ rubygems_version: 2.4.3
158
172
  signing_key:
159
173
  specification_version: 4
160
174
  summary: Ruby library for extracting HTML5 Microdata with Hypermedia