hypermicrodata 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 31c2827d5ef761811004b563f135127def94a625
4
- data.tar.gz: 229d52f8eee1e15347d72171d37f6ebda948df84
3
+ metadata.gz: bc45691f143d5cfcd20199463e9b549565c04b69
4
+ data.tar.gz: 2f0c44acf87fa98cc606bf3f292d3c940a2cbf57
5
5
  SHA512:
6
- metadata.gz: b1caaf52ab0fe4dc1c3285b04b9a1db8b3b1debe0d2966cc3c903d88a72e3900d3d2815521c95d5d98217e0c81cffe8f1d63fa9977e53d2ee1db361f8ea70de0
7
- data.tar.gz: 3f2531028ea38dc3d082e84a14aa426406d7ebee8667e3459611ccfd05c7857eb965daf73715c5c390ef272989566b761386512955e940d4b6032bb821332dfb
6
+ metadata.gz: 2c3b43e8c4ca94990f23827ccd3cb4111e7f98143c5291bb166a4bb3c621dc65065184e826caac6d792ec5b504d0f141dbc5cf6cb9956a90314ad5deb625c824
7
+ data.tar.gz: 71df574ab4d757fed7e67e39f3d6e88afdf76a4f416955ebce4c4884d7ca271e4c9cd6f713d5072bad004c154121116034efa1514bed3f7236ce4f42eae02e2a
data/.travis.yml CHANGED
@@ -2,7 +2,8 @@ language: ruby
2
2
  rvm:
3
3
  - 1.9.3
4
4
  - 2.0.0
5
- - 2.1.1
5
+ - 2.1
6
+ - 2.2
6
7
  - jruby-19mode # JRuby in 1.9 mode
7
8
  # uncomment this line if your project needs to run something other than `rake`:
8
9
  script: rake test
data/README.md CHANGED
@@ -42,6 +42,14 @@ Supported formats are
42
42
 
43
43
  When you use this in Rails, you don't need to extract data manually.
44
44
 
45
+ /config/mime_types.rb
46
+
47
+ ```
48
+ Mime::Type.register 'application/vnd.amundsen-uber+json', :uberjson
49
+ # or if you want HAL
50
+ Mime::Type.register 'application/hal+json', :haljson
51
+ ```
52
+
45
53
  /app/controllers/people_controller.rb
46
54
 
47
55
  ```
@@ -55,8 +63,7 @@ end
55
63
  /app/views/people/show.html.haml
56
64
 
57
65
  ```
58
- .person{itemscope: true, itemtype: 'http://schema.org/Person',
59
- itemid: person_url(@person), data: {main_item: true}}
66
+ %main.person{itemscope: true, itemtype: 'http://schema.org/Person', itemid: person_url(@person)}
60
67
  .media
61
68
  .media-image.pull-left
62
69
  = image_tag @person.picture_path, alt: '', itemprop: 'image'
@@ -66,6 +73,11 @@ end
66
73
  = link_to 'collection', people_path, rel: 'collection', itemprop: 'isPartOf'
67
74
  ```
68
75
 
76
+ `<main>` elements are considered the root nodes for extraction into JSON.
77
+
78
+ If you don't want to use `<main>`, you can use elements with a `data-main-item` attribute instead.
79
+
80
+
69
81
  And you can serve the following JSON:
70
82
 
71
83
  ```
@@ -22,6 +22,7 @@ Gem::Specification.new do |spec|
22
22
  spec.add_dependency "mechanize"
23
23
  spec.add_dependency "halibut"
24
24
  spec.add_dependency "multi_json"
25
+ spec.add_dependency "addressable"
25
26
 
26
27
  spec.add_development_dependency "bundler", "~> 1.3"
27
28
  spec.add_development_dependency "rake"
@@ -15,7 +15,7 @@ require "hypermicrodata/extract"
15
15
  require "hypermicrodata/rails/html_based_json_renderer"
16
16
  require 'open-uri'
17
17
  require 'json'
18
- require 'uri'
18
+ require 'addressable/uri'
19
19
 
20
20
  module Hypermicrodata
21
21
 
@@ -12,17 +12,29 @@ module Hypermicrodata
12
12
  end
13
13
 
14
14
  def extract_items
15
- itemscopes = []
16
- if @filter_xpath_attr
17
- itemscopes = @doc.xpath("//*[#{@filter_xpath_attr} and @itemscope]")
18
- puts "XPath //*[#{@filter_xpath_attr}] is not found. root node is used." if itemscopes.empty?
19
- end
20
- itemscopes = @doc.xpath('self::*[@itemscope] | .//*[@itemscope and not(@itemprop)]') if itemscopes.empty?
21
-
22
15
  itemscopes.collect do |itemscope|
23
- Item.new(itemscope, @page_url)
16
+ Item.parse(itemscope, @page_url)
24
17
  end
25
18
  end
26
19
 
20
+ private
21
+
22
+ def itemscopes
23
+ items_xpath = 'self::*[@itemscope] | .//*[@itemscope and not(@itemprop)] | .//form[not(@itemprop)]'
24
+ if @filter_xpath_attr
25
+ filtered_doc = @doc.xpath("//*[#{@filter_xpath_attr}]")
26
+ unless filtered_doc.empty?
27
+ return filtered_doc.xpath(items_xpath)
28
+ end
29
+ end
30
+ print "XPath //*[#{@filter_xpath_attr}] is not found. "
31
+ filtered_doc = @doc.xpath('//main')
32
+ unless filtered_doc.empty?
33
+ print "main node is used.\n"
34
+ return filtered_doc.xpath(items_xpath)
35
+ end
36
+ print "root node is used.\n"
37
+ @doc.xpath(items_xpath)
38
+ end
27
39
  end
28
40
  end
@@ -2,6 +2,14 @@ module Hypermicrodata
2
2
  class Item
3
3
  attr_reader :type, :properties, :links, :id
4
4
 
5
+ def self.parse(top_node, page_url)
6
+ if top_node.name == 'form'
7
+ FormItem.new(top_node, page_url)
8
+ else
9
+ Item.new(top_node, page_url)
10
+ end
11
+ end
12
+
5
13
  def initialize(top_node, page_url)
6
14
  @top_node = top_node
7
15
  @type = extract_itemtype
@@ -68,8 +76,7 @@ module Hypermicrodata
68
76
  itemscope = element.attribute('itemscope')
69
77
  itemprop = element.attribute('itemprop')
70
78
  internal_elements = extract_elements(element)
71
- add_itemprop(element) if itemscope || itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name)
72
- add_form(element) if element.name == 'form'
79
+ add_itemprop(element) if itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name)
73
80
  parse_elements(internal_elements) if internal_elements && !itemscope
74
81
  end
75
82
 
@@ -77,7 +84,10 @@ module Hypermicrodata
77
84
  def add_itemprop(element)
78
85
  property = ItempropParser.parse(element, @page_url)
79
86
  if property.link? && property.names.empty? && property.rels.empty?
80
- (@links['link'] ||= []) << property
87
+ href = property.value.to_s.strip
88
+ unless href.empty? || href == '#' # href which doesn't work as link is ignored
89
+ (@links[element.name] ||= []) << property
90
+ end
81
91
  else
82
92
  property.names.each { |name| (@properties[name] ||= []) << property }
83
93
  property.rels.each { |rel| (@links[rel] ||= []) << property }
@@ -92,22 +102,64 @@ module Hypermicrodata
92
102
  end
93
103
  end
94
104
 
95
- def add_form(element)
96
- submit_buttons = FormParser.parse(element, @page_url)
97
- submit_buttons.each do |submit_button|
98
- submit_button.names.each { |name| (@properties[name] ||= []) << submit_button }
99
- if submit_button.rels.empty?
100
- (@links['submit'] ||= []) << submit_button
101
- else
102
- submit_button.rels.each { |rel| (@links[rel] ||= []) << submit_button }
105
+ # Find an element with a matching id
106
+ def find_with_id(id)
107
+ @top_node.search("//*[@id='#{id}']")
108
+ end
109
+ end
110
+
111
+ class FormItem < Item
112
+ attr_reader :submit_buttons
113
+
114
+ def initialize(top_node, page_url)
115
+ form = Mechanize::Form.new(top_node)
116
+ @submit_buttons = form.submits.map do |button|
117
+ SubmitButton.new(button, form)
118
+ end
119
+ super
120
+ end
121
+
122
+ private
123
+
124
+ def extract_itemtype
125
+ super || ['http://schema.org/Action']
126
+ end
127
+
128
+ # TODO: Make it DRY
129
+ def parse_element(element)
130
+ itemscope = element.attribute('itemscope')
131
+ itemprop = element.attribute('itemprop')
132
+ internal_elements = extract_elements(element)
133
+ add_itemprop(element) if itemprop || ItempropParser::LINK_ELEMENTS.include?(element.name) || submit_button_include?(element)
134
+ parse_elements(internal_elements) if internal_elements && !itemscope
135
+ end
136
+
137
+ def add_itemprop(element)
138
+ return super unless submit_button_include?(element)
139
+ property = @submit_buttons.find {|b| b.node == element }
140
+ if property.names.empty? && property.rels.empty?
141
+ href = property.value.to_s.strip
142
+ unless href.empty? || href == '#' # href which doesn't work as link is ignored
143
+ (@links[element.name] ||= []) << property
103
144
  end
145
+ else
146
+ property.names.each { |name| (@properties[name] ||= []) << property }
147
+ property.rels.each { |rel| (@links[rel] ||= []) << property }
104
148
  end
105
149
  end
106
150
 
107
- # Find an element with a matching id
108
- def find_with_id(id)
109
- @top_node.search("//*[@id='#{id}']")
151
+ def submit_button_include?(element)
152
+ @submit_buttons.any? {|b| b.node == element }
110
153
  end
154
+ end
155
+ end
111
156
 
157
+ # Patch for bug
158
+ Mechanize::Form.class_eval do
159
+ # Returns all buttons of type Submit
160
+ def submits
161
+ @submits ||= buttons.select {|f|
162
+ f.class == Mechanize::Form::Submit || (f.class == Mechanize::Form::Button && (f.type.nil? || f.type == 'submit'))
163
+ }
112
164
  end
113
165
  end
@@ -65,10 +65,10 @@ module Hypermicrodata
65
65
  # This returns an empty string if can't form a valid
66
66
  # absolute url as per the Microdata spec.
67
67
  def make_absolute_url(url)
68
- return url unless URI.parse(url).relative?
68
+ return url unless Addressable::URI.parse(url).relative?
69
69
  begin
70
- URI.parse(@page_url).merge(url).to_s
71
- rescue URI::Error
70
+ Addressable::URI.parse(@page_url).merge(url).to_s
71
+ rescue
72
72
  url
73
73
  end
74
74
  end
@@ -104,7 +104,7 @@ module Hypermicrodata
104
104
 
105
105
  def extract_property
106
106
  if @element.attribute('itemscope')
107
- Item.new(@element, @page_url)
107
+ Item.parse(@element, @page_url)
108
108
  else
109
109
  extract_property_value
110
110
  end
@@ -26,11 +26,11 @@ module Hypermicrodata
26
26
  end
27
27
 
28
28
  def names
29
- (@button.node['itemprop'] || '').split(' ')
29
+ (node['itemprop'] || '').split(' ')
30
30
  end
31
31
 
32
32
  def rels
33
- rel = (@button.node['rel'] || @button.node['data-rel'] || @button.dom_class || '')
33
+ rel = (node['rel'] || node['data-rel'] || @button.dom_class || '')
34
34
  rel.split(' ')
35
35
  end
36
36
 
@@ -46,6 +46,10 @@ module Hypermicrodata
46
46
  true
47
47
  end
48
48
 
49
+ def node
50
+ @button.node
51
+ end
52
+
49
53
  private
50
54
  def setup!
51
55
  if method_field = @form.fields.find { |f| f.name == '_method' }
@@ -76,30 +80,4 @@ module Hypermicrodata
76
80
  end.compact.join('&')
77
81
  end
78
82
  end
79
-
80
- class FormParser
81
- attr_reader :submit_buttons
82
-
83
- def initialize(element, page_url = nil)
84
- @element, @page_url = element, page_url
85
- form = Mechanize::Form.new(element)
86
- @submit_buttons = form.submits.map do |button|
87
- SubmitButton.new(button, form)
88
- end
89
- end
90
-
91
- def self.parse(element, page_url = nil)
92
- self.new(element, page_url).submit_buttons
93
- end
94
- end
95
- end
96
-
97
- # Patch for bug
98
- Mechanize::Form.class_eval do
99
- # Returns all buttons of type Submit
100
- def submits
101
- @submits ||= buttons.select {|f|
102
- f.class == Mechanize::Form::Submit || (f.class == Mechanize::Form::Button && (f.type.nil? || f.type == 'submit'))
103
- }
104
- end
105
83
  end
@@ -1,3 +1,3 @@
1
1
  module Hypermicrodata
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hypermicrodata
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jason Ronallo
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-10-13 00:00:00.000000000 Z
12
+ date: 2015-01-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -67,6 +67,20 @@ dependencies:
67
67
  - - '>='
68
68
  - !ruby/object:Gem::Version
69
69
  version: '0'
70
+ - !ruby/object:Gem::Dependency
71
+ name: addressable
72
+ requirement: !ruby/object:Gem::Requirement
73
+ requirements:
74
+ - - '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - '>='
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
70
84
  - !ruby/object:Gem::Dependency
71
85
  name: bundler
72
86
  requirement: !ruby/object:Gem::Requirement
@@ -154,7 +168,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
154
168
  version: '0'
155
169
  requirements: []
156
170
  rubyforge_project:
157
- rubygems_version: 2.4.2
171
+ rubygems_version: 2.4.3
158
172
  signing_key:
159
173
  specification_version: 4
160
174
  summary: Ruby library for extracting HTML5 Microdata with Hypermedia