rdf-microdata 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,232 @@
1
+ module RDF::Microdata
2
+ class Reader < RDF::Reader
3
+ ##
4
+ # Nokogiri implementation of an HTML parser.
5
+ #
6
+ # @see http://nokogiri.org/
7
+ module Nokogiri
8
##
# Identifies the XML/HTML library this implementation module is built on.
#
# @return [Symbol] always `:nokogiri`
def self.library
  :nokogiri
end
15
+
16
# Proxy class to implement uniform element accessors.
#
# Wraps a node together with the proxy of its parent. Any message not
# defined here is delegated to the wrapped node.
class NodeProxy
  # @return [Object] the wrapped node
  attr_reader :node
  # @return [NodeProxy, nil] proxy of the enclosing node, when one was given
  attr_reader :parent

  ##
  # @param [Object] node node to wrap
  # @param [NodeProxy, nil] parent proxy of the enclosing node
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # From HTML5 [3.2.3.3]
  #   If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
  #   on an element, user agents must use the lang attribute in the XML namespace, and the lang
  #   attribute in no namespace must be ignored for the purposes of determining the element's
  #   language.
  #
  # When the node carries no language attribute the parent's language is
  # used; the result is falsy (nil or false) when nothing is found.
  #
  # @return [String, nil, false]
  def language
    in_html = @node.document.is_a?(::Nokogiri::HTML::Document)
    if in_html && @node.attributes["xml:lang"]
      @node.attributes["xml:lang"].to_s
    elsif in_html && @node.attributes["lang"]
      @node.attributes["lang"].to_s
    elsif @node.attribute("lang")
      @node.attribute("lang").to_s
    else
      parent && parent.element? && parent.language
    end
  end

  ##
  # Get any xml:base in effect for this element, looking at the element's
  # own attributes first and then at its parent chain.
  #
  # @return [Object, nil] the attribute value found, or nil when there is none
  def base
    if @base.nil?
      # `false` is cached to record "looked, found nothing" so the lookup
      # is not repeated on every call.
      @base = attributes['xml:base'] ||
        (parent && parent.element? && parent.base) ||
        false
    end

    @base || nil
  end

  ##
  # Human-readable path made of the parent's path followed by this node's
  # name. The separator depends on the node class: "/" for elements, "@"
  # for attributes and "?" for anything else.
  #
  # @return [String]
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      case @node
      when ::Nokogiri::XML::Element then path.join("/")
      when ::Nokogiri::XML::Attr    then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text
  #
  # @return [Boolean]
  def text_content?
    @node.children.all? {|c| c.text?}
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # @return [Hash{String => String}] namespace prefix mapped to its href
  def namespaces
    @node.namespace_definitions.inject({}) do |memo, ns|
      memo[ns.prefix] = ns.href.to_s
      memo
    end
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy]
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Elements of this node
  #
  # @return [NodeSetProxy]
  def elements
    NodeSetProxy.new(@node.elements, self)
  end

  ##
  # Proxy for everything else to @node. The caller's block, if any, is
  # forwarded as well so iterator-style methods behave as on the node.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end

  ##
  # Report delegated methods through `respond_to?`.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node.respond_to?(method, include_private) || super
  end
end
112
+
113
##
# NodeSet proxy
#
# Wraps a collection of nodes plus the proxy that owns them, handing out
# NodeProxy instances instead of raw nodes. Any message not defined here is
# delegated to the wrapped collection.
class NodeSetProxy
  # @return [Object] the wrapped collection of nodes
  attr_reader :node_set
  # @return [NodeProxy, nil] proxy used as parent of every yielded node
  attr_reader :parent

  ##
  # @param [Object] node_set collection of nodes
  # @param [NodeProxy, nil] parent proxy owning the nodes
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  def each
    @node_set.each do |c|
      yield NodeProxy.new(c, parent)
    end
  end

  ##
  # Return proxy for first element and remove it
  #
  # @return [NodeProxy, nil] nil (or false) when the set yields nothing
  def shift
    first = node_set.shift
    first && NodeProxy.new(first, parent)
  end

  ##
  # Add NodeSetProxys
  #
  # @param [NodeSetProxy] other anything exposing `node_set`
  # @return [NodeSetProxy] new proxy keeping this proxy's parent
  def +(other)
    NodeSetProxy.new(node_set + other.node_set, parent)
  end

  ##
  # Add a NodeProxy
  #
  # @param [NodeProxy, Object] elem a proxy (its node is unwrapped) or a raw node
  # @return [NodeSetProxy] self, so calls can be chained
  def <<(elem)
    node_set << (elem.is_a?(NodeProxy) ? elem.node : elem)
    self
  end

  ##
  # @return [String] inspected list of the display paths of the members
  def inspect
    @node_set.map {|c| NodeProxy.new(c, parent).display_path}.inspect
  end

  ##
  # Proxy for everything else to @node_set. The caller's block is forwarded,
  # so `set.select { ... }` filters instead of returning an enumerator; the
  # block receives the raw nodes held by the wrapped collection.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end

  ##
  # Report delegated methods through `respond_to?`.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node_set.respond_to?(method, include_private) || super
  end
end
169
+
170
##
# Initializes the underlying XML library.
#
# An input that already is a Nokogiri HTML document is used as-is. Anything
# else is parsed as HTML, using `base_uri` as the document URL and an
# encoding taken from `options[:encoding]`, else from `input.charset` when
# the input offers it, else 'utf-8'. The chosen encoding is written back
# into `options`.
#
# @param [Object] input document, or source handed to the parser
# @param [Hash{Symbol => Object}] options
# @option options [String] :encoding encoding to parse with
# @return [void]
def initialize_html(input, options = {})
  require 'nokogiri' unless defined?(::Nokogiri)

  @doc = if input.is_a?(::Nokogiri::HTML::Document)
    input
  else
    # Try to detect charset from input
    options[:encoding] ||= input.charset if input.respond_to?(:charset)

    # Otherwise, default is utf-8
    options[:encoding] ||= 'utf-8'

    ::Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
  end
end
190
+
191
+ # Accessor methods to mask native elements & attributes
192
+
193
##
# Return proxy for document root
#
# The proxy is created once and reused. Nothing is returned while there is
# no document or the document has no root.
#
# @return [NodeProxy, nil]
def root
  return nil unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end
198
+
199
##
# Document errors
#
# @return [Object] whatever the document reports through `errors`; an empty
#   Array when no document has been set
def doc_errors
  # Guard against a missing document the same way `root` does, rather than
  # failing with NoMethodError on nil.
  @doc ? @doc.errors : []
end
204
+
205
##
# Find value of document base
#
# Looks for a `base` element under `html>head`. Its `href`, with any
# fragment removed, replaces the supplied base. A `base` element whose
# `href` is missing, empty or only a fragment leaves the supplied base
# untouched instead of replacing it with nil or an empty string.
#
# @param [String] base Existing base from URI or :base_uri
# @return [String, nil]
def doc_base(base)
  # find if the document has a base element
  base_el = @doc.at_css("html>head>base")
  return base unless base_el

  href = base_el.attribute("href").to_s.split("#").first
  href.nil? || href.empty? ? base : href
end
216
+
217
##
# Based on Microdata element.getItems
#
# Selects every element carrying an `itemscope` attribute and drops those
# that also carry `itemprop`.
#
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
# @return [Array<NodeProxy>] one parentless proxy per remaining element
def getItems
  @doc.css('[itemscope]').
    reject {|el| el.has_attribute?('itemprop')}.
    map {|el| NodeProxy.new(el)}
end
224
+
225
##
# Look up an element in the document by id
#
# The id is matched with an XPath attribute test rather than a CSS `#id`
# selector, so ids containing characters that are special in CSS (".", ":",
# a leading digit, ...) are matched literally.
#
# @param [#to_s] id value of the `id` attribute to look for
# @return [NodeProxy, nil] parentless proxy of the first match, if any
def find_element_by_id(id)
  id = id.to_s

  # XPath 1.0 string literals have no escape syntax: pick the quote
  # character the id does not contain, or assemble the value with concat()
  # when it contains both kinds.
  literal = if !id.include?("'")
    "'#{id}'"
  elsif !id.include?('"')
    "\"#{id}\""
  else
    "concat(" + id.split("'", -1).map {|part| "'#{part}'"}.join(%q(, "'", )) + ")"
  end

  (e = @doc.at_xpath("//*[@id=#{literal}]")) && NodeProxy.new(e)
end
230
+ end
231
+ end
232
+ end
@@ -0,0 +1,277 @@
1
+ require 'htmlentities'
2
+
3
+ module RDF::Microdata
4
+ class Reader < RDF::Reader
5
+ ##
6
+ # REXML implementation of an HTML parser.
7
+ #
8
+ # @see http://www.germane-software.com/software/rexml/
9
+ module REXML
10
##
# Identifies the XML library this implementation module is built on.
#
# @return [Symbol] always `:rexml`
def self.library
  :rexml
end
17
+
18
# Proxy class to implement uniform element accessors.
#
# Wraps a node together with the proxy of its parent. Any message not
# defined here is delegated to the wrapped node.
class NodeProxy
  # @return [Object] the wrapped node
  attr_reader :node
  # @return [NodeProxy, nil] proxy of the enclosing node, when one was given
  attr_reader :parent

  ##
  # @param [Object] node node to wrap
  # @param [NodeProxy, nil] parent proxy of the enclosing node
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # Reads the node's `lang` attribute and otherwise falls back to the
  # parent's language; the result is falsy (nil or false) when nothing is
  # found.
  #
  # @return [String, nil, false]
  def language
    own = @node.attribute("lang")
    return own.to_s if own

    parent && parent.element? && parent.language
  end

  ##
  # Return xml:base on element, if defined, looking at the element's own
  # attributes first and then at its parent chain.
  #
  # @return [Object, nil] the attribute value found, or nil when there is none
  def base
    if @base.nil?
      # `false` is cached to record "looked, found nothing" so the lookup
      # is not repeated on every call.
      @base = attributes['xml:base'] ||
        (parent && parent.element? && parent.base) ||
        false
    end

    @base || nil
  end

  ##
  # Human-readable path made of the parent's path followed by this node's
  # name. The separator depends on the node class: "/" for elements, "@"
  # for attributes and "?" for anything else.
  #
  # @return [String]
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      case @node
      when ::REXML::Element   then path.join("/")
      when ::REXML::Attribute then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text
  #
  # @return [Boolean]
  def text_content?
    @node.children.all? {|c| c.is_a?(::REXML::Text)}
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # Only attributes named exactly `xmlns` or `xmlns:<prefix>` count; the
  # pattern is anchored at both ends so that a name which merely starts with
  # "xmlns" is not mistaken for the default namespace declaration.
  #
  # @return [Hash{String, nil => Object}] prefix (nil for the default
  #   namespace) mapped to the attribute value
  def namespaces
    ns_decls = {}
    @node.attributes.each do |name, value|
      next unless name =~ /\Axmlns(?::(.+))?\z/
      ns_decls[$1] = value
    end
    ns_decls
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy]
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Elements of this node
  #
  # @return [NodeSetProxy]
  def elements
    NodeSetProxy.new(@node.children.select {|c| c.is_a?(::REXML::Element)}, self)
  end

  ##
  # Inner text of an element: every descendant text node, entity-decoded
  # with HTMLEntities and concatenated.
  #
  # @see http://apidock.com/ruby/REXML/Element/get_text#743-Get-all-inner-texts
  # @return [String]
  def inner_text
    coder = HTMLEntities.new
    ::REXML::XPath.match(@node, './/text()').map {|text| coder.decode(text)}.join
  end

  ##
  # Inner markup of an element: the string form of each child, concatenated.
  #
  # @return [String]
  def inner_html
    @node.children.map(&:to_s).join
  end

  ##
  # Node type accessors
  #
  # @return [Boolean] true when the wrapped node is a REXML element
  def element?
    @node.is_a?(::REXML::Element)
  end

  ##
  # @param [String] attr attribute name
  # @return [Boolean] true when the node reports a value for the attribute
  def has_attribute?(attr)
    !!node.attribute(attr)
  end

  ##
  # Proxy for everything else to @node. The caller's block, if any, is
  # forwarded as well so iterator-style methods behave as on the node.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end

  ##
  # Report delegated methods through `respond_to?`.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node.respond_to?(method, include_private) || super
  end
end
150
+
151
##
# NodeSet proxy
#
# Wraps a collection of nodes plus the proxy that owns them, handing out
# NodeProxy instances instead of raw nodes. Any message not defined here is
# delegated to the wrapped collection.
class NodeSetProxy
  # @return [Object] the wrapped collection of nodes
  attr_reader :node_set
  # @return [NodeProxy, nil] proxy used as parent of every yielded node
  attr_reader :parent

  ##
  # @param [Object] node_set collection of nodes
  # @param [NodeProxy, nil] parent proxy owning the nodes
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  def each
    @node_set.each do |c|
      yield NodeProxy.new(c, parent)
    end
  end

  ##
  # Return proxy for first element and remove it
  #
  # @return [NodeProxy, nil] nil (or false) when the set yields nothing
  def shift
    first = node_set.shift
    first && NodeProxy.new(first, parent)
  end

  ##
  # Add NodeSetProxys
  #
  # Works on a copy of this proxy's collection, so neither operand is
  # modified.
  #
  # @param [NodeSetProxy] other anything exposing `node_set`
  # @return [NodeSetProxy] new proxy keeping this proxy's parent
  def +(other)
    merged = node_set.clone
    other.node_set.each {|n| merged << n}
    NodeSetProxy.new(merged, parent)
  end

  ##
  # Add a NodeProxy
  #
  # @param [NodeProxy, Object] elem a proxy (its node is unwrapped) or a raw node
  # @return [NodeSetProxy] self, so calls can be chained
  def <<(elem)
    node_set << (elem.is_a?(NodeProxy) ? elem.node : elem)
    self
  end

  ##
  # @return [String] inspected list of the display paths of the members
  def inspect
    @node_set.map {|c| NodeProxy.new(c, parent).display_path}.inspect
  end

  ##
  # Proxy for everything else to @node_set. The caller's block is forwarded,
  # so `set.select { ... }` filters instead of returning an enumerator; the
  # block receives the raw nodes held by the wrapped collection.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end

  ##
  # Report delegated methods through `respond_to?`.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node_set.respond_to?(method, include_private) || super
  end
end
209
+
210
##
# Initializes the underlying XML library.
#
# An input that already is a REXML document is used as-is. Anything else is
# read (through `read` when available, `to_s` otherwise) and parsed as XML.
# In that case `options[:encoding]` is filled in from `input.charset` or
# defaulted to 'utf-8', and `@base_uri` is set from `base_uri`.
#
# @param [Object] input document, IO-like object or string-like source
# @param [Hash{Symbol => Object}] options
# @option options [String] :encoding encoding recorded for the input
# @return [void]
def initialize_html(input, options = {})
  require 'rexml/document' unless defined?(::REXML)

  @doc = if input.is_a?(::REXML::Document)
    input
  else
    # Try to detect charset from input
    options[:encoding] ||= input.charset if input.respond_to?(:charset)

    # Otherwise, default is utf-8
    options[:encoding] ||= 'utf-8'

    # Set xml:base for the document element, if defined
    @base_uri = base_uri ? base_uri.to_s : nil

    # Only parse as XML, no HTML mode
    source = input.respond_to?(:read) ? input.read : input.to_s
    ::REXML::Document.new(source)
  end
end
234
+
235
+ # Accessor methods to mask native elements & attributes
236
+
237
##
# Return proxy for document root
#
# The proxy is created once and reused. Nothing is returned while there is
# no document or the document has no root.
#
# @return [NodeProxy, nil]
def root
  return nil unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end
242
+
243
##
# Document errors
#
# This implementation never reports any.
#
# @return [Array] always empty
def doc_errors
  []
end
248
+
249
##
# Find value of document base
#
# Looks for a `base` element at `/html/head/base`. Its `href`, with any
# fragment removed, replaces the supplied base. A `base` element whose
# `href` is missing, empty or only a fragment leaves the supplied base
# untouched. When no base results, `@base_uri` is returned instead.
#
# @param [String] base Existing base from URI or :base_uri
# @return [String, nil]
def doc_base(base)
  # find if the document has a base element
  base_el = ::REXML::XPath.first(@doc, "/html/head/base")
  if base_el
    href = base_el.attribute("href").to_s.split("#").first
    base = href unless href.nil? || href.empty?
  end

  base || @base_uri
end
261
+
262
##
# Based on Microdata element.getItems
#
# Selects every element carrying an `itemscope` attribute and drops those
# that also carry `itemprop`. The XPath step uses an explicit `*` node test,
# as the XPath grammar requires one before a predicate.
#
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
# @return [Array<NodeProxy>] one parentless proxy per remaining element
def getItems
  ::REXML::XPath.match(@doc, "//*[@itemscope]").
    reject {|el| el.attribute('itemprop')}.
    map {|el| NodeProxy.new(el)}
end
269
+
270
##
# Look up an element in the document by id
#
# The XPath step uses an explicit `*` node test, and the id is embedded as
# a properly quoted string literal so that quote characters in the id
# cannot break or alter the query.
#
# @param [#to_s] id value of the `id` attribute to look for
# @return [NodeProxy, nil] parentless proxy of the first match, if any
def find_element_by_id(id)
  id = id.to_s

  # XPath 1.0 string literals have no escape syntax: pick the quote
  # character the id does not contain, or assemble the value with concat()
  # when it contains both kinds.
  literal = if !id.include?("'")
    "'#{id}'"
  elsif !id.include?('"')
    "\"#{id}\""
  else
    "concat(" + id.split("'", -1).map {|part| "'#{part}'"}.join(%q(, "'", )) + ")"
  end

  (e = ::REXML::XPath.first(@doc, "//*[@id=#{literal}]")) && NodeProxy.new(e)
end
275
+ end
276
+ end
277
+ end