rdf-microdata 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
1
+ module RDF::Microdata
2
+ class Reader < RDF::Reader
3
+ ##
4
+ # Nokogiri implementation of an HTML parser.
5
+ #
6
+ # @see http://nokogiri.org/
7
+ module Nokogiri
8
##
# Identifies the XML/HTML library this implementation module is built on.
#
# @return [Symbol] always `:nokogiri`
def self.library
  :nokogiri
end
15
+
16
# Proxy class to implement uniform element accessors.
#
# Wraps a parsed node together with the proxy of its parent. Any method not
# defined here is forwarded (arguments and block included) to the wrapped node.
class NodeProxy
  # @return [Object] the wrapped node
  attr_reader :node

  # @return [NodeProxy, nil] the proxy this one was created under, if any
  attr_reader :parent

  ##
  # @param [Object] node the node to wrap
  # @param [NodeProxy, nil] parent proxy of the parent node
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # From HTML5 [3.2.3.3]
  #   If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
  #   on an element, user agents must use the lang attribute in the XML namespace, and the lang
  #   attribute in no namespace must be ignored for the purposes of determining the element's
  #   language.
  #
  # Lookup order: `xml:lang`, then `lang` (both only when the owning document
  # is a Nokogiri::HTML::Document), then `attribute("lang")`, then the
  # language of the parent proxy when that parent is an element.
  #
  # @return [String, nil, false] the language, or a falsy value when none is in effect
  def language
    in_html = @node.document.is_a?(::Nokogiri::HTML::Document)
    if in_html && @node.attributes["xml:lang"]
      @node.attributes["xml:lang"].to_s
    elsif in_html && @node.attributes["lang"]
      @node.attributes["lang"].to_s
    elsif @node.attribute("lang")
      @node.attribute("lang").to_s
    else
      parent && parent.element? && parent.language
    end
  end

  ##
  # Get any xml:base in effect for this element, looking at the element's own
  # attributes first and then at its parent chain. The result is memoized;
  # `false` is stored internally to remember "looked, found nothing".
  #
  # @return [Object, nil] the `xml:base` value, or nil when none is in effect
  def base
    if @base.nil?
      @base = attributes['xml:base'] ||
        (parent && parent.element? && parent.base) ||
        false
    end

    @base == false ? nil : @base
  end

  ##
  # Human-readable path of node names from the outermost proxy down to this
  # node. Segments are joined with "/" for elements, "@" for attributes and
  # "?" for anything else. Memoized.
  #
  # @return [String]
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      case @node
      when ::Nokogiri::XML::Element then path.join("/")
      when ::Nokogiri::XML::Attr    then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text (also true when there are no children).
  #
  # @return [Boolean]
  def text_content?
    @node.children.all? { |c| c.text? }
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # @return [Hash{String => String}] prefix => namespace URI
  def namespaces
    @node.namespace_definitions.inject({}) do |memo, ns|
      memo[ns.prefix] = ns.href.to_s
      memo
    end
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy]
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Elements of this node
  #
  # @return [NodeSetProxy]
  def elements
    NodeSetProxy.new(@node.elements, self)
  end

  ##
  # Proxy for everything else to @node. The block is forwarded as well, so
  # block-taking methods of the wrapped node behave as they do on the node itself.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end

  ##
  # Keeps `respond_to?` consistent with what `method_missing` forwards.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node.respond_to?(method, include_private) || super
  end
end
112
+
113
##
# NodeSet proxy
#
# Wraps a node set together with the proxy of the node it was obtained from,
# so that members can be handed out as NodeProxy instances carrying that
# parent. Any method not defined here is forwarded (arguments and block
# included) to the wrapped node set, which yields/returns raw nodes.
class NodeSetProxy
  # @return [Object] the wrapped node set
  attr_reader :node_set

  # @return [NodeProxy, nil] proxy given to every member as its parent
  attr_reader :parent

  ##
  # @param [Object] node_set the node set to wrap
  # @param [NodeProxy, nil] parent proxy of the owning node
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  # @return [Object] whatever the wrapped set's `each` returns, or an
  #   enumerator over the same proxies when no block is given
  def each
    return enum_for(:each) unless block_given?

    @node_set.each do |c|
      yield NodeProxy.new(c, parent)
    end
  end

  ##
  # Return proxy for first element and remove it
  #
  # @return [NodeProxy, nil] nil (or false) when the wrapped set yields nothing
  def shift
    (e = node_set.shift) && NodeProxy.new(e, parent)
  end

  ##
  # Add NodeSetProxys
  #
  # @param [NodeSetProxy] other
  # @return [NodeSetProxy] a new proxy over both sets, keeping this proxy's parent
  def +(other)
    NodeSetProxy.new(node_set + other.node_set, parent)
  end

  ##
  # Add a NodeProxy (unwrapped first) or a raw node to the wrapped set.
  #
  # @param [NodeProxy, Nokogiri::XML::Node] elem
  # @return [NodeSetProxy] self, so calls can be chained
  def <<(elem)
    node_set << (elem.is_a?(NodeProxy) ? elem.node : elem)
    self
  end

  ##
  # @return [String] inspection of the display paths of all members
  def inspect
    @node_set.map { |c| NodeProxy.new(c, parent).display_path }.inspect
  end

  ##
  # Proxy for everything else to @node_set. The block is forwarded too, so
  # calls such as `map { ... }` or `select { ... }` run the block instead of
  # returning a bare enumerator.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end

  ##
  # Keeps `respond_to?` consistent with what `method_missing` forwards.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node_set.respond_to?(method, include_private) || super
  end
end
169
+
170
##
# Initializes the underlying XML library and stores the parsed document in @doc.
#
# An input that already is a Nokogiri::HTML::Document is used as-is. Anything
# else is parsed as HTML, using `options[:encoding]`, else the input's own
# `charset` (when it has one), else 'utf-8'. Note that the chosen encoding is
# written back into `options`.
#
# @param [Object] input a parsed document, or something Nokogiri::HTML.parse accepts
# @param [Hash{Symbol => Object}] options
# @option options [String] :encoding
# @return [void]
def initialize_html(input, options = {})
  require 'nokogiri' unless defined?(::Nokogiri)

  if input.is_a?(::Nokogiri::HTML::Document)
    @doc = input
  else
    # Try to detect charset from input
    if input.respond_to?(:charset)
      options[:encoding] ||= input.charset
    end

    # Otherwise, default is utf-8
    options[:encoding] ||= 'utf-8'

    @doc = ::Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
  end
end
190
+
191
+ # Accessor methods to mask native elements & attributes
192
+
193
##
# Return proxy for document root. The proxy is created on first use and then
# reused.
#
# @return [NodeProxy, nil] nil when there is no document or it has no root
def root
  return nil unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end
198
+
199
##
# Document errors, as reported by the parsed document's own `errors` accessor.
#
# @return [Object] whatever `@doc.errors` returns
def doc_errors
  @doc.errors
end
204
+
205
##
# Find value of document base
#
# If the document has an `html>head>base` element with a usable `href`, that
# href (with any fragment removed) replaces the given base. A base element
# with no `href`, an empty one, or one that is only a fragment leaves the
# given base untouched instead of wiping it out.
#
# @param [String] base Existing base from URI or :base_uri
# @return [String] the effective document base
def doc_base(base)
  # find if the document has a base element
  base_el = @doc.at_css("html>head>base")
  return base unless base_el

  href = base_el.attribute("href").to_s.split("#").first
  href.nil? || href.empty? ? base : href
end
216
+
217
##
# Based on Microdata element.getItems
#
# Collects every element carrying `itemscope`, drops those that also carry
# `itemprop`, and wraps the rest in parent-less proxies.
#
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
# @return [Array<NodeProxy>]
def getItems
  top_level = @doc.css('[itemscope]').reject { |el| el.has_attribute?('itemprop') }
  top_level.map { |el| NodeProxy.new(el) }
end
224
+
225
##
# Look up an element in the document by id
#
# The id is compared literally against each element's `id` attribute rather
# than being spliced into a CSS selector, so ids containing characters that
# are special in selectors (".", ":", spaces, quotes, a leading digit, ...)
# are matched correctly.
#
# @param [#to_s] id
# @return [NodeProxy, nil] proxy for the first matching element, nil when none
def find_element_by_id(id)
  wanted = id.to_s
  match = @doc.css('[id]').find { |el| el['id'] == wanted }
  match && NodeProxy.new(match)
end
230
+ end
231
+ end
232
+ end
@@ -0,0 +1,277 @@
1
+ require 'htmlentities'
2
+
3
+ module RDF::Microdata
4
+ class Reader < RDF::Reader
5
+ ##
6
+ # REXML implementation of an HTML parser.
7
+ #
8
+ # @see http://www.germane-software.com/software/rexml/
9
+ module REXML
10
##
# Identifies the XML library this implementation module is built on.
#
# @return [Symbol] always `:rexml`
def self.library
  :rexml
end
17
+
18
# Proxy class to implement uniform element accessors.
#
# Wraps a REXML node together with the proxy of its parent. Any method not
# defined here is forwarded (arguments and block included) to the wrapped node.
class NodeProxy
  # @return [Object] the wrapped node
  attr_reader :node

  # @return [NodeProxy, nil] the proxy this one was created under, if any
  attr_reader :parent

  ##
  # @param [Object] node the node to wrap
  # @param [NodeProxy, nil] parent proxy of the parent node
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # From HTML5 [3.2.3.3]
  #   If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
  #   on an element, user agents must use the lang attribute in the XML namespace, and the lang
  #   attribute in no namespace must be ignored for the purposes of determining the element's
  #   language.
  #
  # This implementation asks the node for `attribute("lang")` and otherwise
  # inherits from the parent proxy when that parent is an element.
  #
  # @return [String, nil, false] the language, or a falsy value when none is in effect
  def language
    if @node.attribute("lang")
      @node.attribute("lang").to_s
    else
      parent && parent.element? && parent.language
    end
  end

  ##
  # Return xml:base on element, if defined, otherwise the one in effect on
  # the parent chain. The result is memoized; `false` is stored internally to
  # remember "looked, found nothing".
  #
  # @return [String, nil]
  def base
    if @base.nil?
      @base = attributes['xml:base'] ||
        (parent && parent.element? && parent.base) ||
        false
    end

    @base == false ? nil : @base
  end

  ##
  # Human-readable path of node names from the outermost proxy down to this
  # node. Segments are joined with "/" for elements, "@" for attributes and
  # "?" for anything else. Memoized.
  #
  # @return [String]
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      case @node
      when ::REXML::Element   then path.join("/")
      when ::REXML::Attribute then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text (also true when there are no children).
  #
  # @return [Boolean]
  def text_content?
    @node.children.all? { |c| c.is_a?(::REXML::Text) }
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # Only attributes named exactly `xmlns` (stored under a nil key) or
  # `xmlns:prefix` (stored under "prefix") are considered; other attributes
  # that merely start with "xmlns" are ignored.
  #
  # @return [Hash{String => String}] prefix => namespace URI
  def namespaces
    ns_decls = {}
    @node.attributes.each do |name, attr|
      next unless name =~ /\Axmlns(?::(.+))?\z/
      ns_decls[$1] = attr
    end
    ns_decls
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy]
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Elements of this node
  #
  # @return [NodeSetProxy]
  def elements
    NodeSetProxy.new(@node.children.select { |c| c.is_a?(::REXML::Element) }, self)
  end

  ##
  # Inner text of an element: every descendant text node, entity-decoded and
  # concatenated.
  #
  # @see http://apidock.com/ruby/REXML/Element/get_text#743-Get-all-inner-texts
  # @return [String]
  def inner_text
    coder = HTMLEntities.new
    ::REXML::XPath.match(@node, './/text()').map { |e|
      coder.decode(e)
    }.join
  end

  ##
  # Inner markup of an element: the serialization of each child, concatenated.
  #
  # @return [String]
  def inner_html
    @node.children.map(&:to_s).join
  end

  ##
  # Node type accessors
  #
  # @return [Boolean] true when the wrapped node is a REXML::Element
  def element?
    @node.is_a?(::REXML::Element)
  end

  ##
  # @param [String] attr attribute name
  # @return [Boolean] whether the wrapped node has that attribute
  def has_attribute?(attr)
    !!node.attribute(attr)
  end

  ##
  # Proxy for everything else to @node. The block is forwarded as well, so
  # block-taking methods of the wrapped node behave as they do on the node itself.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end

  ##
  # Keeps `respond_to?` consistent with what `method_missing` forwards.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node.respond_to?(method, include_private) || super
  end
end
150
+
151
##
# NodeSet proxy
#
# Wraps a collection of nodes together with the proxy of the node it was
# obtained from, so that members can be handed out as NodeProxy instances
# carrying that parent. Any method not defined here is forwarded (arguments
# and block included) to the wrapped collection, which yields/returns raw nodes.
class NodeSetProxy
  # @return [Object] the wrapped collection of nodes
  attr_reader :node_set

  # @return [NodeProxy, nil] proxy given to every member as its parent
  attr_reader :parent

  ##
  # @param [Object] node_set the collection to wrap
  # @param [NodeProxy, nil] parent proxy of the owning node
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  # @return [Object] whatever the wrapped collection's `each` returns, or an
  #   enumerator over the same proxies when no block is given
  def each
    return enum_for(:each) unless block_given?

    @node_set.each do |c|
      yield NodeProxy.new(c, parent)
    end
  end

  ##
  # Return proxy for first element and remove it
  #
  # @return [NodeProxy, nil] nil (or false) when the wrapped collection yields nothing
  def shift
    (e = node_set.shift) && NodeProxy.new(e, parent)
  end

  ##
  # Add NodeSetProxys
  #
  # The wrapped collection is cloned first, so neither operand is modified.
  #
  # @param [NodeSetProxy] other
  # @return [NodeSetProxy] a new proxy over both sets, keeping this proxy's parent
  def +(other)
    combined = node_set.clone
    other.node_set.each { |n| combined << n }
    NodeSetProxy.new(combined, parent)
  end

  ##
  # Add a NodeProxy (unwrapped first) or a raw node to the wrapped collection.
  #
  # @param [NodeProxy, Object] elem
  # @return [NodeSetProxy] self, so calls can be chained
  def <<(elem)
    node_set << (elem.is_a?(NodeProxy) ? elem.node : elem)
    self
  end

  ##
  # @return [String] inspection of the display paths of all members
  def inspect
    @node_set.map { |c| NodeProxy.new(c, parent).display_path }.inspect
  end

  ##
  # Proxy for everything else to @node_set. The block is forwarded too, so
  # calls such as `map { ... }` or `select { ... }` run the block instead of
  # returning a bare enumerator.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end

  ##
  # Keeps `respond_to?` consistent with what `method_missing` forwards.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node_set.respond_to?(method, include_private) || super
  end
end
209
+
210
##
# Initializes the underlying XML library and stores the parsed document in @doc.
#
# An input that already is a REXML::Document is used as-is. Anything else is
# read (via `read` when available, else `to_s`) and parsed as XML; in that
# case `options[:encoding]` is filled in (from the input's `charset` when it
# has one, else 'utf-8') and @base_uri is set from `base_uri`.
#
# @param [Object] input a parsed document, an IO-like object, or a string
# @param [Hash{Symbol => Object}] options
# @option options [String] :encoding
# @return [void]
def initialize_html(input, options = {})
  require 'rexml/document' unless defined?(::REXML)
  @doc = case input
  when ::REXML::Document
    input
  else
    # Try to detect charset from input
    options[:encoding] ||= input.charset if input.respond_to?(:charset)

    # Otherwise, default is utf-8
    options[:encoding] ||= 'utf-8'

    # Remember the base URI as a string, if defined
    @base_uri = base_uri ? base_uri.to_s : nil

    # Only parse as XML, no HTML mode
    ::REXML::Document.new(input.respond_to?(:read) ? input.read : input.to_s)
  end
end
234
+
235
+ # Accessor methods to mask native elements & attributes
236
+
237
##
# Return proxy for document root. The proxy is created on first use and then
# reused.
#
# @return [NodeProxy, nil] nil when there is no document or it has no root
def root
  return nil unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end
242
+
243
##
# Document errors. This implementation collects none, so the list is always
# empty.
#
# @return [Array] an empty array
def doc_errors
  []
end
248
+
249
##
# Find value of document base
#
# If the document has a `/html/head/base` element with a usable `href`, that
# href (with any fragment removed) replaces the given base. A base element
# with no `href`, an empty one, or one that is only a fragment leaves the
# given base untouched. When no base results, @base_uri is used.
#
# @param [String] base Existing base from URI or :base_uri
# @return [String, nil] the effective document base
def doc_base(base)
  # find if the document has a base element
  base_el = ::REXML::XPath.first(@doc, "/html/head/base")
  if base_el
    href = base_el.attribute("href").to_s.split("#").first
    base = href unless href.nil? || href.empty?
  end

  base || @base_uri
end
261
+
262
##
# Based on Microdata element.getItems
#
# Collects every node matched by the `itemscope` XPath query, drops those
# that also have an `itemprop` attribute, and wraps the rest in parent-less
# proxies.
#
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
# @return [Array<NodeProxy>]
def getItems
  scoped = ::REXML::XPath.match(@doc, "//[@itemscope]")
  top_level = scoped.reject { |el| el.attribute('itemprop') }
  top_level.map { |el| NodeProxy.new(el) }
end
269
+
270
##
# Look up an element in the document by id
#
# The id is compared literally against the value of each candidate's `id`
# attribute rather than being spliced into an XPath string literal, so ids
# containing quotes or other XPath syntax cannot break or alter the query.
#
# @param [#to_s] id
# @return [NodeProxy, nil] proxy for the first matching element, nil when none
def find_element_by_id(id)
  wanted = id.to_s
  match = ::REXML::XPath.match(@doc, "//[@id]").find do |el|
    id_attr = el.attribute('id')
    id_attr && id_attr.value == wanted
  end
  match && NodeProxy.new(match)
end
275
+ end
276
+ end
277
+ end