rdf-microdata 0.2.2 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +21 -45
- data/VERSION +1 -1
- data/etc/doap.html +42 -0
- data/etc/registry.json +39 -0
- data/lib/rdf/microdata.rb +0 -2
- data/lib/rdf/microdata/reader.rb +316 -193
- data/lib/rdf/microdata/reader/nokogiri.rb +232 -0
- data/lib/rdf/microdata/reader/rexml.rb +277 -0
- data/lib/rdf/microdata/vocab.rb +1 -1
- metadata +58 -21
- data/lib/rdf/microdata/extensions.rb +0 -34
@@ -0,0 +1,232 @@
|
|
1
|
+
module RDF::Microdata
|
2
|
+
class Reader < RDF::Reader
|
3
|
+
##
|
4
|
+
# Nokogiri implementation of an HTML parser.
|
5
|
+
#
|
6
|
+
# @see http://nokogiri.org/
|
7
|
+
module Nokogiri
|
8
|
+
##
# Identifies which XML library backs this parser implementation.
#
# @return [Symbol] always `:nokogiri`
def self.library
  :nokogiri
end
|
15
|
+
|
16
|
+
# Proxy class to implement uniform element accessors
#
# Pairs a native Nokogiri node with the proxy of its parent so that inherited
# values (language, xml:base) can be resolved by walking up the proxy chain.
# Any message not defined here is forwarded to the wrapped node.
class NodeProxy
  # @return [Nokogiri::XML::Node] the wrapped native node
  attr_reader :node
  # @return [NodeProxy, nil] proxy of the enclosing node, if one was given
  attr_reader :parent

  ##
  # @param [Nokogiri::XML::Node] node native node to wrap
  # @param [NodeProxy] parent proxy of the enclosing node
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # From HTML5 [3.2.3.3]
  #   If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
  #   on an element, user agents must use the lang attribute in the XML namespace, and the lang
  #   attribute in no namespace must be ignored for the purposes of determining the element's
  #   language.
  #
  # Falls back to the language of the parent proxy when the node carries no
  # language attribute of its own.
  #
  # @return [String, nil, false] the language, or a falsy value when none applies
  def language
    case
    when @node.document.is_a?(::Nokogiri::HTML::Document) && @node.attributes["xml:lang"]
      @node.attributes["xml:lang"].to_s
    when @node.document.is_a?(::Nokogiri::HTML::Document) && @node.attributes["lang"]
      @node.attributes["lang"].to_s
    when @node.attribute("lang")
      @node.attribute("lang").to_s
    else
      parent && parent.element? && parent.language
    end
  end

  ##
  # Get any xml:base in effect for this element
  #
  # The lookup result is memoized; `false` is stored internally to record
  # "looked up, nothing found" so the parent chain is not walked again.
  #
  # @return [Object, nil] the xml:base attribute of this node or of the nearest
  #   ancestor proxy that has one, otherwise nil
  def base
    if @base.nil?
      @base = attributes['xml:base'] ||
        (parent && parent.element? && parent.base) ||
        false
    end

    @base == false ? nil : @base
  end

  ##
  # Readable path to this node, built from the parent's display path and this
  # node's name. Elements are joined with "/", attributes with "@" and any
  # other node type with "?". The result is memoized.
  #
  # @return [String]
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      case @node
      when ::Nokogiri::XML::Element then path.join("/")
      when ::Nokogiri::XML::Attr    then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text
  #
  # @return [Boolean]
  def text_content?
    @node.children.all? {|c| c.text?}
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # @return [Hash{String => String}] namespace prefix to namespace href
  def namespaces
    @node.namespace_definitions.inject({}) {|memo, ns| memo[ns.prefix] = ns.href.to_s; memo }
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy]
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Elements of this node
  #
  # @return [NodeSetProxy]
  def elements
    NodeSetProxy.new(@node.elements, self)
  end

  ##
  # Proxy for everything else to @node
  #
  # The caller's block is forwarded as well, so block-taking methods of the
  # wrapped node behave the same through the proxy.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end

  ##
  # Keep `respond_to?` consistent with the delegation done by #method_missing.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node.respond_to?(method, include_private) || super
  end
end
|
112
|
+
|
113
|
+
##
# NodeSet proxy
#
# Wraps a native node set together with the proxy of the node the set was
# obtained from, and hands out NodeProxy instances whose parent is that
# proxy. Any message not defined here is forwarded to the wrapped node set.
class NodeSetProxy
  # @return [Object] the wrapped native node set
  attr_reader :node_set
  # @return [NodeProxy, nil] proxy used as parent for every node handed out
  attr_reader :parent

  ##
  # @param [Object] node_set native node set to wrap
  # @param [NodeProxy] parent proxy of the node owning the set
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # Without a block an Enumerator over the proxies is returned instead of
  # raising LocalJumpError.
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  def each
    return enum_for(:each) unless block_given?

    @node_set.each do |c|
      yield NodeProxy.new(c, parent)
    end
  end

  ##
  # Return proxy for first element and remove it
  #
  # @return [NodeProxy, nil] nil when the underlying set yields nothing
  def shift
    (e = node_set.shift) && NodeProxy.new(e, parent)
  end

  ##
  # Add NodeSetProxys
  #
  # @param [NodeSetProxy] other anything exposing a compatible `node_set`
  # @return [NodeSetProxy] new proxy sharing this proxy's parent
  def +(other)
    NodeSetProxy.new(self.node_set + other.node_set, parent)
  end

  ##
  # Add a NodeProxy
  #
  # A NodeProxy is unwrapped first, so only native nodes are stored.
  #
  # @param [NodeProxy, Nokogiri::XML::Node] elem
  # @return [NodeSetProxy] self
  def <<(elem)
    node_set << (elem.is_a?(NodeProxy) ? elem.node : elem)
    self
  end

  ##
  # @return [String] inspected list of the display paths of the contained nodes
  def inspect
    @node_set.map {|c| NodeProxy.new(c, parent).display_path}.inspect
  end

  ##
  # Proxy for everything else to @node_set
  #
  # The caller's block is forwarded as well; without that, block-taking
  # methods such as `select` or `map` would silently ignore the block.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end

  ##
  # Keep `respond_to?` consistent with the delegation done by #method_missing.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node_set.respond_to?(method, include_private) || super
  end
end
|
169
|
+
|
170
|
+
##
# Initializes the underlying XML library.
#
# Loads Nokogiri on first use and stores the document in `@doc`. An input
# that already is a Nokogiri HTML document is used as is; anything else is
# handed to the Nokogiri HTML parser together with the reader's base URI and
# the resolved encoding.
#
# @param [Nokogiri::HTML::Document, Object] input document or parser input
# @param [Hash{Symbol => Object}] options
#   `:encoding` is filled in (from `input.charset` when available, otherwise
#   'utf-8') if the caller did not supply it
# @return [void]
def initialize_html(input, options = {})
  require 'nokogiri' unless defined?(::Nokogiri)

  if input.is_a?(::Nokogiri::HTML::Document)
    @doc = input
  else
    # Try to detect charset from input
    if input.respond_to?(:charset)
      options[:encoding] ||= input.charset
    end

    # Otherwise, default is utf-8
    options[:encoding] ||= 'utf-8'

    @doc = ::Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
  end
end
|
190
|
+
|
191
|
+
# Accessor methods to mask native elements & attributes
|
192
|
+
|
193
|
+
##
# Return proxy for document root
#
# The proxy is created once and reused on later calls.
#
# @return [NodeProxy, nil] nil when there is no document or it has no root
def root
  return unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end
|
198
|
+
|
199
|
+
##
# Document errors
#
# Returns an empty list when no document has been set, mirroring how #root
# tolerates a missing document.
#
# @return [Array] errors reported by the parsed document
def doc_errors
  @doc ? @doc.errors : []
end
|
204
|
+
|
205
|
+
##
# Find value of document base
#
# Uses the href of the document's base element (html > head > base) with any
# fragment removed. When there is no base element, or its href is missing or
# reduces to an empty string, the supplied base is returned unchanged rather
# than being replaced by nil or "".
#
# @param [String] base Existing base from URI or :base_uri
# @return [String]
def doc_base(base)
  # find if the document has a base element
  base_el = @doc.at_css("html>head>base")
  href = base_el && base_el.attribute("href").to_s.split("#").first
  href.nil? || href.empty? ? base : href
end
|
216
|
+
|
217
|
+
##
# Based on Microdata element.getItems
#
# Collects every element carrying an `itemscope` attribute, drops those that
# also carry `itemprop`, and wraps what remains in parentless proxies.
#
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
# @return [Array<NodeProxy>]
def getItems
  top_level = @doc.css('[itemscope]').reject { |el| el.has_attribute?('itemprop') }
  top_level.map { |item| NodeProxy.new(item) }
end
|
224
|
+
|
225
|
+
##
# Look up an element in the document by id
#
# The id is compared as an XPath string literal instead of being spliced into
# a CSS selector, so ids containing selector syntax (".", ":", spaces, a
# leading digit, ...) are matched literally.
#
# @param [#to_s] id value of the id attribute to look for
# @return [NodeProxy, nil] proxy for the first matching element, if any
def find_element_by_id(id)
  id = id.to_s
  # XPath 1.0 literals have no escape syntax: pick a quote character the id
  # does not contain, or assemble the value with concat() when it has both.
  literal =
    if !id.include?("'")
      "'#{id}'"
    elsif !id.include?('"')
      "\"#{id}\""
    else
      pieces = id.split("'", -1).map { |piece| "'#{piece}'" }
      "concat(#{pieces.join(%q{, "'", })})"
    end

  (e = @doc.at_xpath("//*[@id=#{literal}]")) && NodeProxy.new(e)
end
|
230
|
+
end
|
231
|
+
end
|
232
|
+
end
|
@@ -0,0 +1,277 @@
|
|
1
|
+
require 'htmlentities'
|
2
|
+
|
3
|
+
module RDF::Microdata
|
4
|
+
class Reader < RDF::Reader
|
5
|
+
##
|
6
|
+
# REXML implementation of an HTML parser.
|
7
|
+
#
|
8
|
+
# @see http://www.germane-software.com/software/rexml/
|
9
|
+
module REXML
|
10
|
+
##
# Identifies which XML library backs this parser implementation.
#
# @return [Symbol] always `:rexml`
def self.library
  :rexml
end
|
17
|
+
|
18
|
+
# Proxy class to implement uniform element accessors
#
# Pairs a native REXML node with the proxy of its parent so that inherited
# values (language, xml:base) can be resolved by walking up the proxy chain.
# Any message not defined here is forwarded to the wrapped node.
class NodeProxy
  # @return [Object] the wrapped native node
  attr_reader :node
  # @return [NodeProxy, nil] proxy of the enclosing node, if one was given
  attr_reader :parent

  ##
  # @param [Object] node native node to wrap
  # @param [NodeProxy] parent proxy of the enclosing node
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # From HTML5 [3.2.3.3]
  #   If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
  #   on an element, user agents must use the lang attribute in the XML namespace, and the lang
  #   attribute in no namespace must be ignored for the purposes of determining the element's
  #   language.
  #
  # `xml:lang` is therefore consulted before `lang`; when neither is present
  # the language of the parent proxy is used.
  #
  # @return [String, nil, false] the language, or a falsy value when none applies
  def language
    lang = @node.attribute("xml:lang") || @node.attribute("lang")
    return lang.to_s if lang

    parent && parent.element? && parent.language
  end

  ##
  # Return xml:base on element, if defined
  #
  # The lookup result is memoized; `false` is stored internally to record
  # "looked up, nothing found" so the parent chain is not walked again.
  #
  # @return [String, nil] xml:base of this node or of the nearest ancestor
  #   proxy that has one, otherwise nil
  def base
    if @base.nil?
      @base = attributes['xml:base'] ||
        (parent && parent.element? && parent.base) ||
        false
    end

    @base == false ? nil : @base
  end

  ##
  # Readable path to this node, built from the parent's display path and this
  # node's name. Elements are joined with "/", attributes with "@" and any
  # other node type with "?". The result is memoized.
  #
  # @return [String]
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      case @node
      when ::REXML::Element   then path.join("/")
      when ::REXML::Attribute then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text
  #
  # @return [Boolean]
  def text_content?
    @node.children.all? {|c| c.is_a?(::REXML::Text)}
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # Only attributes named exactly `xmlns` or `xmlns:prefix` count; the
  # default namespace is stored under the `nil` key.
  #
  # @return [Hash{String => String}]
  def namespaces
    ns_decls = {}
    @node.attributes.each do |name, attr|
      # Anchor both ends so that e.g. "xmlnsfoo" is not taken for "xmlns".
      decl = /\Axmlns(?::(.+))?\z/.match(name)
      next unless decl
      ns_decls[decl[1]] = attr
    end
    ns_decls
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy]
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Elements of this node
  #
  # @return [NodeSetProxy]
  def elements
    NodeSetProxy.new(@node.children.select {|c| c.is_a?(::REXML::Element)}, self)
  end

  ##
  # Inner text of an element
  #
  # Concatenates every descendant text node after decoding HTML entities.
  #
  # @see http://apidock.com/ruby/REXML/Element/get_text#743-Get-all-inner-texts
  # @return [String]
  def inner_text
    coder = HTMLEntities.new
    ::REXML::XPath.match(@node,'.//text()').map { |e|
      coder.decode(e)
    }.join
  end

  ##
  # Inner markup of an element
  #
  # Concatenates the string form of every child node.
  #
  # @return [String]
  def inner_html
    @node.children.map(&:to_s).join
  end

  ##
  # Node type accessors
  #
  # @return [Boolean] true when the wrapped node is a REXML element
  def element?
    @node.is_a?(::REXML::Element)
  end

  ##
  # @param [String] attr attribute name
  # @return [Boolean] true when the wrapped node has the named attribute
  def has_attribute?(attr)
    !!node.attribute(attr)
  end

  ##
  # Proxy for everything else to @node
  #
  # The caller's block is forwarded as well, so block-taking methods of the
  # wrapped node behave the same through the proxy.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end

  ##
  # Keep `respond_to?` consistent with the delegation done by #method_missing.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node.respond_to?(method, include_private) || super
  end
end
|
150
|
+
|
151
|
+
##
# NodeSet proxy
#
# Wraps a list of native nodes together with the proxy of the node the list
# was obtained from, and hands out NodeProxy instances whose parent is that
# proxy. Any message not defined here is forwarded to the wrapped list.
class NodeSetProxy
  # @return [Object] the wrapped list of native nodes
  attr_reader :node_set
  # @return [NodeProxy, nil] proxy used as parent for every node handed out
  attr_reader :parent

  ##
  # @param [Object] node_set native nodes to wrap
  # @param [NodeProxy] parent proxy of the node owning the list
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # Without a block an Enumerator over the proxies is returned instead of
  # raising LocalJumpError.
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  def each
    return enum_for(:each) unless block_given?

    @node_set.each do |c|
      yield NodeProxy.new(c, parent)
    end
  end

  ##
  # Return proxy for first element and remove it
  #
  # @return [NodeProxy, nil] nil when the underlying list yields nothing
  def shift
    (e = node_set.shift) && NodeProxy.new(e, parent)
  end

  ##
  # Add NodeSetProxys
  #
  # Works on a copy, so neither operand's node list is modified.
  #
  # @param [NodeSetProxy] other anything exposing an enumerable `node_set`
  # @return [NodeSetProxy] new proxy sharing this proxy's parent
  def +(other)
    new_ns = node_set.clone
    other.node_set.each {|n| new_ns << n}
    NodeSetProxy.new(new_ns, parent)
  end

  ##
  # Add a NodeProxy
  #
  # A NodeProxy is unwrapped first, so only native nodes are stored.
  #
  # @param [NodeProxy, Object] elem proxy or native node
  # @return [NodeSetProxy] self
  def <<(elem)
    node_set << (elem.is_a?(NodeProxy) ? elem.node : elem)
    self
  end

  ##
  # @return [String] inspected list of the display paths of the contained nodes
  def inspect
    @node_set.map {|c| NodeProxy.new(c, parent).display_path}.inspect
  end

  ##
  # Proxy for everything else to @node_set
  #
  # The caller's block is forwarded as well; without that, block-taking
  # methods such as `select` or `map` would silently ignore the block.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end

  ##
  # Keep `respond_to?` consistent with the delegation done by #method_missing.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node_set.respond_to?(method, include_private) || super
  end
end
|
209
|
+
|
210
|
+
##
# Initializes the underlying XML library.
#
# Loads REXML on first use and stores the document in `@doc`. An input that
# already is a REXML document is used as is; anything else is read (via
# `read` when available, otherwise `to_s`) and parsed as XML. In that case
# the reader's base URI is also remembered in `@base_uri`.
#
# @param [REXML::Document, #read, #to_s] input document or parser input
# @param [Hash{Symbol => Object}] options
#   `:encoding` is filled in (from `input.charset` when available, otherwise
#   'utf-8') if the caller did not supply it
# @return [void]
def initialize_html(input, options = {})
  require 'rexml/document' unless defined?(::REXML)

  if input.is_a?(::REXML::Document)
    @doc = input
  else
    # Try to detect charset from input
    if input.respond_to?(:charset)
      options[:encoding] ||= input.charset
    end

    # Otherwise, default is utf-8
    options[:encoding] ||= 'utf-8'

    # Set xml:base for the document element, if defined
    @base_uri = base_uri ? base_uri.to_s : nil

    # Only parse as XML, no HTML mode
    markup = input.respond_to?(:read) ? input.read : input.to_s
    @doc = ::REXML::Document.new(markup)
  end
end
|
234
|
+
|
235
|
+
# Accessor methods to mask native elements & attributes
|
236
|
+
|
237
|
+
##
# Return proxy for document root
#
# The proxy is created once and reused on later calls.
#
# @return [NodeProxy, nil] nil when there is no document or it has no root
def root
  return unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end
|
242
|
+
|
243
|
+
##
# Document errors
#
# This implementation reports none: the result is always a new empty list.
#
# @return [Array]
def doc_errors
  []
end
|
248
|
+
|
249
|
+
##
# Find value of document base
#
# Uses the href of the document's base element (/html/head/base) with any
# fragment removed. When there is no base element, or its href is missing or
# reduces to an empty string, the supplied base is kept instead of being
# discarded. If no base is available at all, `@base_uri` is returned.
#
# @param [String] base Existing base from URI or :base_uri
# @return [String]
def doc_base(base)
  # find if the document has a base element
  base_el = ::REXML::XPath.first(@doc, "/html/head/base")
  href = base_el && base_el.attribute("href").to_s.split("#").first
  base = href unless href.nil? || href.empty?

  base || @base_uri
end
|
261
|
+
|
262
|
+
##
# Based on Microdata element.getItems
#
# Collects every element carrying an `itemscope` attribute, drops those that
# also carry `itemprop`, and wraps what remains in parentless proxies.
#
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
# @return [Array<NodeProxy>]
def getItems
  # "//*[...]" is the well-formed XPath spelling; a step needs a node test
  # before its predicate.
  scoped = ::REXML::XPath.match(@doc, "//*[@itemscope]")
  scoped.reject { |el| el.attribute('itemprop') }.map { |n| NodeProxy.new(n) }
end
|
269
|
+
|
270
|
+
##
# Look up an element in the document by id
#
# The id is embedded as a properly quoted XPath string literal, so ids that
# contain quote characters neither break the query nor alter its meaning.
#
# @param [#to_s] id value of the id attribute to look for
# @return [NodeProxy, nil] proxy for the first matching element, if any
def find_element_by_id(id)
  id = id.to_s
  # XPath 1.0 literals have no escape syntax: pick a quote character the id
  # does not contain, or assemble the value with concat() when it has both.
  literal =
    if !id.include?("'")
      "'#{id}'"
    elsif !id.include?('"')
      "\"#{id}\""
    else
      pieces = id.split("'", -1).map { |piece| "'#{piece}'" }
      "concat(#{pieces.join(%q{, "'", })})"
    end

  (e = ::REXML::XPath.first(@doc, "//*[@id=#{literal}]")) && NodeProxy.new(e)
end
|
275
|
+
end
|
276
|
+
end
|
277
|
+
end
|