rdf-microdata 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README +21 -45
- data/VERSION +1 -1
- data/etc/doap.html +42 -0
- data/etc/registry.json +39 -0
- data/lib/rdf/microdata.rb +0 -2
- data/lib/rdf/microdata/reader.rb +316 -193
- data/lib/rdf/microdata/reader/nokogiri.rb +232 -0
- data/lib/rdf/microdata/reader/rexml.rb +277 -0
- data/lib/rdf/microdata/vocab.rb +1 -1
- metadata +58 -21
- data/lib/rdf/microdata/extensions.rb +0 -34
@@ -0,0 +1,232 @@
|
|
1
|
+
module RDF::Microdata
|
2
|
+
class Reader < RDF::Reader
|
3
|
+
##
|
4
|
+
# Nokogiri implementation of an HTML parser.
|
5
|
+
#
|
6
|
+
# @see http://nokogiri.org/
|
7
|
+
module Nokogiri
|
8
|
+
##
# Identifies the HTML/XML backend this implementation module wraps.
#
# @return [Symbol] always `:nokogiri`
def self.library; :nokogiri; end
|
15
|
+
|
16
|
+
# Proxy class to implement uniform element accessors
#
# Pairs a native Nokogiri node with the proxy of its parent so inherited
# values (language, base) can be resolved by walking up the proxy chain.
# Any message not defined here is forwarded to the wrapped node.
class NodeProxy
  attr_reader :node
  attr_reader :parent

  ##
  # @param [Nokogiri::XML::Node] node native node being wrapped
  # @param [NodeProxy, nil] parent proxy of the containing node, if known
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # From HTML5 [3.2.3.3]
  #   If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
  #   on an element, user agents must use the lang attribute in the XML namespace, and the lang
  #   attribute in no namespace must be ignored for the purposes of determining the element's
  #   language.
  #
  # For nodes of an HTML document, xml:lang is consulted before lang; for any
  # other document only lang is consulted. Without a local value the parent
  # proxy is asked.
  #
  # @return [String, nil, false] language tag; a falsy value when none is in scope
  def language
    html = @node.document.is_a?(::Nokogiri::HTML::Document)
    if html && @node.attributes["xml:lang"]
      @node.attributes["xml:lang"].to_s
    elsif html && @node.attributes["lang"]
      @node.attributes["lang"].to_s
    elsif @node.attribute("lang")
      @node.attribute("lang").to_s
    else
      parent && parent.element? && parent.language
    end
  end

  ##
  # Get any xml:base in effect for this element
  #
  # The lookup result is memoized; `false` is stored when nothing was found so
  # that the ancestors are not walked again on the next call.
  #
  # @return [Object, nil] the nearest xml:base value, or nil when there is none
  def base
    if @base.nil?
      @base = @node.attributes['xml:base'] ||
              (parent && parent.element? && parent.base) ||
              false
    end

    @base || nil
  end

  ##
  # Human-readable path of node names from the outermost known proxy down to
  # this node, used for diagnostics.
  #
  # @return [String]
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      # The separator reflects the kind of the wrapped node
      case @node
      when ::Nokogiri::XML::Element then path.join("/")
      when ::Nokogiri::XML::Attr    then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text
  #
  # @return [Boolean]
  def text_content?
    @node.children.all? { |c| c.text? }
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # @return [Hash{String => String}] prefix => namespace URI
  def namespaces
    @node.namespace_definitions.inject({}) do |memo, ns|
      memo[ns.prefix] = ns.href.to_s
      memo
    end
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy]
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Elements of this node
  #
  # @return [NodeSetProxy]
  def elements
    NodeSetProxy.new(@node.elements, self)
  end

  ##
  # Proxy for everything else to @node
  #
  # The caller's block is forwarded as well, so iterator-style methods of the
  # wrapped node behave the same when invoked through the proxy.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end
end
|
112
|
+
|
113
|
+
##
# NodeSet proxy
#
# Wraps a native collection of nodes together with the proxy of the node that
# owns them, handing out NodeProxy instances for individual members. Any
# message not defined here is forwarded to the wrapped collection.
class NodeSetProxy
  attr_reader :node_set
  attr_reader :parent

  ##
  # @param [#each] node_set native collection of nodes
  # @param [NodeProxy, nil] parent proxy passed to every member proxy created
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  # @return [Enumerator] when called without a block
  def each
    return enum_for(:each) unless block_given?

    @node_set.each { |c| yield NodeProxy.new(c, parent) }
  end

  ##
  # Return proxy for first element and remove it
  #
  # @return [NodeProxy, nil] nil (or false) when the set has nothing to shift
  def shift
    (e = node_set.shift) && NodeProxy.new(e, parent)
  end

  ##
  # Add NodeSetProxys
  #
  # @param [NodeSetProxy] other set whose nodes are appended
  # @return [NodeSetProxy] a new proxy that keeps this set's parent
  def +(other)
    NodeSetProxy.new(node_set + other.node_set, parent)
  end

  ##
  # Add a NodeProxy
  #
  # @param [NodeProxy, Nokogiri::XML::Node] elem a proxy is unwrapped first
  # @return [NodeSetProxy] self, so appends can be chained
  def <<(elem)
    node_set << (elem.is_a?(NodeProxy) ? elem.node : elem)
    self
  end

  ##
  # @return [String] the display paths of all members
  def inspect
    @node_set.map { |c| NodeProxy.new(c, parent).display_path }.inspect
  end

  ##
  # Proxy for everything else to @node_set
  #
  # The caller's block is forwarded as well; without it, calls such as
  # `select { ... }` would reach the wrapped set block-less.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end
end
|
169
|
+
|
170
|
+
##
# Initializes the underlying XML library.
#
# An already-parsed Nokogiri HTML document is adopted as-is. Anything else is
# parsed with `Nokogiri::HTML.parse`, using `base_uri` as the document URL and
# an encoding taken from, in order: `options[:encoding]`, `input.charset`
# (when the input offers one), or 'utf-8'. The chosen encoding is written back
# into `options`.
#
# @param [Nokogiri::HTML::Document, Object] input parsed document or source to parse
# @param [Hash{Symbol => Object}] options
# @option options [String] :encoding encoding to parse with
# @return [Object] the document stored in @doc
def initialize_html(input, options = {})
  # Loaded lazily so the dependency is only needed when this backend is used
  require 'nokogiri' unless defined?(::Nokogiri)

  @doc =
    if input.is_a?(::Nokogiri::HTML::Document)
      input
    else
      # Try to detect charset from input
      if input.respond_to?(:charset)
        options[:encoding] ||= input.charset
      end

      # Otherwise, default is utf-8
      options[:encoding] ||= 'utf-8'

      ::Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
    end
end
|
190
|
+
|
191
|
+
# Accessor methods to mask native elements & attributes
|
192
|
+
|
193
|
+
##
# Return proxy for document root
#
# The proxy is created on first use and reused afterwards.
#
# @return [NodeProxy, nil] nil when there is no document or it has no root
def root
  return nil unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end
|
198
|
+
|
199
|
+
##
# Document errors
#
# @return [Object] whatever the parsed document reports from `#errors`
def doc_errors; @doc.errors; end
|
204
|
+
|
205
|
+
##
# Find value of document base
#
# A `<base>` element in the document head overrides the supplied base, with
# any fragment identifier removed from its href. A `<base>` element that
# carries no usable href (attribute missing, empty, or fragment-only) leaves
# the supplied base untouched instead of replacing it with nil or "".
#
# @param [String] base Existing base from URI or :base_uri
# @return [String]
def doc_base(base)
  # find if the document has a base element
  base_el = @doc.at_css("html>head>base")
  return base unless base_el

  # Drop the fragment; split yields nil for "" and "" for "#frag"
  href = base_el.attribute("href").to_s.split("#").first
  href.nil? || href.empty? ? base : href
end
|
216
|
+
|
217
|
+
##
# Based on Microdata element.getItems
#
# Collects every element carrying `itemscope`, discards those that also carry
# `itemprop`, and wraps the remainder in parent-less proxies.
#
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
# @return [Array<NodeProxy>]
def getItems
  top_level = @doc.css('[itemscope]').reject { |el| el.has_attribute?('itemprop') }
  top_level.map { |el| NodeProxy.new(el) }
end
|
224
|
+
|
225
|
+
##
# Look up an element in the document by id
#
# The id is compared as a plain string against each element's `id` attribute
# rather than being spliced into a CSS selector, so ids containing selector
# syntax (".", ":", a leading digit, quotes, ...) are matched literally.
#
# @param [#to_s] id value of the id attribute to look for
# @return [NodeProxy, nil] proxy for the first matching element, if any
def find_element_by_id(id)
  wanted = id.to_s
  e = @doc.css('[id]').find { |el| el['id'] == wanted }
  e && NodeProxy.new(e)
end
|
230
|
+
end
|
231
|
+
end
|
232
|
+
end
|
@@ -0,0 +1,277 @@
|
|
1
|
+
require 'htmlentities'
|
2
|
+
|
3
|
+
module RDF::Microdata
|
4
|
+
class Reader < RDF::Reader
|
5
|
+
##
|
6
|
+
# REXML implementation of an HTML parser.
|
7
|
+
#
|
8
|
+
# @see http://www.germane-software.com/software/rexml/
|
9
|
+
module REXML
|
10
|
+
##
# Identifies the HTML/XML backend this implementation module wraps.
#
# @return [Symbol] always `:rexml`
def self.library; :rexml; end
|
17
|
+
|
18
|
+
# Proxy class to implement uniform element accessors
#
# Pairs a native REXML node with the proxy of its parent so inherited values
# (language, base) can be resolved by walking up the proxy chain. Any message
# not defined here is forwarded to the wrapped node.
class NodeProxy
  attr_reader :node
  attr_reader :parent

  ##
  # @param [REXML::Node] node native node being wrapped
  # @param [NodeProxy, nil] parent proxy of the containing node, if known
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # From HTML5 [3.2.3.3]
  #   If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
  #   on an element, user agents must use the lang attribute in the XML namespace, and the lang
  #   attribute in no namespace must be ignored for the purposes of determining the element's
  #   language.
  #
  # This implementation looks up the attribute named "lang" on the node and
  # otherwise asks the parent proxy.
  #
  # @return [String, nil, false] language tag; a falsy value when none is in scope
  def language
    if @node.attribute("lang")
      @node.attribute("lang").to_s
    else
      parent && parent.element? && parent.language
    end
  end

  ##
  # Return xml:base on element, if defined
  #
  # Falls back to the parent proxy's base. The lookup result is memoized;
  # `false` is stored when nothing was found so the ancestors are not walked
  # again on the next call.
  #
  # @return [String, nil]
  def base
    if @base.nil?
      @base = @node.attributes['xml:base'] ||
              (parent && parent.element? && parent.base) ||
              false
    end

    @base || nil
  end

  ##
  # Human-readable path of node names from the outermost known proxy down to
  # this node, used for diagnostics.
  #
  # @return [String]
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      # The separator reflects the kind of the wrapped node
      case @node
      when ::REXML::Element   then path.join("/")
      when ::REXML::Attribute then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text
  #
  # @return [Boolean]
  def text_content?
    @node.children.all? { |c| c.is_a?(::REXML::Text) }
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # Only attributes named exactly `xmlns` (stored under a nil prefix) or
  # `xmlns:<prefix>` are treated as declarations.
  #
  # @return [Hash{String => String}] prefix => namespace URI
  def namespaces
    ns_decls = {}
    @node.attributes.each do |name, attr|
      # Anchored so that e.g. "xmlnsfoo" is not mistaken for a declaration
      decl = /\Axmlns(?::(.+))?\z/.match(name)
      next unless decl
      ns_decls[decl[1]] = attr
    end
    ns_decls
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy]
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Elements of this node
  #
  # @return [NodeSetProxy]
  def elements
    NodeSetProxy.new(@node.children.select { |c| c.is_a?(::REXML::Element) }, self)
  end

  ##
  # Inner text of an element
  #
  # Concatenates every descendant text node after decoding HTML entities.
  #
  # @see http://apidock.com/ruby/REXML/Element/get_text#743-Get-all-inner-texts
  # @return [String]
  def inner_text
    coder = HTMLEntities.new
    ::REXML::XPath.match(@node, './/text()').map { |e| coder.decode(e) }.join
  end

  ##
  # Inner markup of an element
  #
  # Concatenates the serialized form (`to_s`) of every child node.
  #
  # @return [String]
  def inner_html
    @node.children.map(&:to_s).join
  end

  ##
  # Node type accessors
  #
  # @return [Boolean] true when the wrapped node is a REXML element
  def element?
    @node.is_a?(::REXML::Element)
  end

  ##
  # @param [String] attr attribute name
  # @return [Boolean] true when the wrapped node has that attribute
  def has_attribute?(attr)
    !!node.attribute(attr)
  end

  ##
  # Proxy for everything else to @node
  #
  # The caller's block is forwarded as well, so iterator-style methods of the
  # wrapped node behave the same when invoked through the proxy.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end
end
|
150
|
+
|
151
|
+
##
# NodeSet proxy
#
# Wraps a native collection of nodes together with the proxy of the node that
# owns them, handing out NodeProxy instances for individual members. Any
# message not defined here is forwarded to the wrapped collection.
class NodeSetProxy
  attr_reader :node_set
  attr_reader :parent

  ##
  # @param [#each] node_set native collection of nodes
  # @param [NodeProxy, nil] parent proxy passed to every member proxy created
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  # @return [Enumerator] when called without a block
  def each
    return enum_for(:each) unless block_given?

    @node_set.each { |c| yield NodeProxy.new(c, parent) }
  end

  ##
  # Return proxy for first element and remove it
  #
  # @return [NodeProxy, nil] nil (or false) when the set has nothing to shift
  def shift
    (e = node_set.shift) && NodeProxy.new(e, parent)
  end

  ##
  # Add NodeSetProxys
  #
  # Works on a copy of this set's collection, so neither operand is modified.
  #
  # @param [NodeSetProxy] other set whose nodes are appended
  # @return [NodeSetProxy] a new proxy that keeps this set's parent
  def +(other)
    combined = node_set.clone
    other.node_set.each { |n| combined << n }
    NodeSetProxy.new(combined, parent)
  end

  ##
  # Add a NodeProxy
  #
  # @param [NodeProxy, REXML::Node] elem a proxy is unwrapped first
  # @return [NodeSetProxy] self, so appends can be chained
  def <<(elem)
    node_set << (elem.is_a?(NodeProxy) ? elem.node : elem)
    self
  end

  ##
  # @return [String] the display paths of all members
  def inspect
    @node_set.map { |c| NodeProxy.new(c, parent).display_path }.inspect
  end

  ##
  # Proxy for everything else to @node_set
  #
  # The caller's block is forwarded as well; without it, calls such as
  # `select { ... }` would reach the wrapped set block-less.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end
end
|
209
|
+
|
210
|
+
##
# Initializes the underlying XML library.
#
# An already-parsed REXML document is adopted as-is. Anything else is read
# (via `#read` when available, otherwise `#to_s`) and parsed as XML; in that
# case `options[:encoding]` is filled in from `input.charset` or 'utf-8' when
# unset, and the reader's `base_uri` is remembered as a string in @base_uri.
#
# @param [REXML::Document, #read, #to_s] input parsed document or source to parse
# @param [Hash{Symbol => Object}] options
# @option options [String] :encoding encoding recorded for the input
# @return [Object] the document stored in @doc
def initialize_html(input, options = {})
  # Loaded lazily so the dependency is only needed when this backend is used
  require 'rexml/document' unless defined?(::REXML)

  @doc =
    if input.is_a?(::REXML::Document)
      input
    else
      # Try to detect charset from input
      if input.respond_to?(:charset)
        options[:encoding] ||= input.charset
      end

      # Otherwise, default is utf-8
      options[:encoding] ||= 'utf-8'

      # Set xml:base for the document element, if defined
      @base_uri = base_uri ? base_uri.to_s : nil

      # Only parse as XML, no HTML mode
      source = input.respond_to?(:read) ? input.read : input.to_s
      ::REXML::Document.new(source)
    end
end
|
234
|
+
|
235
|
+
# Accessor methods to mask native elements & attributes
|
236
|
+
|
237
|
+
##
# Return proxy for document root
#
# The proxy is built once and handed back on subsequent calls.
#
# @return [NodeProxy, nil] nil when there is no document or it has no root
def root
  if @doc && @doc.root
    @root ||= NodeProxy.new(@doc.root)
  end
end
|
242
|
+
|
243
|
+
##
# Document errors
#
# This backend does not collect parse errors, so the list is always empty.
#
# @return [Array] a new empty array
def doc_errors; []; end
|
248
|
+
|
249
|
+
##
# Find value of document base
#
# A `/html/head/base` element overrides the supplied base, with any fragment
# identifier removed from its href. A base element that carries no usable
# href (attribute missing, empty, or fragment-only) leaves the supplied base
# untouched. When no base is available at all, @base_uri is returned.
#
# @param [String] base Existing base from URI or :base_uri
# @return [String, nil]
def doc_base(base)
  # find if the document has a base element
  base_el = ::REXML::XPath.first(@doc, "/html/head/base")
  if base_el
    # Drop the fragment; split yields nil for "" and "" for "#frag"
    href = base_el.attribute("href").to_s.split("#").first
    base = href unless href.nil? || href.empty?
  end

  base || @base_uri
end
|
261
|
+
|
262
|
+
##
# Based on Microdata element.getItems
#
# Collects every node matched by the `itemscope` XPath query, discards those
# that also carry `itemprop`, and wraps the remainder in parent-less proxies.
#
# @see http://www.w3.org/TR/2011/WD-microdata-20110525/#top-level-microdata-items
# @return [Array<NodeProxy>]
def getItems
  scoped = ::REXML::XPath.match(@doc, "//[@itemscope]")
  top_level = scoped.reject { |el| el.attribute('itemprop') }
  top_level.map { |el| NodeProxy.new(el) }
end
|
269
|
+
|
270
|
+
##
# Look up an element in the document by id
#
# The id is compared as a plain string against each element's `id` attribute
# value rather than being spliced into an XPath string literal, so ids
# containing quotes or other XPath syntax are matched literally.
#
# @param [#to_s] id value of the id attribute to look for
# @return [NodeProxy, nil] proxy for the first matching element, if any
def find_element_by_id(id)
  wanted = id.to_s
  e = ::REXML::XPath.match(@doc, "//*[@id]").find { |el| el.attributes['id'] == wanted }
  e && NodeProxy.new(e)
end
|
275
|
+
end
|
276
|
+
end
|
277
|
+
end
|