rdf-rdfa 0.3.7 → 0.3.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,274 @@
1
+ module RDF::RDFa
2
+ class Reader < RDF::Reader
3
+ ##
4
+ # Nokogiri implementation of an XML parser.
5
+ #
6
+ # @see http://nokogiri.org/
7
+ module Nokogiri
8
+ ##
9
+ # Identifies the XML library that backs this reader implementation.
10
+ #
11
+ # @return [Symbol] always `:nokogiri`
12
+ def self.library
13
+ :nokogiri
14
+ end
15
+
16
+ # Proxy class to implement uniform element accessors
17
+ class NodeProxy
18
+ attr_reader :node
19
+ attr_reader :parent
20
+
21
+ def initialize(node, parent = nil)
22
+ @node = node
23
+ @parent = parent
24
+ end
25
+
26
+ ##
27
+ # Element language
28
+ #
29
+ # From HTML5 [3.2.3.3]
30
+ # If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
31
+ # on an element, user agents must use the lang attribute in the XML namespace, and the lang
32
+ # attribute in no namespace must be ignored for the purposes of determining the element's
33
+ # language.
34
+ #
35
+ # @return [String]
36
+ def language
37
+ language = case
38
+ when @node.document.is_a?(::Nokogiri::HTML::Document) && @node.attributes["xml:lang"]
39
+ @node.attributes["xml:lang"].to_s
40
+ when @node.document.is_a?(::Nokogiri::HTML::Document) && @node.attributes["lang"]
41
+ @node.attributes["lang"].to_s
42
+ when @node.attribute_with_ns("lang", RDF::XML.to_s)
43
+ @node.attribute_with_ns("lang", RDF::XML.to_s)
44
+ when @node.attribute("lang")
45
+ @node.attribute("lang").to_s
46
+ end
47
+ end
48
+
49
+ ##
50
+ # Return xml:base on element, if defined
51
+ #
52
+ # @return [String]
53
+ def base
54
+ @node.attribute_with_ns("base", RDF::XML.to_s)
55
+ end
56
+
57
+ def display_path
58
+ @display_path ||= begin
59
+ path = []
60
+ path << parent.display_path if parent
61
+ path << @node.name
62
+ case @node
63
+ when ::Nokogiri::XML::Element then path.join("/")
64
+ when ::Nokogiri::XML::Attr then path.join("@")
65
+ else path.join("?")
66
+ end
67
+ end
68
+ end
69
+
70
+ ##
71
+ # Return true of all child elements are text
72
+ #
73
+ # @return [Array<:text, :element, :attribute>]
74
+ def text_content?
75
+ @node.children.all? {|c| c.text?}
76
+ end
77
+
78
+ ##
79
+ # Retrieve XMLNS definitions for this element
80
+ #
81
+ # @return [Hash{String => String}]
82
+ def namespaces
83
+ @node.namespace_definitions.inject({}) {|memo, ns| memo[ns.prefix] = ns.href.to_s; memo }
84
+ end
85
+
86
+ ##
87
+ # Children of this node
88
+ #
89
+ # @return [NodeSetProxy]
90
+ def children
91
+ NodeSetProxy.new(@node.children, self)
92
+ end
93
+
94
+ ##
95
+ # Proxy for everything else to @node
96
+ def method_missing(method, *args)
97
+ @node.send(method, *args)
98
+ end
99
+ end
100
+
101
+ ##
102
+ # NodeSet proxy
103
+ class NodeSetProxy
104
+ attr_reader :node_set
105
+ attr_reader :parent
106
+
107
+ def initialize(node_set, parent)
108
+ @node_set = node_set
109
+ @parent = parent
110
+ end
111
+
112
+ ##
113
+ # Return a proxy for each child
114
+ #
115
+ # @yield(child)
116
+ # @yieldparam(NodeProxy)
117
+ def each
118
+ @node_set.each do |c|
119
+ yield NodeProxy.new(c, parent)
120
+ end
121
+ end
122
+
123
+ ##
124
+ # Proxy for everything else to @node_set
125
+ def method_missing(method, *args)
126
+ @node_set.send(method, *args)
127
+ end
128
+ end
129
+
130
+ ##
131
+ # Parses `input` into @doc; an input that is already a Nokogiri HTML or XML document is used as-is.
132
+ # Otherwise the :html4 and :html5 host languages use the HTML parser and everything else the XML parser.
133
+ # @param [Hash{Symbol => Object}] options `:encoding` is filled in from `input.charset`, else 'utf-8', when unset
134
+ # @return [void]
135
+ def initialize_xml(input, options = {})
136
+ require 'nokogiri' unless defined?(::Nokogiri)
137
+ @doc = case input
138
+ when ::Nokogiri::HTML::Document, ::Nokogiri::XML::Document
139
+ input
140
+ else
141
+ # Try to detect charset from input; an encoding supplied by the caller is kept
142
+ options[:encoding] ||= input.charset if input.respond_to?(:charset)
143
+
144
+ # Otherwise, default is utf-8
145
+ options[:encoding] ||= 'utf-8'
146
+
147
+ case @host_language
148
+ when :html4, :html5
149
+ ::Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
150
+ else
151
+ ::Nokogiri::XML.parse(input, base_uri.to_s, options[:encoding])
152
+ end
153
+ end
154
+ end
155
+
156
+ # Determine the host language and/or version from options and the input document
157
+ def detect_host_language_version(input, options)
158
+ @host_language = options[:host_language] ? options[:host_language].to_sym : nil
159
+ @version = options[:version] ? options[:version].to_sym : nil
160
+ return if @host_language && @version
161
+
162
+ # Snif version based on input
163
+ case input
164
+ when ::Nokogiri::XML::Document, ::Nokogiri::HTML::Document
165
+ doc_type_string = input.children.detect {|c| c.is_a?(::Nokogiri::XML::DTD)}
166
+ version_attr = input.root && input.root.attribute("version").to_s
167
+ root_element = input.root.name.downcase
168
+ root_namespace = input.root.namespace.to_s
169
+ root_attrs = input.root.attributes
170
+ content_type = case
171
+ when root_element == "html" && input.is_a?(Nokogiri::HTML::Document)
172
+ "text/html"
173
+ when root_element == "html" && input.is_a?(Nokogiri::XML::Document)
174
+ "application/xhtml+html"
175
+ end
176
+ else
177
+ content_type = input.content_type if input.respond_to?(:content_type)
178
+
179
+ # Determine from head of document
180
+ head = if input.respond_to?(:read)
181
+ input.rewind
182
+ string = input.read(1000)
183
+ input.rewind
184
+ string.to_s
185
+ else
186
+ input.to_s[0..1000]
187
+ end
188
+
189
+ doc_type_string = head.match(%r(<!DOCTYPE[^>]*>)m).to_s
190
+ root = head.match(%r(<[^!\?>]*>)m).to_s
191
+ root_element = root.match(%r(^<(\S+)[ >])) ? $1 : ""
192
+ version_attr = root.match(/version\s+=\s+(\S+)[\s">]/m) ? $1 : ""
193
+ head_element = head.match(%r(<head.*<\/head>)mi)
194
+ head_doc = ::Nokogiri::HTML.parse(head_element.to_s)
195
+
196
+ # May determine content-type and/or charset from meta
197
+ # Easist way is to parse head into a document and iterate
198
+ # of CSS matches
199
+ head_doc.css("meta").each do |e|
200
+ if e.attr("http-equiv").to_s.downcase == 'content-type'
201
+ content_type, e = e.attr("content").to_s.downcase.split(";")
202
+ options[:encoding] = $1.downcase if e.to_s =~ /charset=([^\s]*)$/i
203
+ elsif e.attr("charset")
204
+ options[:encoding] = e.attr("charset").to_s.downcase
205
+ end
206
+ end
207
+ end
208
+
209
+ # Already using XML parser, determine from DOCTYPE and/or root element
210
+ @version ||= :"rdfa1.0" if doc_type_string =~ /RDFa 1\.0/
211
+ @version ||= :"rdfa1.0" if version_attr =~ /RDFa 1\.0/
212
+ @version ||= :"rdfa1.1" if version_attr =~ /RDFa 1\.1/
213
+ @version ||= :"rdfa1.1"
214
+
215
+ @host_language ||= case content_type
216
+ when "application/xml" then :xml1
217
+ when "image/svg+xml" then :svg
218
+ when "text/html"
219
+ case doc_type_string
220
+ when /html 4/i then :html4
221
+ when /xhtml/i then :xhtml1
222
+ when /html/i then :html5
223
+ end
224
+ when "application/xhtml+xml"
225
+ case doc_type_string
226
+ when /html 4/i then :html4
227
+ when /xhtml/i then :xhtml1
228
+ when /html/i then :xhtml5
229
+ end
230
+ else
231
+ case root_element
232
+ when /svg/i then :svg
233
+ when /html/i then :html4
234
+ end
235
+ end
236
+
237
+ @host_language ||= :xml1
238
+ end
239
+
240
+ # Accessor methods to mask native elements & attributes
241
+
242
+ ##
243
+ # Return a memoized NodeProxy for the document root; nil when @doc or its root is missing
244
+ def root
245
+ @root ||= NodeProxy.new(@doc.root) if @doc && @doc.root
246
+ end
247
+
248
+ ##
249
+ # Document errors, as exposed by the parsed document (delegates to @doc.errors)
250
+ def doc_errors
251
+ @doc.errors
252
+ end
253
+
254
+ ##
255
+ # Find value of document base: HTML-family host languages read html>head>base@href, all others read xml:base on the root
256
+ #
257
+ # @param [String] base Existing base from URI or :base_uri
258
+ # @return [String] the base declared by the document when there is one, otherwise `base` as passed in
259
+ def doc_base(base)
260
+ # find if the document has a base element
261
+ case @host_language
262
+ when :xhtml1, :xhtml5, :html4, :html5
263
+ base_el = @doc.at_css("html>head>base")
264
+ base = base_el.attribute("href").to_s.split("#").first if base_el # drops any fragment from the href
265
+ else
266
+ xml_base = root.attribute_with_ns("base", RDF::XML.to_s)
267
+ base = xml_base if xml_base # NOTE(review): this is the attribute object, not a String — confirm callers stringify it
268
+ end
269
+
270
+ base
271
+ end
272
+ end
273
+ end
274
+ end
@@ -0,0 +1,300 @@
1
+ require 'htmlentities'
2
+
3
+ module RDF::RDFa
4
+ class Reader < RDF::Reader
5
+ ##
6
+ # REXML implementation of an XML parser.
7
+ #
8
+ # @see http://www.germane-software.com/software/rexml/
9
+ module REXML
10
+ ##
11
+ # Identifies the XML library that backs this reader implementation.
12
+ #
13
+ # @return [Symbol] always `:rexml`
14
+ def self.library
15
+ :rexml
16
+ end
17
+
18
+ # Proxy class to implement uniform element accessors
19
+ class NodeProxy
20
+ attr_reader :node
21
+ attr_reader :parent
22
+
23
+ def initialize(node, parent = nil)
24
+ @node = node
25
+ @parent = parent
26
+ end
27
+
28
+ ##
29
+ # Element language
30
+ #
31
+ # From HTML5 [3.2.3.3]
32
+ # If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
33
+ # on an element, user agents must use the lang attribute in the XML namespace, and the lang
34
+ # attribute in no namespace must be ignored for the purposes of determining the element's
35
+ # language.
36
+ #
37
+ # @return [String]
38
+ def language
39
+ language = case
40
+ when @node.attribute("lang", RDF::XML.to_s)
41
+ @node.attribute("lang", RDF::XML.to_s)
42
+ when @node.attribute("lang")
43
+ @node.attribute("lang").to_s
44
+ end
45
+ end
46
+
47
+ ##
48
+ # Return xml:base on element, if defined
49
+ #
50
+ # @return [String]
51
+ def base
52
+ @node.attribute("base", RDF::XML.to_s)
53
+ end
54
+
55
+ def display_path
56
+ @display_path ||= begin
57
+ path = []
58
+ path << parent.display_path if parent
59
+ path << @node.name
60
+ case @node
61
+ when ::REXML::Element then path.join("/")
62
+ when ::REXML::Attribute then path.join("@")
63
+ else path.join("?")
64
+ end
65
+ end
66
+ end
67
+
68
+ ##
69
+ # Return true of all child elements are text
70
+ #
71
+ # @return [Array<:text, :element, :attribute>]
72
+ def text_content?
73
+ @node.children.all? {|c| c.is_a?(::REXML::Text)}
74
+ end
75
+
76
+ ##
77
+ # Retrieve XMLNS definitions for this element
78
+ #
79
+ # @return [Hash{String => String}]
80
+ def namespaces
81
+ ns_decls = {}
82
+ @node.attributes.each do |name, attr|
83
+ next unless name =~ /^xmlns(?:\:(.+))?/
84
+ ns_decls[$1] = attr
85
+ end
86
+ ns_decls
87
+ end
88
+
89
+ ##
90
+ # Children of this node
91
+ #
92
+ # @return [NodeSetProxy]
93
+ def children
94
+ NodeSetProxy.new(@node.children, self)
95
+ end
96
+
97
+ ##
98
+ # Inner text of an element
99
+ #
100
+ # @see http://apidock.com/ruby/REXML/Element/get_text#743-Get-all-inner-texts
101
+ # @return [String]
102
+ def inner_text
103
+ coder = HTMLEntities.new
104
+ ::REXML::XPath.match(@node,'.//text()').map { |e|
105
+ coder.decode(e)
106
+ }.join
107
+ end
108
+
109
+ ##
110
+ # Inner text of an element
111
+ #
112
+ # @see http://apidock.com/ruby/REXML/Element/get_text#743-Get-all-inner-texts
113
+ # @return [String]
114
+ def inner_html
115
+ @node.children.map(&:to_s).join
116
+ end
117
+
118
+ ##
119
+ # Node type accessors
120
+ #
121
+ # @return [Boolean]
122
+ def element?
123
+ @node.is_a?(::REXML::Element)
124
+ end
125
+
126
+ ##
127
+ # Proxy for everything else to @node
128
+ def method_missing(method, *args)
129
+ @node.send(method, *args)
130
+ end
131
+ end
132
+
133
+ ##
134
+ # NodeSet proxy
135
+ class NodeSetProxy
136
+ attr_reader :node_set
137
+ attr_reader :parent
138
+
139
+ def initialize(node_set, parent)
140
+ @node_set = node_set
141
+ @parent = parent
142
+ end
143
+
144
+ ##
145
+ # Return a proxy for each child
146
+ #
147
+ # @yield(child)
148
+ # @yieldparam(NodeProxy)
149
+ def each
150
+ @node_set.each do |c|
151
+ yield NodeProxy.new(c, parent)
152
+ end
153
+ end
154
+
155
+ ##
156
+ # Proxy for everything else to @node_set
157
+ def method_missing(method, *args)
158
+ @node_set.send(method, *args)
159
+ end
160
+ end
161
+
162
+ ##
163
+ # Initializes the underlying XML library.
164
+ #
165
+ # @param [Hash{Symbol => Object}] options
166
+ # @return [void]
167
+ def initialize_xml(input, options = {})
168
+ require 'rexml/document' unless defined?(::REXML)
169
+ @doc = case input
170
+ when ::REXML::Document
171
+ input
172
+ else
173
+ # Try to detect charset from input
174
+ options[:encoding] ||= input.charset if input.respond_to?(:charset)
175
+
176
+ # Otherwise, default is utf-8
177
+ options[:encoding] ||= 'utf-8'
178
+
179
+ # Set xml:base for the document element, if defined
180
+ @base_uri = base_uri ? base_uri.to_s : nil
181
+
182
+ # Only parse as XML, no HTML mode
183
+ doc = ::REXML::Document.new(input.respond_to?(:read) ? input.read : input.to_s)
184
+ end
185
+ end
186
+
187
+ # Determine the host language and/or version from options and the input document
188
+ def detect_host_language_version(input, options)
189
+ @host_language = options[:host_language] ? options[:host_language].to_sym : nil
190
+ @version = options[:version] ? options[:version].to_sym : nil
191
+ return if @host_language && @version
192
+
193
+ # Snif version based on input
194
+ case input
195
+ when ::REXML::Document
196
+ doc_type_string = input.doctype.to_s
197
+ version_attr = input.root && input.root.attribute("version").to_s
198
+ root_element = input.root.name.downcase
199
+ root_namespace = input.root.namespace.to_s
200
+ root_attrs = input.root.attributes
201
+ content_type = "application/xhtml+html" # FIXME: what about other possible XML types?
202
+ else
203
+ content_type = input.content_type if input.respond_to?(:content_type)
204
+
205
+ # Determine from head of document
206
+ head = if input.respond_to?(:read)
207
+ input.rewind
208
+ string = input.read(1000)
209
+ input.rewind
210
+ string.to_s
211
+ else
212
+ input.to_s[0..1000]
213
+ end
214
+
215
+ doc_type_string = head.match(%r(<!DOCTYPE[^>]*>)m).to_s
216
+ root = head.match(%r(<[^!\?>]*>)m).to_s
217
+ root_element = root.match(%r(^<(\S+)[ >])) ? $1 : ""
218
+ version_attr = root.match(/version\s+=\s+(\S+)[\s">]/m) ? $1 : ""
219
+ head_element = head.match(%r(<head.*<\/head>)mi)
220
+ head_doc = ::REXML::Document.new(head_element.to_s)
221
+
222
+ # May determine content-type and/or charset from meta
223
+ # Easist way is to parse head into a document and iterate
224
+ # of CSS matches
225
+ ::REXML::XPath.each(head_doc, "//meta") do |e|
226
+ if e.attribute("http-equiv").to_s.downcase == 'content-type'
227
+ content_type, e = e.attribute("content").to_s.downcase.split(";")
228
+ options[:encoding] = $1.downcase if e.to_s =~ /charset=([^\s]*)$/i
229
+ elsif e.attribute("charset")
230
+ options[:encoding] = e.attr("charset").to_s.downcase
231
+ end
232
+ end
233
+ end
234
+
235
+ # Already using XML parser, determine from DOCTYPE and/or root element
236
+ @version ||= :"rdfa1.0" if doc_type_string =~ /RDFa 1\.0/
237
+ @version ||= :"rdfa1.0" if version_attr =~ /RDFa 1\.0/
238
+ @version ||= :"rdfa1.1" if version_attr =~ /RDFa 1\.1/
239
+ @version ||= :"rdfa1.1"
240
+
241
+ @host_language ||= case content_type
242
+ when "application/xml" then :xml1
243
+ when "image/svg+xml" then :svg
244
+ when "text/html"
245
+ case doc_type_string
246
+ when /html 4/i then :html4
247
+ when /xhtml/i then :xhtml1
248
+ when /html/i then :html5
249
+ end
250
+ when "application/xhtml+xml"
251
+ case doc_type_string
252
+ when /html 4/i then :html4
253
+ when /xhtml/i then :xhtml1
254
+ when /html/i then :xhtml5
255
+ end
256
+ else
257
+ case root_element
258
+ when /svg/i then :svg
259
+ when /html/i then :html4
260
+ end
261
+ end
262
+
263
+ @host_language ||= :xml1
264
+ end
265
+
266
+ # Accessor methods to mask native elements & attributes
267
+
268
+ ##
269
+ # Return a memoized NodeProxy for the document root; nil when @doc or its root is missing
270
+ def root
271
+ @root ||= NodeProxy.new(@doc.root) if @doc && @doc.root
272
+ end
273
+
274
+ ##
275
+ # Document errors; this implementation always returns an empty list
276
+ def doc_errors
277
+ []
278
+ end
279
+
280
+ ##
281
+ # Find value of document base: HTML-family host languages read /html/head/base@href, all others read xml:base on the root
282
+ #
283
+ # @param [String] base Existing base from URI or :base_uri
284
+ # @return [String] the base declared by the document, else `base` as passed in, else @base_uri
285
+ def doc_base(base)
286
+ # find if the document has a base element
287
+ case @host_language
288
+ when :xhtml1, :xhtml5, :html4, :html5
289
+ base_el = ::REXML::XPath.first(@doc, "/html/head/base")
290
+ base = base_el.attribute("href").to_s.split("#").first if base_el # drops any fragment from the href
291
+ else
292
+ xml_base = root.attribute("base", RDF::XML.to_s)
293
+ base = xml_base if xml_base # NOTE(review): this is the attribute object, not a String — confirm callers stringify it
294
+ end
295
+
296
+ base || @base_uri
297
+ end
298
+ end
299
+ end
300
+ end