rdf-rdfa 0.3.7 → 0.3.8

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,274 @@
1
+ module RDF::RDFa
2
+ class Reader < RDF::Reader
3
+ ##
4
+ # Nokogiri implementation of an XML parser.
5
+ #
6
+ # @see http://nokogiri.org/
7
+ module Nokogiri
8
##
# Identifies the XML backend implemented by this module.
#
# @return [Symbol] always `:nokogiri`
def self.library
  :nokogiri
end
15
+
16
# Proxy class to implement uniform element accessors on top of a native
# node. Anything not defined here is forwarded to the wrapped node.
class NodeProxy
  # @return [Object] the wrapped native node
  attr_reader :node
  # @return [NodeProxy, nil] proxy of the enclosing node, when one was given
  attr_reader :parent

  ##
  # @param [Object] node native node to wrap
  # @param [NodeProxy, nil] parent proxy of the enclosing node
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # From HTML5 [3.2.3.3]
  #   If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
  #   on an element, user agents must use the lang attribute in the XML namespace, and the lang
  #   attribute in no namespace must be ignored for the purposes of determining the element's
  #   language.
  #
  # Lookup order: for HTML documents the literal "xml:lang" attribute, then
  # "lang"; after that the namespaced lang attribute, then the plain one.
  #
  # @return [String, nil] the attribute value, or nil when no candidate is set
  def language
    html_document = @node.document.is_a?(::Nokogiri::HTML::Document)
    found = (html_document && (@node.attributes["xml:lang"] || @node.attributes["lang"])) ||
            @node.attribute_with_ns("lang", RDF::XML.to_s) ||
            @node.attribute("lang")
    # Every branch is stringified so callers always receive a String.
    found && found.to_s
  end

  ##
  # Return xml:base on element, if defined
  #
  # @return [Object, nil] whatever the node's `attribute_with_ns` lookup
  #   returns for "base" in the XML namespace (not converted to a String)
  def base
    @node.attribute_with_ns("base", RDF::XML.to_s)
  end

  ##
  # Path-like label for this node: the parent's display path (if any)
  # followed by this node's name. The result is memoized.
  #
  # @return [String] segments joined with "/" for elements, "@" for
  #   attributes and "?" for any other node type
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      case @node
      when ::Nokogiri::XML::Element then path.join("/")
      when ::Nokogiri::XML::Attr    then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text
  #
  # @return [Boolean] also true when there are no children
  def text_content?
    @node.children.all? {|c| c.text?}
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # @return [Hash{String => String}] namespace prefix => href
  def namespaces
    @node.namespace_definitions.each_with_object({}) do |ns, decls|
      decls[ns.prefix] = ns.href.to_s
    end
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy] wraps the native children with self as parent
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Proxy for everything else to @node. The block is forwarded too, so
  # iterator-style methods of the native node work through the proxy.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end

  ##
  # Keeps `respond_to?` consistent with what `method_missing` forwards.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node.respond_to?(method, include_private) || super
  end
end
100
+
101
##
# NodeSet proxy: wraps a native node collection and remembers the proxy
# the nodes belong to. Anything not defined here is forwarded to the
# wrapped collection, which yields native (unwrapped) nodes.
class NodeSetProxy
  # @return [Object] the wrapped native node collection
  attr_reader :node_set
  # @return [NodeProxy] proxy handed to every child proxy as its parent
  attr_reader :parent

  ##
  # @param [Object] node_set native collection of nodes
  # @param [NodeProxy] parent proxy of the node owning the collection
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  # @return [Object, Enumerator] the result of the native `each`, or an
  #   Enumerator when no block is given
  def each
    return enum_for(:each) unless block_given?

    @node_set.each do |c|
      yield NodeProxy.new(c, parent)
    end
  end

  ##
  # Proxy for everything else to @node_set. The block is forwarded so
  # calls such as `map { ... }` run against the native collection
  # instead of silently dropping the block.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end

  ##
  # Keeps `respond_to?` consistent with what `method_missing` forwards.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node_set.respond_to?(method, include_private) || super
  end
end
129
+
130
##
# Prepares @doc using Nokogiri.
#
# An already parsed Nokogiri document is stored as is. Any other input is
# parsed, with the HTML parser when @host_language is :html4 or :html5 and
# with the XML parser otherwise.
#
# @param [Object] input parsed document, or markup to hand to the parser
# @param [Hash{Symbol => Object}] options
#   `:encoding` is filled in (from `input.charset` when available, else
#   'utf-8') if it is not already set
# @return [Object] the document stored in @doc
def initialize_xml(input, options = {})
  require 'nokogiri' unless defined?(::Nokogiri)

  parsed_types = [::Nokogiri::HTML::Document, ::Nokogiri::XML::Document]
  return @doc = input if parsed_types.any? { |type| type === input }

  # Prefer a charset reported by the input itself, then fall back to utf-8
  if input.respond_to?(:charset)
    options[:encoding] ||= input.charset
  end
  options[:encoding] ||= 'utf-8'

  parser = [:html4, :html5].include?(@host_language) ? ::Nokogiri::HTML : ::Nokogiri::XML
  @doc = parser.parse(input, base_uri.to_s, options[:encoding])
end
155
+
156
##
# Determine the host language and/or version from options and the input document.
#
# Explicit `:host_language` / `:version` options win. Whatever is missing
# is sniffed from the DOCTYPE, the root element and its `version`
# attribute and, for unparsed input, the `<meta>` elements found in the
# first characters of the document. Results are stored in @host_language
# and @version; a charset found in a `<meta>` is written to
# `options[:encoding]`.
#
# @param [Object] input a parsed Nokogiri document, an IO-like object
#   (responding to `read`/`rewind`) or anything convertible with `to_s`
# @param [Hash{Symbol => Object}] options
# @return [Symbol, nil] the host language, or nil when both values came
#   straight from the options
def detect_host_language_version(input, options)
  @host_language = options[:host_language] ? options[:host_language].to_sym : nil
  @version = options[:version] ? options[:version].to_sym : nil
  return if @host_language && @version

  # Sniff version based on input
  case input
  when ::Nokogiri::XML::Document, ::Nokogiri::HTML::Document
    # Stringify the DTD node: the regexp tests below need text to match.
    doc_type_string = input.children.detect {|c| c.is_a?(::Nokogiri::XML::DTD)}.to_s
    # A document may have no root element at all.
    root_node = input.root
    version_attr = root_node ? root_node.attribute("version").to_s : ""
    root_element = root_node ? root_node.name.downcase : ""
    # Constants carry a leading :: because this code sits inside a module
    # that is itself named Nokogiri.
    content_type = case
    when root_element == "html" && input.is_a?(::Nokogiri::HTML::Document)
      "text/html"
    when root_element == "html" && input.is_a?(::Nokogiri::XML::Document)
      # NOTE(review): this literal does not equal "application/xhtml+xml",
      # so these documents are classified by root element below. Left as
      # is because correcting it changes the detected host language for
      # documents without a DOCTYPE - confirm the intended result first.
      "application/xhtml+html"
    end
  else
    content_type = input.content_type if input.respond_to?(:content_type)

    # Determine from head of document
    head = if input.respond_to?(:read)
      input.rewind
      string = input.read(1000)
      input.rewind
      string.to_s
    else
      input.to_s[0..1000]
    end

    doc_type_string = head.match(%r(<!DOCTYPE[^>]*>)m).to_s
    # First tag that is neither a declaration nor a processing instruction
    root = head.match(%r(<[^!\?>]*>)m).to_s
    root_element = root[/\A<([^\s\/>]+)/, 1].to_s
    # Quoted value of the version attribute; it may contain spaces
    version_attr = root[/\bversion\s*=\s*(["'])(.*?)\1/m, 2].to_s
    head_element = head.match(%r(<head.*<\/head>)mi)
    head_doc = ::Nokogiri::HTML.parse(head_element.to_s)

    # May determine content-type and/or charset from meta
    # Easiest way is to parse head into a document and iterate
    # over CSS matches
    head_doc.css("meta").each do |meta|
      if meta.attr("http-equiv").to_s.downcase == 'content-type'
        content_type, params = meta.attr("content").to_s.downcase.split(";")
        options[:encoding] = $1.downcase if params.to_s =~ /charset=([^\s]*)$/i
      elsif meta.attr("charset")
        options[:encoding] = meta.attr("charset").to_s.downcase
      end
    end
  end

  # Determine version from DOCTYPE and/or root element, defaulting to 1.1
  @version ||= :"rdfa1.0" if doc_type_string =~ /RDFa 1\.0/
  @version ||= :"rdfa1.0" if version_attr =~ /RDFa 1\.0/
  @version ||= :"rdfa1.1" if version_attr =~ /RDFa 1\.1/
  @version ||= :"rdfa1.1"

  @host_language ||= case content_type
  when "application/xml"  then :xml1
  when "image/svg+xml"    then :svg
  when "text/html"
    case doc_type_string
    when /html 4/i        then :html4
    when /xhtml/i         then :xhtml1
    when /html/i          then :html5
    end
  when "application/xhtml+xml"
    case doc_type_string
    when /html 4/i        then :html4
    when /xhtml/i         then :xhtml1
    when /html/i          then :xhtml5
    end
  else
    case root_element
    when /svg/i           then :svg
    when /html/i          then :html4
    end
  end

  @host_language ||= :xml1
end
239
+
240
+ # Accessor methods to mask native elements & attributes
241
+
242
##
# Proxy for the root element of the parsed document.
#
# The proxy is created on first use and cached in @root.
#
# @return [NodeProxy, nil] nil when there is no document or no root element
def root
  return unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end
247
+
248
##
# Errors reported by the parsed document.
#
# @return [Object] whatever @doc returns from `errors`
def doc_errors
  document = @doc
  document.errors
end
253
+
254
##
# Find value of document base
#
# For the HTML family of host languages the href of `html>head>base` is
# used, with any fragment removed. For every other host language the
# xml:base attribute of the root element is used. When the document
# supplies nothing usable, the base passed in is returned unchanged.
#
# @param [String] base Existing base from URI or :base_uri
# @return [String, Object] the document base; in the XML branch this is
#   the object returned by the root's `attribute_with_ns` lookup
def doc_base(base)
  # find if the document has a base element
  case @host_language
  when :xhtml1, :xhtml5, :html4, :html5
    base_el = @doc.at_css("html>head>base")
    href = base_el && base_el.attribute("href").to_s.split("#").first
    # A <base> with a missing, empty or fragment-only href must not wipe
    # out the base we were given.
    base = href unless href.nil? || href.empty?
  else
    root_proxy = root
    xml_base = root_proxy && root_proxy.attribute_with_ns("base", RDF::XML.to_s)
    base = xml_base if xml_base
  end

  base
end
272
+ end
273
+ end
274
+ end
@@ -0,0 +1,300 @@
1
+ require 'htmlentities'
2
+
3
+ module RDF::RDFa
4
+ class Reader < RDF::Reader
5
+ ##
6
+ # REXML implementation of an XML parser.
7
+ #
8
+ # @see http://www.germane-software.com/software/rexml/
9
+ module REXML
10
##
# Identifies the XML backend implemented by this module.
#
# @return [Symbol] always `:rexml`
def self.library
  :rexml
end
17
+
18
# Proxy class to implement uniform element accessors on top of a native
# REXML node. Anything not defined here is forwarded to the wrapped node.
class NodeProxy
  # @return [Object] the wrapped native node
  attr_reader :node
  # @return [NodeProxy, nil] proxy of the enclosing node, when one was given
  attr_reader :parent

  ##
  # @param [Object] node native node to wrap
  # @param [NodeProxy, nil] parent proxy of the enclosing node
  def initialize(node, parent = nil)
    @node = node
    @parent = parent
  end

  ##
  # Element language
  #
  # From HTML5 [3.2.3.3]
  #   If both the lang attribute in no namespace and the lang attribute in the XML namespace are set
  #   on an element, user agents must use the lang attribute in the XML namespace, and the lang
  #   attribute in no namespace must be ignored for the purposes of determining the element's
  #   language.
  #
  # @return [String, nil] the attribute value, or nil when neither the
  #   namespaced nor the plain lang attribute is found
  def language
    found = @node.attribute("lang", RDF::XML.to_s) || @node.attribute("lang")
    # Both lookups are stringified so callers always receive a String.
    found && found.to_s
  end

  ##
  # Return xml:base on element, if defined
  #
  # @return [Object, nil] whatever the node's `attribute` lookup returns
  #   for "base" in the XML namespace (not converted to a String)
  def base
    @node.attribute("base", RDF::XML.to_s)
  end

  ##
  # Path-like label for this node: the parent's display path (if any)
  # followed by this node's name. The result is memoized.
  #
  # @return [String] segments joined with "/" for elements, "@" for
  #   attributes and "?" for any other node type
  def display_path
    @display_path ||= begin
      path = []
      path << parent.display_path if parent
      path << @node.name
      case @node
      when ::REXML::Element   then path.join("/")
      when ::REXML::Attribute then path.join("@")
      else path.join("?")
      end
    end
  end

  ##
  # Return true if all child nodes are text
  #
  # @return [Boolean] also true when there are no children
  def text_content?
    @node.children.all? {|c| c.is_a?(::REXML::Text)}
  end

  ##
  # Retrieve XMLNS definitions for this element
  #
  # Only attributes named exactly "xmlns" or "xmlns:<prefix>" count; the
  # default namespace is stored under the nil key.
  #
  # @return [Hash{String, nil => String}] namespace prefix => attribute value
  def namespaces
    ns_decls = {}
    @node.attributes.each do |name, value|
      # Anchored at both ends so an ordinary attribute whose name merely
      # starts with "xmlns" is not mistaken for a declaration.
      declaration = /\Axmlns(?::(.+))?\z/.match(name)
      ns_decls[declaration[1]] = value if declaration
    end
    ns_decls
  end

  ##
  # Children of this node
  #
  # @return [NodeSetProxy] wraps the native children with self as parent
  def children
    NodeSetProxy.new(@node.children, self)
  end

  ##
  # Inner text of an element: every descendant text node, entity-decoded
  # and concatenated.
  #
  # @see http://apidock.com/ruby/REXML/Element/get_text#743-Get-all-inner-texts
  # @return [String]
  def inner_text
    coder = HTMLEntities.new
    ::REXML::XPath.match(@node,'.//text()').map { |e|
      coder.decode(e)
    }.join
  end

  ##
  # Inner markup of an element: the serialized form of each child,
  # concatenated.
  #
  # @return [String]
  def inner_html
    @node.children.map(&:to_s).join
  end

  ##
  # Node type accessors
  #
  # @return [Boolean] true when the wrapped node is a REXML element
  def element?
    @node.is_a?(::REXML::Element)
  end

  ##
  # Proxy for everything else to @node. The block is forwarded too, so
  # iterator-style methods of the native node work through the proxy.
  def method_missing(method, *args, &block)
    @node.send(method, *args, &block)
  end

  ##
  # Keeps `respond_to?` consistent with what `method_missing` forwards.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node.respond_to?(method, include_private) || super
  end
end
132
+
133
##
# NodeSet proxy: wraps a native node collection and remembers the proxy
# the nodes belong to. Anything not defined here is forwarded to the
# wrapped collection, which yields native (unwrapped) nodes.
class NodeSetProxy
  # @return [Object] the wrapped native node collection
  attr_reader :node_set
  # @return [NodeProxy] proxy handed to every child proxy as its parent
  attr_reader :parent

  ##
  # @param [Object] node_set native collection of nodes
  # @param [NodeProxy] parent proxy of the node owning the collection
  def initialize(node_set, parent)
    @node_set = node_set
    @parent = parent
  end

  ##
  # Return a proxy for each child
  #
  # @yield [child]
  # @yieldparam [NodeProxy] child
  # @return [Object, Enumerator] the result of the native `each`, or an
  #   Enumerator when no block is given
  def each
    return enum_for(:each) unless block_given?

    @node_set.each do |c|
      yield NodeProxy.new(c, parent)
    end
  end

  ##
  # Proxy for everything else to @node_set. The block is forwarded so
  # calls such as `map { ... }` run against the native collection
  # instead of silently dropping the block.
  def method_missing(method, *args, &block)
    @node_set.send(method, *args, &block)
  end

  ##
  # Keeps `respond_to?` consistent with what `method_missing` forwards.
  #
  # @return [Boolean]
  def respond_to_missing?(method, include_private = false)
    @node_set.respond_to?(method, include_private) || super
  end
end
161
+
162
##
# Prepares @doc using REXML.
#
# An already parsed REXML document is stored as is. Any other input is
# read (via `read` when available, otherwise `to_s`) and parsed as XML;
# there is no HTML mode. In that case @base_uri is also set from
# `base_uri`.
#
# @param [Object] input parsed document, IO-like object or markup
# @param [Hash{Symbol => Object}] options
#   `:encoding` is filled in (from `input.charset` when available, else
#   'utf-8') if it is not already set
# @return [Object] the document stored in @doc
def initialize_xml(input, options = {})
  require 'rexml/document' unless defined?(::REXML)
  return @doc = input if ::REXML::Document === input

  # Prefer a charset reported by the input itself, then fall back to utf-8
  if input.respond_to?(:charset)
    options[:encoding] ||= input.charset
  end
  options[:encoding] ||= 'utf-8'

  # Remember the base as a String (nil when there is none)
  @base_uri = (base_uri.to_s if base_uri)

  # Only parse as XML, no HTML mode
  markup = input.respond_to?(:read) ? input.read : input.to_s
  @doc = ::REXML::Document.new(markup)
end
186
+
187
##
# Determine the host language and/or version from options and the input document.
#
# Explicit `:host_language` / `:version` options win. Whatever is missing
# is sniffed from the DOCTYPE, the root element and its `version`
# attribute and, for unparsed input, the `<meta>` elements found in the
# first characters of the document. Results are stored in @host_language
# and @version; a charset found in a `<meta>` is written to
# `options[:encoding]`.
#
# @param [Object] input a parsed REXML document, an IO-like object
#   (responding to `read`/`rewind`) or anything convertible with `to_s`
# @param [Hash{Symbol => Object}] options
# @return [Symbol, nil] the host language, or nil when both values came
#   straight from the options
def detect_host_language_version(input, options)
  @host_language = options[:host_language] ? options[:host_language].to_sym : nil
  @version = options[:version] ? options[:version].to_sym : nil
  return if @host_language && @version

  # Sniff version based on input
  case input
  when ::REXML::Document
    doc_type_string = input.doctype.to_s
    # A document may have no root element at all.
    root_node = input.root
    version_attr = root_node ? root_node.attribute("version").to_s : ""
    root_element = root_node ? root_node.name.downcase : ""
    # FIXME: what about other possible XML types?
    # NOTE(review): this literal does not equal "application/xhtml+xml",
    # so parsed documents are classified by root element below. Left as
    # is because correcting it changes the detected host language for
    # documents without a DOCTYPE - confirm the intended result first.
    content_type = "application/xhtml+html"
  else
    content_type = input.content_type if input.respond_to?(:content_type)

    # Determine from head of document
    head = if input.respond_to?(:read)
      input.rewind
      string = input.read(1000)
      input.rewind
      string.to_s
    else
      input.to_s[0..1000]
    end

    doc_type_string = head.match(%r(<!DOCTYPE[^>]*>)m).to_s
    # First tag that is neither a declaration nor a processing instruction
    root = head.match(%r(<[^!\?>]*>)m).to_s
    root_element = root[/\A<([^\s\/>]+)/, 1].to_s
    # Quoted value of the version attribute; it may contain spaces
    version_attr = root[/\bversion\s*=\s*(["'])(.*?)\1/m, 2].to_s
    head_element = head.match(%r(<head.*<\/head>)mi)
    head_doc = begin
      ::REXML::Document.new(head_element.to_s)
    rescue ::REXML::ParseException
      # Sniffing is best effort: a head that is not well-formed XML simply
      # contributes no <meta> information.
      ::REXML::Document.new
    end

    # May determine content-type and/or charset from meta
    # Easiest way is to parse head into a document and iterate
    # over the matching elements
    ::REXML::XPath.each(head_doc, "//meta") do |meta|
      if meta.attribute("http-equiv").to_s.downcase == 'content-type'
        content_type, params = meta.attribute("content").to_s.downcase.split(";")
        options[:encoding] = $1.downcase if params.to_s =~ /charset=([^\s]*)$/i
      elsif meta.attribute("charset")
        options[:encoding] = meta.attribute("charset").to_s.downcase
      end
    end
  end

  # Determine version from DOCTYPE and/or root element, defaulting to 1.1
  @version ||= :"rdfa1.0" if doc_type_string =~ /RDFa 1\.0/
  @version ||= :"rdfa1.0" if version_attr =~ /RDFa 1\.0/
  @version ||= :"rdfa1.1" if version_attr =~ /RDFa 1\.1/
  @version ||= :"rdfa1.1"

  @host_language ||= case content_type
  when "application/xml"  then :xml1
  when "image/svg+xml"    then :svg
  when "text/html"
    case doc_type_string
    when /html 4/i        then :html4
    when /xhtml/i         then :xhtml1
    when /html/i          then :html5
    end
  when "application/xhtml+xml"
    case doc_type_string
    when /html 4/i        then :html4
    when /xhtml/i         then :xhtml1
    when /html/i          then :xhtml5
    end
  else
    case root_element
    when /svg/i           then :svg
    when /html/i          then :html4
    end
  end

  @host_language ||= :xml1
end
265
+
266
+ # Accessor methods to mask native elements & attributes
267
+
268
##
# Proxy for the root element of the parsed document.
#
# The proxy is created on first use and cached in @root.
#
# @return [NodeProxy, nil] nil when there is no document or no root element
def root
  return unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end
273
+
274
##
# Errors for the parsed document.
#
# This backend reports none.
#
# @return [Array] always a new, empty array
def doc_errors
  Array.new
end
279
+
280
##
# Find value of document base
#
# For the HTML family of host languages the href of `/html/head/base` is
# used, with any fragment removed. For every other host language the
# xml:base attribute of the root element is used. When the document
# supplies nothing usable, the base passed in is kept, and @base_uri is
# the last resort.
#
# @param [String] base Existing base from URI or :base_uri
# @return [String, Object, nil] the document base; in the XML branch this
#   is the object returned by the root's `attribute` lookup
def doc_base(base)
  # find if the document has a base element
  case @host_language
  when :xhtml1, :xhtml5, :html4, :html5
    base_el = ::REXML::XPath.first(@doc, "/html/head/base")
    href = base_el && base_el.attribute("href").to_s.split("#").first
    # A <base> with a missing, empty or fragment-only href must not wipe
    # out the base we were given.
    base = href unless href.nil? || href.empty?
  else
    root_proxy = root
    xml_base = root_proxy && root_proxy.attribute("base", RDF::XML.to_s)
    base = xml_base if xml_base
  end

  base || @base_uri
end
298
+ end
299
+ end
300
+ end