oga 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -0
- data/doc/changelog.md +108 -0
- data/ext/c/lexer.c +63 -48
- data/ext/java/org/liboga/xml/Lexer.java +87 -101
- data/ext/ragel/base_lexer.rl +8 -0
- data/lib/oga.rb +7 -1
- data/lib/oga/html/sax_parser.rb +18 -0
- data/lib/oga/oga.rb +30 -0
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/cdata.rb +0 -7
- data/lib/oga/xml/comment.rb +0 -7
- data/lib/oga/xml/doctype.rb +0 -7
- data/lib/oga/xml/document.rb +7 -1
- data/lib/oga/xml/element.rb +43 -18
- data/lib/oga/xml/html_void_elements.rb +28 -0
- data/lib/oga/xml/lexer.rb +1 -26
- data/lib/oga/xml/node.rb +0 -7
- data/lib/oga/xml/parser.rb +34 -2
- data/lib/oga/xml/pull_parser.rb +17 -3
- data/lib/oga/xml/sax_parser.rb +63 -0
- data/lib/oga/xml/text.rb +1 -6
- data/lib/oga/xml/xml_declaration.rb +0 -7
- data/lib/oga/xpath/evaluator.rb +3 -2
- data/lib/oga/xpath/lexer.rb +75 -71
- data/lib/oga/xpath/parser.rb +65 -60
- metadata +5 -2
data/ext/ragel/base_lexer.rl
CHANGED
@@ -292,6 +292,14 @@
|
|
292
292
|
allowed_text = any* -- terminate_text;
|
293
293
|
|
294
294
|
text := |*
|
295
|
+
# Input such as just "</" or "<?". This rule takes precedence over the
|
296
|
+
# rules below, but only if those don't match.
|
297
|
+
terminate_text => {
|
298
|
+
callback("on_text", data, encoding, ts, te);
|
299
|
+
|
300
|
+
fnext main;
|
301
|
+
};
|
302
|
+
|
295
303
|
# Text followed by a special tag, such as "foo<!--"
|
296
304
|
allowed_text @{ mark = p; } terminate_text => {
|
297
305
|
callback("on_text", data, encoding, ts, mark);
|
data/lib/oga.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
gem 'racc'
|
2
|
+
|
1
3
|
require 'ast'
|
2
4
|
require 'set'
|
3
5
|
require 'stringio'
|
@@ -9,7 +11,6 @@ require_relative 'oga/oga'
|
|
9
11
|
# Oga::XML namespace.
|
10
12
|
require_relative 'oga/xml/lexer'
|
11
13
|
require_relative 'oga/xml/parser'
|
12
|
-
require_relative 'oga/xml/pull_parser'
|
13
14
|
|
14
15
|
require_relative 'liboga'
|
15
16
|
|
@@ -19,6 +20,7 @@ if RUBY_PLATFORM == 'java'
|
|
19
20
|
end
|
20
21
|
#:nocov:
|
21
22
|
|
23
|
+
require_relative 'oga/xml/html_void_elements'
|
22
24
|
require_relative 'oga/xml/querying'
|
23
25
|
require_relative 'oga/xml/traversal'
|
24
26
|
require_relative 'oga/xml/node'
|
@@ -35,7 +37,11 @@ require_relative 'oga/xml/attribute'
|
|
35
37
|
require_relative 'oga/xml/element'
|
36
38
|
require_relative 'oga/xml/node_set'
|
37
39
|
|
40
|
+
require_relative 'oga/xml/sax_parser'
|
41
|
+
require_relative 'oga/xml/pull_parser'
|
42
|
+
|
38
43
|
require_relative 'oga/html/parser'
|
44
|
+
require_relative 'oga/html/sax_parser'
|
39
45
|
|
40
46
|
require_relative 'oga/xpath/node'
|
41
47
|
require_relative 'oga/xpath/lexer'
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Oga
|
2
|
+
module HTML
|
3
|
+
##
|
4
|
+
# SAX parser for HTML documents. See the documentation of
|
5
|
+
# {Oga::XML::SaxParser} for more information.
|
6
|
+
#
|
7
|
+
class SaxParser < XML::SaxParser
|
8
|
+
##
|
9
|
+
# @see Oga::XML::SaxParser#initialize
|
10
|
+
#
|
11
|
+
def initialize(handler, data, options = {})
|
12
|
+
options = options.merge(:html => true)
|
13
|
+
|
14
|
+
super(handler, data, options)
|
15
|
+
end
|
16
|
+
end # SaxParser
|
17
|
+
end # HTML
|
18
|
+
end # Oga
|
data/lib/oga/oga.rb
CHANGED
@@ -24,4 +24,34 @@ module Oga
|
|
24
24
|
def self.parse_html(html)
|
25
25
|
return HTML::Parser.new(html).parse
|
26
26
|
end
|
27
|
+
|
28
|
+
##
|
29
|
+
# Parses the given XML document using the SAX parser.
|
30
|
+
#
|
31
|
+
# @example
|
32
|
+
# handler = SomeSaxHandler.new
|
33
|
+
#
|
34
|
+
# Oga.sax_parse_xml(handler, '<root>Hello</root>')
|
35
|
+
#
|
36
|
+
# @param [Object] handler The SAX handler for the parser.
|
37
|
+
# @param [String|IO] xml The XML to parse.
|
38
|
+
#
|
39
|
+
def self.sax_parse_xml(handler, xml)
|
40
|
+
XML::SaxParser.new(handler, xml).parse
|
41
|
+
end
|
42
|
+
|
43
|
+
##
|
44
|
+
# Parses the given HTML document using the SAX parser.
|
45
|
+
#
|
46
|
+
# @example
|
47
|
+
# handler = SomeSaxHandler.new
|
48
|
+
#
|
49
|
+
# Oga.sax_parse_html(handler, '<script>foo()</script>')
|
50
|
+
#
|
51
|
+
# @param [Object] handler The SAX handler for the parser.
|
52
|
+
# @param [String|IO] html The HTML to parse.
|
53
|
+
#
|
54
|
+
def self.sax_parse_html(handler, html)
|
55
|
+
HTML::SaxParser.new(handler, html).parse
|
56
|
+
end
|
27
57
|
end # Oga
|
data/lib/oga/version.rb
CHANGED
data/lib/oga/xml/cdata.rb
CHANGED
data/lib/oga/xml/comment.rb
CHANGED
data/lib/oga/xml/doctype.rb
CHANGED
data/lib/oga/xml/document.rb
CHANGED
@@ -12,11 +12,15 @@ module Oga
|
|
12
12
|
# The XML declaration of the document.
|
13
13
|
# @return [Oga::XML::XmlDeclaration]
|
14
14
|
#
|
15
|
+
# @!attribute [rw] type
|
16
|
+
# The document type, either `:xml` or `:html`.
|
17
|
+
# @return [Symbol]
|
18
|
+
#
|
15
19
|
class Document
|
16
20
|
include Querying
|
17
21
|
include Traversal
|
18
22
|
|
19
|
-
attr_accessor :doctype, :xml_declaration
|
23
|
+
attr_accessor :doctype, :xml_declaration, :type
|
20
24
|
|
21
25
|
##
|
22
26
|
# @param [Hash] options
|
@@ -24,10 +28,12 @@ module Oga
|
|
24
28
|
# @option options [Oga::XML::NodeSet] :children
|
25
29
|
# @option options [Oga::XML::Doctype] :doctype
|
26
30
|
# @option options [Oga::XML::XmlDeclaration] :xml_declaration
|
31
|
+
# @option options [Symbol] :type
|
27
32
|
#
|
28
33
|
def initialize(options = {})
|
29
34
|
@doctype = options[:doctype]
|
30
35
|
@xml_declaration = options[:xml_declaration]
|
36
|
+
@type = options[:type] || :xml
|
31
37
|
|
32
38
|
self.children = options[:children] if options[:children]
|
33
39
|
end
|
data/lib/oga/xml/element.rb
CHANGED
@@ -144,7 +144,12 @@ module Oga
|
|
144
144
|
# @return [Oga::XML::Namespace]
|
145
145
|
#
|
146
146
|
def namespace
|
147
|
-
|
147
|
+
unless @namespace
|
148
|
+
available = available_namespaces
|
149
|
+
@namespace = available[namespace_name] || available[XMLNS_PREFIX]
|
150
|
+
end
|
151
|
+
|
152
|
+
return @namespace
|
148
153
|
end
|
149
154
|
|
150
155
|
##
|
@@ -206,7 +211,12 @@ module Oga
|
|
206
211
|
# @return [String]
|
207
212
|
#
|
208
213
|
def to_xml
|
209
|
-
|
214
|
+
if namespace_name
|
215
|
+
full_name = "#{namespace_name}:#{name}"
|
216
|
+
else
|
217
|
+
full_name = name
|
218
|
+
end
|
219
|
+
|
210
220
|
body = children.map(&:to_xml).join('')
|
211
221
|
attrs = ''
|
212
222
|
|
@@ -214,7 +224,11 @@ module Oga
|
|
214
224
|
attrs << " #{attr.to_xml}"
|
215
225
|
end
|
216
226
|
|
217
|
-
|
227
|
+
if self_closing?
|
228
|
+
return "<#{full_name}#{attrs} />"
|
229
|
+
else
|
230
|
+
return "<#{full_name}#{attrs}>#{body}</#{full_name}>"
|
231
|
+
end
|
218
232
|
end
|
219
233
|
|
220
234
|
##
|
@@ -236,13 +250,6 @@ module Oga
|
|
236
250
|
return "Element(#{segments.join(' ')})"
|
237
251
|
end
|
238
252
|
|
239
|
-
##
|
240
|
-
# @return [Symbol]
|
241
|
-
#
|
242
|
-
def node_type
|
243
|
-
return :element
|
244
|
-
end
|
245
|
-
|
246
253
|
##
|
247
254
|
# Registers a new namespace for the current element and its child
|
248
255
|
# elements.
|
@@ -270,13 +277,33 @@ module Oga
|
|
270
277
|
node = parent
|
271
278
|
|
272
279
|
while node && node.respond_to?(:namespaces)
|
273
|
-
|
274
|
-
|
280
|
+
node.namespaces.each do |prefix, ns|
|
281
|
+
merged[prefix] = ns unless merged[prefix]
|
282
|
+
end
|
283
|
+
|
284
|
+
node = node.parent
|
275
285
|
end
|
276
286
|
|
277
287
|
return merged
|
278
288
|
end
|
279
289
|
|
290
|
+
##
|
291
|
+
# Returns `true` if the element is a self-closing element.
|
292
|
+
#
|
293
|
+
# @return [TrueClass|FalseClass]
|
294
|
+
#
|
295
|
+
def self_closing?
|
296
|
+
self_closing = children.empty?
|
297
|
+
root = root_node
|
298
|
+
|
299
|
+
if root.is_a?(Document) and root.type == :html \
|
300
|
+
and !HTML_VOID_ELEMENTS.include?(name)
|
301
|
+
self_closing = false
|
302
|
+
end
|
303
|
+
|
304
|
+
return self_closing
|
305
|
+
end
|
306
|
+
|
280
307
|
private
|
281
308
|
|
282
309
|
##
|
@@ -284,14 +311,12 @@ module Oga
|
|
284
311
|
# has been registered the corresponding attribute is removed.
|
285
312
|
#
|
286
313
|
def register_namespaces_from_attributes
|
287
|
-
|
314
|
+
attributes.each do |attr|
|
288
315
|
# We're using `namespace_name` as opposed to `namespace.name` as "xmlns"
|
289
316
|
# is not a registered namespace.
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
remove
|
317
|
+
if attr.name == XMLNS_PREFIX or attr.namespace_name == XMLNS_PREFIX
|
318
|
+
register_namespace(attr.name, attr.value)
|
319
|
+
end
|
295
320
|
end
|
296
321
|
end
|
297
322
|
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Oga
|
2
|
+
module XML
|
3
|
+
##
|
4
|
+
# Names of the HTML void elements that should be handled when HTML lexing
|
5
|
+
# is enabled.
|
6
|
+
#
|
7
|
+
# @return [Set]
|
8
|
+
#
|
9
|
+
HTML_VOID_ELEMENTS = Set.new([
|
10
|
+
'area',
|
11
|
+
'base',
|
12
|
+
'br',
|
13
|
+
'col',
|
14
|
+
'command',
|
15
|
+
'embed',
|
16
|
+
'hr',
|
17
|
+
'img',
|
18
|
+
'input',
|
19
|
+
'keygen',
|
20
|
+
'link',
|
21
|
+
'meta',
|
22
|
+
'param',
|
23
|
+
'source',
|
24
|
+
'track',
|
25
|
+
'wbr'
|
26
|
+
])
|
27
|
+
end # XML
|
28
|
+
end # Oga
|
data/lib/oga/xml/lexer.rb
CHANGED
@@ -40,31 +40,6 @@ module Oga
|
|
40
40
|
class Lexer
|
41
41
|
attr_reader :html
|
42
42
|
|
43
|
-
##
|
44
|
-
# Names of the HTML void elements that should be handled when HTML lexing
|
45
|
-
# is enabled.
|
46
|
-
#
|
47
|
-
# @return [Set]
|
48
|
-
#
|
49
|
-
HTML_VOID_ELEMENTS = Set.new([
|
50
|
-
'area',
|
51
|
-
'base',
|
52
|
-
'br',
|
53
|
-
'col',
|
54
|
-
'command',
|
55
|
-
'embed',
|
56
|
-
'hr',
|
57
|
-
'img',
|
58
|
-
'input',
|
59
|
-
'keygen',
|
60
|
-
'link',
|
61
|
-
'meta',
|
62
|
-
'param',
|
63
|
-
'source',
|
64
|
-
'track',
|
65
|
-
'wbr'
|
66
|
-
])
|
67
|
-
|
68
43
|
##
|
69
44
|
# @param [String|IO] data The data to lex. This can either be a String or
|
70
45
|
# an IO instance.
|
@@ -347,7 +322,7 @@ module Oga
|
|
347
322
|
# Called on the closing `>` of the open tag of an element.
|
348
323
|
#
|
349
324
|
def on_element_open_end
|
350
|
-
if html? and HTML_VOID_ELEMENTS.include?(current_element)
|
325
|
+
if html? and HTML_VOID_ELEMENTS.include?(current_element.downcase)
|
351
326
|
add_token(:T_ELEM_END)
|
352
327
|
@elements.pop
|
353
328
|
end
|
data/lib/oga/xml/node.rb
CHANGED
data/lib/oga/xml/parser.rb
CHANGED
@@ -9,6 +9,35 @@ module Oga
|
|
9
9
|
module XML
|
10
10
|
class Parser < Racc::Parser
|
11
11
|
|
12
|
+
##
|
13
|
+
# Hash mapping token types to dedicated error labels.
|
14
|
+
#
|
15
|
+
# @return [Hash]
|
16
|
+
#
|
17
|
+
TOKEN_ERROR_MAPPING = {
|
18
|
+
'T_STRING' => 'string',
|
19
|
+
'T_TEXT' => 'text',
|
20
|
+
'T_DOCTYPE_START' => 'doctype start',
|
21
|
+
'T_DOCTYPE_END' => 'doctype closing tag',
|
22
|
+
'T_DOCTYPE_TYPE' => 'doctype type',
|
23
|
+
'T_DOCTYPE_NAME' => 'doctype name',
|
24
|
+
'T_DOCTYPE_INLINE' => 'inline doctype rules',
|
25
|
+
'T_CDATA' => 'CDATA',
|
26
|
+
'T_COMMENT' => 'comment',
|
27
|
+
'T_ELEM_START' => 'element start',
|
28
|
+
'T_ELEM_NAME' => 'element name',
|
29
|
+
'T_ELEM_NS' => 'element namespace',
|
30
|
+
'T_ELEM_END' => 'element closing tag',
|
31
|
+
'T_ATTR' => 'attribute',
|
32
|
+
'T_ATTR_NS' => 'attribute namespace',
|
33
|
+
'T_XML_DECL_START' => 'XML declaration start',
|
34
|
+
'T_XML_DECL_END' => 'XML declaration end',
|
35
|
+
'T_PROC_INS_START' => 'processing-instruction start',
|
36
|
+
'T_PROC_INS_NAME' => 'processing-instruction name',
|
37
|
+
'T_PROC_INS_END' => 'processing-instruction closing tag',
|
38
|
+
'$end' => 'end of input'
|
39
|
+
}
|
40
|
+
|
12
41
|
##
|
13
42
|
# @param [String|IO] data The input to parse.
|
14
43
|
# @param [Hash] options
|
@@ -53,6 +82,7 @@ module Oga
|
|
53
82
|
#
|
54
83
|
def on_error(type, value, stack)
|
55
84
|
name = token_to_str(type)
|
85
|
+
name = TOKEN_ERROR_MAPPING[name] || name
|
56
86
|
index = @line - 1
|
57
87
|
index_range = (index - 5)..(index + 5)
|
58
88
|
code = ''
|
@@ -84,7 +114,7 @@ module Oga
|
|
84
114
|
end
|
85
115
|
|
86
116
|
raise Racc::ParseError, <<-EOF.strip
|
87
|
-
Unexpected #{name}
|
117
|
+
Unexpected #{name} on line #{@line}:
|
88
118
|
|
89
119
|
#{code}
|
90
120
|
EOF
|
@@ -112,7 +142,9 @@ Unexpected #{name} with value #{value.inspect} on line #{@line}:
|
|
112
142
|
# @return [Oga::XML::Document]
|
113
143
|
#
|
114
144
|
def on_document(children = [])
|
115
|
-
document = Document.new
|
145
|
+
document = Document.new(
|
146
|
+
:type => @lexer.html ? :html : :xml
|
147
|
+
)
|
116
148
|
|
117
149
|
children.each do |child|
|
118
150
|
if child.is_a?(Doctype)
|
data/lib/oga/xml/pull_parser.rb
CHANGED
@@ -50,6 +50,21 @@ module Oga
|
|
50
50
|
:on_proc_ins
|
51
51
|
]
|
52
52
|
|
53
|
+
##
|
54
|
+
# Returns the shorthands that can be used for various node classes.
|
55
|
+
#
|
56
|
+
# @return [Hash]
|
57
|
+
#
|
58
|
+
NODE_SHORTHANDS = {
|
59
|
+
:text => XML::Text,
|
60
|
+
:node => XML::Node,
|
61
|
+
:cdata => XML::Cdata,
|
62
|
+
:element => XML::Element,
|
63
|
+
:doctype => XML::Doctype,
|
64
|
+
:comment => XML::Comment,
|
65
|
+
:xml_declaration => XML::XmlDeclaration
|
66
|
+
}
|
67
|
+
|
53
68
|
##
|
54
69
|
# @see Oga::XML::Parser#reset
|
55
70
|
#
|
@@ -89,8 +104,7 @@ module Oga
|
|
89
104
|
# Instead of this:
|
90
105
|
#
|
91
106
|
# parser.parse do |node|
|
92
|
-
# if node.
|
93
|
-
# and parser.nesting == %w{people person name}
|
107
|
+
# if node.is_a?(Oga::XML::Text) and parser.nesting == %w{people person name}
|
94
108
|
# puts node.text
|
95
109
|
# end
|
96
110
|
# end
|
@@ -113,7 +127,7 @@ module Oga
|
|
113
127
|
# @param [Array] nesting The element name nesting to act upon.
|
114
128
|
#
|
115
129
|
def on(type, nesting = [])
|
116
|
-
if node.
|
130
|
+
if node.is_a?(NODE_SHORTHANDS[type])
|
117
131
|
if nesting.empty? or nesting == self.nesting
|
118
132
|
yield
|
119
133
|
end
|