oga 0.1.1-java → 0.1.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +21 -0
- data/doc/changelog.md +102 -1
- data/ext/c/lexer.c +63 -48
- data/ext/java/org/liboga/xml/Lexer.java +87 -101
- data/ext/ragel/base_lexer.rl +8 -0
- data/lib/liboga.jar +0 -0
- data/lib/oga.rb +6 -0
- data/lib/oga/html/sax_parser.rb +18 -0
- data/lib/oga/oga.rb +30 -0
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/document.rb +7 -1
- data/lib/oga/xml/element.rb +43 -11
- data/lib/oga/xml/html_void_elements.rb +28 -0
- data/lib/oga/xml/lexer.rb +1 -26
- data/lib/oga/xml/parser.rb +34 -2
- data/lib/oga/xml/sax_parser.rb +63 -0
- data/lib/oga/xpath/evaluator.rb +3 -2
- data/lib/oga/xpath/lexer.rb +75 -71
- data/lib/oga/xpath/parser.rb +65 -60
- metadata +27 -24
data/ext/ragel/base_lexer.rl
CHANGED
@@ -292,6 +292,14 @@
|
|
292
292
|
allowed_text = any* -- terminate_text;
|
293
293
|
|
294
294
|
text := |*
|
295
|
+
# Input such as just "</" or "<?". This rule takes precedence over the
|
296
|
+
# rules below, but only if those don't match.
|
297
|
+
terminate_text => {
|
298
|
+
callback("on_text", data, encoding, ts, te);
|
299
|
+
|
300
|
+
fnext main;
|
301
|
+
};
|
302
|
+
|
295
303
|
# Text followed by a special tag, such as "foo<!--"
|
296
304
|
allowed_text @{ mark = p; } terminate_text => {
|
297
305
|
callback("on_text", data, encoding, ts, mark);
|
data/lib/liboga.jar
CHANGED
Binary file
|
data/lib/oga.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
gem 'racc'
|
2
|
+
|
1
3
|
require 'ast'
|
2
4
|
require 'set'
|
3
5
|
require 'stringio'
|
@@ -18,6 +20,7 @@ if RUBY_PLATFORM == 'java'
|
|
18
20
|
end
|
19
21
|
#:nocov:
|
20
22
|
|
23
|
+
require_relative 'oga/xml/html_void_elements'
|
21
24
|
require_relative 'oga/xml/querying'
|
22
25
|
require_relative 'oga/xml/traversal'
|
23
26
|
require_relative 'oga/xml/node'
|
@@ -34,8 +37,11 @@ require_relative 'oga/xml/attribute'
|
|
34
37
|
require_relative 'oga/xml/element'
|
35
38
|
require_relative 'oga/xml/node_set'
|
36
39
|
|
40
|
+
require_relative 'oga/xml/sax_parser'
|
37
41
|
require_relative 'oga/xml/pull_parser'
|
42
|
+
|
38
43
|
require_relative 'oga/html/parser'
|
44
|
+
require_relative 'oga/html/sax_parser'
|
39
45
|
|
40
46
|
require_relative 'oga/xpath/node'
|
41
47
|
require_relative 'oga/xpath/lexer'
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Oga
  module HTML
    ##
    # SAX parser for HTML input. It is a thin subclass of
    # {Oga::XML::SaxParser} that always enables the `:html` option before
    # handing control to the parent class.
    #
    class SaxParser < XML::SaxParser
      ##
      # @param [Object] handler The SAX handler, passed through unchanged.
      # @param [String|IO] data The HTML to parse, passed through unchanged.
      # @param [Hash] options Extra parser options. The Hash itself is not
      #  modified; a copy with `:html => true` is passed to the parent.
      #
      # @see [Oga::XML::SaxParser#initialize]
      #
      def initialize(handler, data, options = {})
        html_options = options.merge(:html => true)

        super(handler, data, html_options)
      end
    end # SaxParser
  end # HTML
end # Oga
|
data/lib/oga/oga.rb
CHANGED
@@ -24,4 +24,34 @@ module Oga
|
|
24
24
|
def self.parse_html(html)
  # Build an HTML parser for the input and hand back whatever it produces.
  parser = HTML::Parser.new(html)

  parser.parse
end
|
27
|
+
|
28
|
+
##
# Parses the given XML document using the SAX parser. Callbacks are
# delivered to the supplied handler object.
#
# @example
#  handler = SomeSaxHandler.new
#
#  Oga.sax_parse_xml(handler, '<root>Hello</root>')
#
# @param [Object] handler The SAX handler for the parser.
# @param [String|IO] xml The XML to parse.
#
def self.sax_parse_xml(handler, xml)
  XML::SaxParser.new(handler, xml).parse
end
|
42
|
+
|
43
|
+
##
# Runs the HTML SAX parser over the given document, reporting to the
# supplied handler object.
#
# @example
#  handler = SomeSaxHandler.new
#
#  Oga.sax_parse_html(handler, '<script>foo()</script>')
#
# @param [Object] handler The SAX handler for the parser.
# @param [String|IO] html The HTML to parse.
#
def self.sax_parse_html(handler, html)
  parser = HTML::SaxParser.new(handler, html)

  parser.parse
end
|
27
57
|
end # Oga
|
data/lib/oga/version.rb
CHANGED
data/lib/oga/xml/document.rb
CHANGED
@@ -12,11 +12,15 @@ module Oga
|
|
12
12
|
# The XML declaration of the document.
|
13
13
|
# @return [Oga::XML::XmlDeclaration]
|
14
14
|
#
|
15
|
+
# @!attribute [rw] type
|
16
|
+
# The document type, either `:xml` or `:html`.
|
17
|
+
# @return [Symbol]
|
18
|
+
#
|
15
19
|
class Document
|
16
20
|
include Querying
|
17
21
|
include Traversal
|
18
22
|
|
19
|
-
attr_accessor :doctype, :xml_declaration
|
23
|
+
attr_accessor :doctype, :xml_declaration, :type
|
20
24
|
|
21
25
|
##
|
22
26
|
# @param [Hash] options
|
@@ -24,10 +28,12 @@ module Oga
|
|
24
28
|
# @option options [Oga::XML::NodeSet] :children
|
25
29
|
# @option options [Oga::XML::Doctype] :doctype
|
26
30
|
# @option options [Oga::XML::XmlDeclaration] :xml_declaration
|
31
|
+
# @option options [Symbol] :type
|
27
32
|
#
|
28
33
|
def initialize(options = {})
  # Plain attributes are copied straight from the options Hash; a missing
  # :type falls back to :xml.
  @doctype, @xml_declaration = options.values_at(:doctype, :xml_declaration)
  @type = options[:type] || :xml

  child_nodes = options[:children]

  # Go through the children= writer, but only when children were supplied.
  self.children = child_nodes if child_nodes
end
|
data/lib/oga/xml/element.rb
CHANGED
@@ -144,7 +144,12 @@ module Oga
|
|
144
144
|
# @return [Oga::XML::Namespace]
|
145
145
|
#
|
146
146
|
def namespace
  # Resolved on first access and cached in @namespace. The lookup tries the
  # element's own namespace name first and then the entry registered under
  # XMLNS_PREFIX.
  @namespace ||= begin
    registered = available_namespaces

    registered[namespace_name] || registered[XMLNS_PREFIX]
  end
end
|
149
154
|
|
150
155
|
##
|
@@ -206,7 +211,12 @@ module Oga
|
|
206
211
|
# @return [String]
|
207
212
|
#
|
208
213
|
def to_xml
|
209
|
-
|
214
|
+
if namespace_name
|
215
|
+
full_name = "#{namespace_name}:#{name}"
|
216
|
+
else
|
217
|
+
full_name = name
|
218
|
+
end
|
219
|
+
|
210
220
|
body = children.map(&:to_xml).join('')
|
211
221
|
attrs = ''
|
212
222
|
|
@@ -214,7 +224,11 @@ module Oga
|
|
214
224
|
attrs << " #{attr.to_xml}"
|
215
225
|
end
|
216
226
|
|
217
|
-
|
227
|
+
if self_closing?
|
228
|
+
return "<#{full_name}#{attrs} />"
|
229
|
+
else
|
230
|
+
return "<#{full_name}#{attrs}>#{body}</#{full_name}>"
|
231
|
+
end
|
218
232
|
end
|
219
233
|
|
220
234
|
##
|
@@ -263,13 +277,33 @@ module Oga
|
|
263
277
|
node = parent
|
264
278
|
|
265
279
|
while node && node.respond_to?(:namespaces)
|
266
|
-
|
267
|
-
|
280
|
+
node.namespaces.each do |prefix, ns|
|
281
|
+
merged[prefix] = ns unless merged[prefix]
|
282
|
+
end
|
283
|
+
|
284
|
+
node = node.parent
|
268
285
|
end
|
269
286
|
|
270
287
|
return merged
|
271
288
|
end
|
272
289
|
|
290
|
+
##
# Returns `true` if the element is a self-closing element.
#
# An element with children is never self-closing. An element without
# children is self-closing, unless its root node is a {Document} of type
# `:html`; in that case it must also be one of the HTML void elements. The
# name is compared case-insensitively (and nil-safely), matching the way the
# lexer looks up void elements.
#
# @return [TrueClass|FalseClass]
#
def self_closing?
  return false unless children.empty?

  root = root_node

  if root.is_a?(Document) && root.type == :html
    return HTML_VOID_ELEMENTS.include?(name.to_s.downcase)
  end

  true
end
|
306
|
+
|
273
307
|
private
|
274
308
|
|
275
309
|
##
|
@@ -277,14 +311,12 @@ module Oga
|
|
277
311
|
# has been registered the corresponding attribute is removed.
|
278
312
|
#
|
279
313
|
def register_namespaces_from_attributes
  attributes.each do |attribute|
    # `namespace_name` is compared, as opposed to `namespace.name`, because
    # "xmlns" is not a registered namespace.
    declares_namespace = attribute.name == XMLNS_PREFIX ||
      attribute.namespace_name == XMLNS_PREFIX

    next unless declares_namespace

    register_namespace(attribute.name, attribute.value)
  end
end
|
290
322
|
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # Names of the HTML void elements that should be handled when HTML lexing
    # is enabled. All names are lower-case. The Set is frozen so the shared
    # constant can't be modified by accident at runtime.
    #
    # @return [Set]
    #
    HTML_VOID_ELEMENTS = Set.new(%w[
      area base br col command embed hr img input keygen link meta param
      source track wbr
    ]).freeze
  end # XML
end # Oga
|
data/lib/oga/xml/lexer.rb
CHANGED
@@ -40,31 +40,6 @@ module Oga
|
|
40
40
|
class Lexer
|
41
41
|
attr_reader :html
|
42
42
|
|
43
|
-
##
|
44
|
-
# Names of the HTML void elements that should be handled when HTML lexing
|
45
|
-
# is enabled.
|
46
|
-
#
|
47
|
-
# @return [Set]
|
48
|
-
#
|
49
|
-
HTML_VOID_ELEMENTS = Set.new([
|
50
|
-
'area',
|
51
|
-
'base',
|
52
|
-
'br',
|
53
|
-
'col',
|
54
|
-
'command',
|
55
|
-
'embed',
|
56
|
-
'hr',
|
57
|
-
'img',
|
58
|
-
'input',
|
59
|
-
'keygen',
|
60
|
-
'link',
|
61
|
-
'meta',
|
62
|
-
'param',
|
63
|
-
'source',
|
64
|
-
'track',
|
65
|
-
'wbr'
|
66
|
-
])
|
67
|
-
|
68
43
|
##
|
69
44
|
# @param [String|IO] data The data to lex. This can either be a String or
|
70
45
|
# an IO instance.
|
@@ -347,7 +322,7 @@ module Oga
|
|
347
322
|
# Called on the closing `>` of the open tag of an element.
|
348
323
|
#
|
349
324
|
def on_element_open_end
|
350
|
-
if html? and HTML_VOID_ELEMENTS.include?(current_element)
|
325
|
+
if html? and HTML_VOID_ELEMENTS.include?(current_element.downcase)
|
351
326
|
add_token(:T_ELEM_END)
|
352
327
|
@elements.pop
|
353
328
|
end
|
data/lib/oga/xml/parser.rb
CHANGED
@@ -9,6 +9,35 @@ module Oga
|
|
9
9
|
module XML
|
10
10
|
class Parser < Racc::Parser
|
11
11
|
|
12
|
+
##
|
13
|
+
# Hash mapping token types to dedicated error labels.
|
14
|
+
#
|
15
|
+
# @return [Hash]
|
16
|
+
#
|
17
|
+
TOKEN_ERROR_MAPPING = {
|
18
|
+
'T_STRING' => 'string',
|
19
|
+
'T_TEXT' => 'text',
|
20
|
+
'T_DOCTYPE_START' => 'doctype start',
|
21
|
+
'T_DOCTYPE_END' => 'doctype closing tag',
|
22
|
+
'T_DOCTYPE_TYPE' => 'doctype type',
|
23
|
+
'T_DOCTYPE_NAME' => 'doctype name',
|
24
|
+
'T_DOCTYPE_INLINE' => 'inline doctype rules',
|
25
|
+
'T_CDATA' => 'CDATA',
|
26
|
+
'T_COMMENT' => 'comment',
|
27
|
+
'T_ELEM_START' => 'element start',
|
28
|
+
'T_ELEM_NAME' => 'element name',
|
29
|
+
'T_ELEM_NS' => 'element namespace',
|
30
|
+
'T_ELEM_END' => 'element closing tag',
|
31
|
+
'T_ATTR' => 'attribute',
|
32
|
+
'T_ATTR_NS' => 'attribute namespace',
|
33
|
+
'T_XML_DECL_START' => 'XML declaration start',
|
34
|
+
'T_XML_DECL_END' => 'XML declaration end',
|
35
|
+
'T_PROC_INS_START' => 'processing-instruction start',
|
36
|
+
'T_PROC_INS_NAME' => 'processing-instruction name',
|
37
|
+
'T_PROC_INS_END' => 'processing-instruction closing tag',
|
38
|
+
'$end' => 'end of input'
|
39
|
+
}
|
40
|
+
|
12
41
|
##
|
13
42
|
# @param [String|IO] data The input to parse.
|
14
43
|
# @param [Hash] options
|
@@ -53,6 +82,7 @@ module Oga
|
|
53
82
|
#
|
54
83
|
def on_error(type, value, stack)
|
55
84
|
name = token_to_str(type)
|
85
|
+
name = TOKEN_ERROR_MAPPING[name] || name
|
56
86
|
index = @line - 1
|
57
87
|
index_range = (index - 5)..(index + 5)
|
58
88
|
code = ''
|
@@ -84,7 +114,7 @@ module Oga
|
|
84
114
|
end
|
85
115
|
|
86
116
|
raise Racc::ParseError, <<-EOF.strip
|
87
|
-
Unexpected #{name}
|
117
|
+
Unexpected #{name} on line #{@line}:
|
88
118
|
|
89
119
|
#{code}
|
90
120
|
EOF
|
@@ -112,7 +142,9 @@ Unexpected #{name} with value #{value.inspect} on line #{@line}:
|
|
112
142
|
# @return [Oga::XML::Document]
|
113
143
|
#
|
114
144
|
def on_document(children = [])
|
115
|
-
document = Document.new
|
145
|
+
document = Document.new(
|
146
|
+
:type => @lexer.html ? :html : :xml
|
147
|
+
)
|
116
148
|
|
117
149
|
children.each do |child|
|
118
150
|
if child.is_a?(Doctype)
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module Oga
  module XML
    ##
    # The SaxParser class is the entry point for writing custom SAX parsers.
    # Instead of acting on the parser callbacks itself, it forwards every
    # callback method it inherits from {Oga::XML::Parser} to a separate
    # handler object.
    #
    # A handler is any object that implements one (or many) of the following
    # callback methods:
    #
    # * `on_document`
    # * `on_doctype`
    # * `on_cdata`
    # * `on_comment`
    # * `on_proc_ins`
    # * `on_xml_decl`
    # * `on_text`
    # * `on_element`
    # * `on_element_children`
    # * `after_element`
    #
    # For example:
    #
    #     class SaxHandler
    #       def on_element(namespace, name, attrs = {})
    #         puts name
    #       end
    #     end
    #
    # You can then use it as follows:
    #
    #     handler = SaxHandler.new
    #     parser  = Oga::XML::SaxParser.new(handler, '<foo />')
    #
    #     parser.parse
    #
    # For information on the callback arguments see the documentation of the
    # corresponding methods in {Oga::XML::Parser}.
    #
    class SaxParser < Parser
      ##
      # @param [Object] handler The SAX handler to delegate callbacks to.
      # @param [Array] parser_args Remaining arguments, passed to the parent
      #  class as-is.
      #
      # @see [Oga::XML::Parser#initialize]
      #
      def initialize(handler, *parser_args)
        @handler = handler

        super(*parser_args)
      end

      # Every instance method visible here whose name starts with "on_" or
      # "after_" is overridden with a delegator. The delegator calls the
      # method of the same name on the handler, but only when the handler
      # responds to it, and always returns nil.
      callback_names = instance_methods.grep(/^(on_|after_)/)

      callback_names.each do |method|
        class_eval <<-EOF, __FILE__, __LINE__ + 1
        def #{method}(*args)
          @handler.#{method}(*args) if @handler.respond_to?(:#{method})

          return
        end
        EOF
      end
    end # SaxParser
  end # XML
end # Oga
|
data/lib/oga/xpath/evaluator.rb
CHANGED
@@ -131,7 +131,8 @@ module Oga
|
|
131
131
|
context = XML::NodeSet.new([@document])
|
132
132
|
end
|
133
133
|
|
134
|
-
return
|
134
|
+
# If the expression is just "/" we'll just return the current context.
|
135
|
+
return ast_node.children.empty? ? context : on_path(ast_node, context)
|
135
136
|
end
|
136
137
|
|
137
138
|
##
|
@@ -1188,7 +1189,7 @@ module Oga
|
|
1188
1189
|
# This function call returns the substring of the 1st argument that occurs
|
1189
1190
|
# after the string given in the 2nd argument. For example:
|
1190
1191
|
#
|
1191
|
-
# substring-
|
1192
|
+
# substring-after("2014-08-25", "-")
|
1192
1193
|
#
|
1193
1194
|
# This would return "08-25" as it occurs after the first "-".
|
1194
1195
|
#
|