oga 0.3.1-java → 0.3.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/c/lexer.c +1234 -919
- data/ext/c/lexer.rl +12 -2
- data/ext/java/org/liboga/xml/Lexer.java +473 -317
- data/ext/java/org/liboga/xml/Lexer.rl +9 -2
- data/ext/ragel/base_lexer.rl +132 -17
- data/lib/liboga.jar +0 -0
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/document.rb +4 -2
- data/lib/oga/xml/element.rb +53 -14
- data/lib/oga/xml/lexer.rb +44 -5
- data/lib/oga/xml/node.rb +30 -14
- data/lib/oga/xml/parser.rb +153 -117
- data/lib/oga/xml/traversal.rb +3 -1
- metadata +2 -2
@@ -89,6 +89,8 @@ public class Lexer extends RubyObject
|
|
89
89
|
@JRubyMethod
|
90
90
|
public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
|
91
91
|
{
|
92
|
+
Boolean html_p = this.callMethod(context, "html").isTrue();
|
93
|
+
|
92
94
|
Encoding encoding = rb_str.getEncoding();
|
93
95
|
|
94
96
|
byte[] data = rb_str.getBytes();
|
@@ -104,8 +106,12 @@ public class Lexer extends RubyObject
|
|
104
106
|
String id_advance_line = "advance_line";
|
105
107
|
String id_on_attribute = "on_attribute";
|
106
108
|
String id_on_attribute_ns = "on_attribute_ns";
|
107
|
-
String
|
108
|
-
String
|
109
|
+
String id_on_cdata_start = "on_cdata_start";
|
110
|
+
String id_on_cdata_body = "on_cdata_body";
|
111
|
+
String id_on_cdata_end = "on_cdata_end";
|
112
|
+
String id_on_comment_start = "on_comment_start";
|
113
|
+
String id_on_comment_body = "on_comment_body";
|
114
|
+
String id_on_comment_end = "on_comment_end";
|
109
115
|
String id_on_doctype_end = "on_doctype_end";
|
110
116
|
String id_on_doctype_inline = "on_doctype_inline";
|
111
117
|
String id_on_doctype_name = "on_doctype_name";
|
@@ -119,6 +125,7 @@ public class Lexer extends RubyObject
|
|
119
125
|
String id_on_proc_ins_end = "on_proc_ins_end";
|
120
126
|
String id_on_proc_ins_name = "on_proc_ins_name";
|
121
127
|
String id_on_proc_ins_start = "on_proc_ins_start";
|
128
|
+
String id_on_proc_ins_body = "on_proc_ins_body";
|
122
129
|
String id_on_string_body = "on_string_body";
|
123
130
|
String id_on_string_dquote = "on_string_dquote";
|
124
131
|
String id_on_string_squote = "on_string_squote";
|
data/ext/ragel/base_lexer.rl
CHANGED
@@ -67,12 +67,35 @@
|
|
67
67
|
|
68
68
|
comment_start = '<!--';
|
69
69
|
comment_end = '-->';
|
70
|
-
|
70
|
+
|
71
|
+
# Everything except "-" OR a single "-"
|
72
|
+
comment_allowed = (^'-'+ | '-') $count_newlines;
|
71
73
|
|
72
74
|
action start_comment {
|
73
|
-
|
75
|
+
callback_simple(id_on_comment_start);
|
76
|
+
|
77
|
+
fnext comment_body;
|
74
78
|
}
|
75
79
|
|
80
|
+
comment_body := |*
|
81
|
+
comment_allowed => {
|
82
|
+
callback(id_on_comment_body, data, encoding, ts, te);
|
83
|
+
|
84
|
+
if ( lines > 0 )
|
85
|
+
{
|
86
|
+
advance_line(lines);
|
87
|
+
|
88
|
+
lines = 0;
|
89
|
+
}
|
90
|
+
};
|
91
|
+
|
92
|
+
comment_end => {
|
93
|
+
callback_simple(id_on_comment_end);
|
94
|
+
|
95
|
+
fnext main;
|
96
|
+
};
|
97
|
+
*|;
|
98
|
+
|
76
99
|
# CDATA
|
77
100
|
#
|
78
101
|
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
@@ -83,12 +106,35 @@
|
|
83
106
|
|
84
107
|
cdata_start = '<![CDATA[';
|
85
108
|
cdata_end = ']]>';
|
86
|
-
|
109
|
+
|
110
|
+
# Everything except "]" OR a single "]"
|
111
|
+
cdata_allowed = (^']'+ | ']') $count_newlines;
|
87
112
|
|
88
113
|
action start_cdata {
|
89
|
-
|
114
|
+
callback_simple(id_on_cdata_start);
|
115
|
+
|
116
|
+
fnext cdata_body;
|
90
117
|
}
|
91
118
|
|
119
|
+
cdata_body := |*
|
120
|
+
cdata_allowed => {
|
121
|
+
callback(id_on_cdata_body, data, encoding, ts, te);
|
122
|
+
|
123
|
+
if ( lines > 0 )
|
124
|
+
{
|
125
|
+
advance_line(lines);
|
126
|
+
|
127
|
+
lines = 0;
|
128
|
+
}
|
129
|
+
};
|
130
|
+
|
131
|
+
cdata_end => {
|
132
|
+
callback_simple(id_on_cdata_end);
|
133
|
+
|
134
|
+
fnext main;
|
135
|
+
};
|
136
|
+
*|;
|
137
|
+
|
92
138
|
# Processing Instructions
|
93
139
|
#
|
94
140
|
# http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
|
@@ -103,26 +149,33 @@
|
|
103
149
|
proc_ins_start = '<?' identifier;
|
104
150
|
proc_ins_end = '?>';
|
105
151
|
|
152
|
+
# Everything except "?" OR a single "?"
|
153
|
+
proc_ins_allowed = (^'?'+ | '?') $count_newlines;
|
154
|
+
|
106
155
|
action start_proc_ins {
|
107
156
|
callback_simple(id_on_proc_ins_start);
|
108
157
|
callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
|
109
158
|
|
110
|
-
mark = te;
|
111
|
-
|
112
159
|
fnext proc_ins_body;
|
113
160
|
}
|
114
161
|
|
115
162
|
proc_ins_body := |*
|
163
|
+
proc_ins_allowed => {
|
164
|
+
callback(id_on_proc_ins_body, data, encoding, ts, te);
|
165
|
+
|
166
|
+
if ( lines > 0 )
|
167
|
+
{
|
168
|
+
advance_line(lines);
|
169
|
+
|
170
|
+
lines = 0;
|
171
|
+
}
|
172
|
+
};
|
173
|
+
|
116
174
|
proc_ins_end => {
|
117
|
-
callback(id_on_text, data, encoding, mark, ts);
|
118
175
|
callback_simple(id_on_proc_ins_end);
|
119
176
|
|
120
|
-
mark = 0;
|
121
|
-
|
122
177
|
fnext main;
|
123
178
|
};
|
124
|
-
|
125
|
-
any;
|
126
179
|
*|;
|
127
180
|
|
128
181
|
# Strings
|
@@ -253,19 +306,34 @@
|
|
253
306
|
# Machine that processes the contents of an XML declaration tag.
|
254
307
|
xml_decl := |*
|
255
308
|
xml_decl_end => {
|
309
|
+
if ( lines > 0 )
|
310
|
+
{
|
311
|
+
advance_line(lines);
|
312
|
+
|
313
|
+
lines = 0;
|
314
|
+
}
|
315
|
+
|
256
316
|
callback_simple(id_on_xml_decl_end);
|
317
|
+
|
257
318
|
fnext main;
|
258
319
|
};
|
259
320
|
|
260
321
|
# Attributes and their values (e.g. version="1.0").
|
261
322
|
identifier => {
|
323
|
+
if ( lines > 0 )
|
324
|
+
{
|
325
|
+
advance_line(lines);
|
326
|
+
|
327
|
+
lines = 0;
|
328
|
+
}
|
329
|
+
|
262
330
|
callback(id_on_attribute, data, encoding, ts, te);
|
263
331
|
};
|
264
332
|
|
265
333
|
squote => start_string_squote;
|
266
334
|
dquote => start_string_dquote;
|
267
335
|
|
268
|
-
any;
|
336
|
+
any $count_newlines;
|
269
337
|
*|;
|
270
338
|
|
271
339
|
# Elements
|
@@ -302,10 +370,49 @@
|
|
302
370
|
};
|
303
371
|
*|;
|
304
372
|
|
373
|
+
action hold_start_element_head {
|
374
|
+
fhold;
|
375
|
+
fnext element_head;
|
376
|
+
}
|
377
|
+
|
378
|
+
# Characters that can be used for unquoted HTML attribute values.
|
379
|
+
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
380
|
+
# for more info.
|
381
|
+
html_unquoted_value = ^(
|
382
|
+
squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
|
383
|
+
)+;
|
384
|
+
|
385
|
+
# Machine used for processing HTML attribute values.
|
386
|
+
html_attribute_value := |*
|
387
|
+
squote | dquote => {
|
388
|
+
fhold;
|
389
|
+
fnext xml_attribute_value;
|
390
|
+
};
|
391
|
+
|
392
|
+
# Unquoted attribute values are lexed as if they were single quoted
|
393
|
+
# strings.
|
394
|
+
html_unquoted_value => {
|
395
|
+
callback_simple(id_on_string_squote);
|
396
|
+
|
397
|
+
callback(id_on_string_body, data, encoding, ts, te);
|
398
|
+
|
399
|
+
callback_simple(id_on_string_squote);
|
400
|
+
};
|
401
|
+
|
402
|
+
any => hold_start_element_head;
|
403
|
+
*|;
|
404
|
+
|
405
|
+
# Machine used for processing XML attribute values.
|
406
|
+
xml_attribute_value := |*
|
407
|
+
squote => start_string_squote;
|
408
|
+
dquote => start_string_dquote;
|
409
|
+
any => hold_start_element_head;
|
410
|
+
*|;
|
411
|
+
|
305
412
|
# Machine used for processing the contents of an element's starting tag.
|
306
413
|
# This includes the name, namespace and attributes.
|
307
414
|
element_head := |*
|
308
|
-
whitespace
|
415
|
+
whitespace;
|
309
416
|
|
310
417
|
newline => {
|
311
418
|
callback_simple(id_advance_line);
|
@@ -321,8 +428,16 @@
|
|
321
428
|
};
|
322
429
|
|
323
430
|
# Attribute values.
|
324
|
-
|
325
|
-
|
431
|
+
'=' => {
|
432
|
+
if ( html_p )
|
433
|
+
{
|
434
|
+
fnext html_attribute_value;
|
435
|
+
}
|
436
|
+
else
|
437
|
+
{
|
438
|
+
fnext xml_attribute_value;
|
439
|
+
}
|
440
|
+
};
|
326
441
|
|
327
442
|
# We're done with the open tag of the element.
|
328
443
|
'>' => {
|
@@ -438,8 +553,8 @@
|
|
438
553
|
main := |*
|
439
554
|
doctype_start => start_doctype;
|
440
555
|
xml_decl_start => start_xml_decl;
|
441
|
-
|
442
|
-
|
556
|
+
comment_start => start_comment;
|
557
|
+
cdata_start => start_cdata;
|
443
558
|
proc_ins_start => start_proc_ins;
|
444
559
|
element_start => start_element;
|
445
560
|
element_end => close_element;
|
data/lib/liboga.jar
CHANGED
Binary file
|
data/lib/oga/version.rb
CHANGED
data/lib/oga/xml/document.rb
CHANGED
@@ -12,7 +12,7 @@ module Oga
|
|
12
12
|
# The XML declaration of the document.
|
13
13
|
# @return [Oga::XML::XmlDeclaration]
|
14
14
|
#
|
15
|
-
# @!attribute [
|
15
|
+
# @!attribute [r] type
|
16
16
|
# The document type, either `:xml` or `:html`.
|
17
17
|
# @return [Symbol]
|
18
18
|
#
|
@@ -20,7 +20,9 @@ module Oga
|
|
20
20
|
include Querying
|
21
21
|
include Traversal
|
22
22
|
|
23
|
-
attr_accessor :doctype, :xml_declaration
|
23
|
+
attr_accessor :doctype, :xml_declaration
|
24
|
+
|
25
|
+
attr_reader :type
|
24
26
|
|
25
27
|
##
|
26
28
|
# @param [Hash] options
|
data/lib/oga/xml/element.rb
CHANGED
@@ -8,7 +8,7 @@ module Oga
|
|
8
8
|
# The name of the element.
|
9
9
|
# @return [String]
|
10
10
|
#
|
11
|
-
# @!attribute [
|
11
|
+
# @!attribute [r] namespace_name
|
12
12
|
# The name of the namespace.
|
13
13
|
# @return [String]
|
14
14
|
#
|
@@ -23,7 +23,9 @@ module Oga
|
|
23
23
|
class Element < Node
|
24
24
|
include Querying
|
25
25
|
|
26
|
-
|
26
|
+
attr_reader :namespace_name
|
27
|
+
|
28
|
+
attr_accessor :name, :attributes
|
27
29
|
|
28
30
|
attr_writer :namespaces
|
29
31
|
|
@@ -56,6 +58,14 @@ module Oga
|
|
56
58
|
register_namespaces_from_attributes
|
57
59
|
end
|
58
60
|
|
61
|
+
##
|
62
|
+
# @param [String] name
|
63
|
+
#
|
64
|
+
def namespace_name=(name)
|
65
|
+
@namespace_name = name
|
66
|
+
@namespace = nil
|
67
|
+
end
|
68
|
+
|
59
69
|
##
|
60
70
|
# Returns an attribute matching the given name (with or without the
|
61
71
|
# namespace).
|
@@ -289,14 +299,17 @@ module Oga
|
|
289
299
|
#
|
290
300
|
# @param [String] name
|
291
301
|
# @param [String] uri
|
302
|
+
# @param [TrueClass|FalseClass] flush
|
292
303
|
# @see [Oga::XML::Namespace#initialize]
|
293
304
|
#
|
294
|
-
def register_namespace(name, uri)
|
305
|
+
def register_namespace(name, uri, flush = true)
|
295
306
|
if namespaces[name]
|
296
307
|
raise ArgumentError, "The namespace #{name.inspect} already exists"
|
297
308
|
end
|
298
309
|
|
299
310
|
namespaces[name] = Namespace.new(:name => name, :uri => uri)
|
311
|
+
|
312
|
+
flush_namespaces_cache if flush
|
300
313
|
end
|
301
314
|
|
302
315
|
##
|
@@ -306,20 +319,25 @@ module Oga
|
|
306
319
|
# @return [Hash]
|
307
320
|
#
|
308
321
|
def available_namespaces
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
322
|
+
# HTML(5) completely ignores namespaces
|
323
|
+
if html?
|
324
|
+
return @available_namespaces ||= {}
|
325
|
+
elsif !@available_namespaces
|
326
|
+
merged = namespaces.dup
|
327
|
+
node = parent
|
328
|
+
|
329
|
+
while node && node.respond_to?(:namespaces)
|
330
|
+
node.namespaces.each do |prefix, ns|
|
331
|
+
merged[prefix] = ns unless merged[prefix]
|
332
|
+
end
|
333
|
+
|
334
|
+
node = node.parent
|
317
335
|
end
|
318
336
|
|
319
|
-
|
337
|
+
@available_namespaces = merged
|
320
338
|
end
|
321
339
|
|
322
|
-
return
|
340
|
+
return @available_namespaces
|
323
341
|
end
|
324
342
|
|
325
343
|
##
|
@@ -339,19 +357,40 @@ module Oga
|
|
339
357
|
return self_closing
|
340
358
|
end
|
341
359
|
|
360
|
+
##
|
361
|
+
# Flushes the namespaces cache of the current element and all its child
|
362
|
+
# elements.
|
363
|
+
#
|
364
|
+
def flush_namespaces_cache
|
365
|
+
@available_namespaces = nil
|
366
|
+
@namespace = nil
|
367
|
+
|
368
|
+
children.each do |child|
|
369
|
+
child.flush_namespaces_cache if child.is_a?(Element)
|
370
|
+
end
|
371
|
+
end
|
372
|
+
|
342
373
|
private
|
343
374
|
|
344
375
|
##
|
345
376
|
# Registers namespaces based on any "xmlns" attributes.
|
346
377
|
#
|
347
378
|
def register_namespaces_from_attributes
|
379
|
+
flush = false
|
380
|
+
|
348
381
|
attributes.each do |attr|
|
349
382
|
# We're using `namespace_name` opposed to `namespace.name` as "xmlns"
|
350
383
|
# is not a registered namespace.
|
351
384
|
if attr.name == XMLNS_PREFIX or attr.namespace_name == XMLNS_PREFIX
|
352
|
-
|
385
|
+
flush = true
|
386
|
+
|
387
|
+
# Ensures we only flush the cache once instead of flushing it on
|
388
|
+
# every register_namespace call.
|
389
|
+
register_namespace(attr.name, attr.value, false)
|
353
390
|
end
|
354
391
|
end
|
392
|
+
|
393
|
+
flush_namespaces_cache if flush
|
355
394
|
end
|
356
395
|
|
357
396
|
##
|
data/lib/oga/xml/lexer.rb
CHANGED
@@ -262,10 +262,40 @@ module Oga
|
|
262
262
|
end
|
263
263
|
|
264
264
|
##
|
265
|
-
# Called on
|
265
|
+
# Called on the open CDATA tag.
|
266
266
|
#
|
267
|
-
def
|
268
|
-
add_token(:
|
267
|
+
def on_cdata_start
|
268
|
+
add_token(:T_CDATA_START)
|
269
|
+
end
|
270
|
+
|
271
|
+
##
|
272
|
+
# Called on the closing CDATA tag.
|
273
|
+
#
|
274
|
+
def on_cdata_end
|
275
|
+
add_token(:T_CDATA_END)
|
276
|
+
end
|
277
|
+
|
278
|
+
##
|
279
|
+
# Called for the body of a CDATA tag.
|
280
|
+
#
|
281
|
+
# @param [String] value
|
282
|
+
#
|
283
|
+
def on_cdata_body(value)
|
284
|
+
add_token(:T_CDATA_BODY, value)
|
285
|
+
end
|
286
|
+
|
287
|
+
##
|
288
|
+
# Called on the open comment tag.
|
289
|
+
#
|
290
|
+
def on_comment_start
|
291
|
+
add_token(:T_COMMENT_START)
|
292
|
+
end
|
293
|
+
|
294
|
+
##
|
295
|
+
# Called on the closing comment tag.
|
296
|
+
#
|
297
|
+
def on_comment_end
|
298
|
+
add_token(:T_COMMENT_END)
|
269
299
|
end
|
270
300
|
|
271
301
|
##
|
@@ -273,8 +303,8 @@ module Oga
|
|
273
303
|
#
|
274
304
|
# @param [String] value
|
275
305
|
#
|
276
|
-
def
|
277
|
-
add_token(:
|
306
|
+
def on_comment_body(value)
|
307
|
+
add_token(:T_COMMENT_BODY, value)
|
278
308
|
end
|
279
309
|
|
280
310
|
##
|
@@ -314,6 +344,15 @@ module Oga
|
|
314
344
|
add_token(:T_PROC_INS_NAME, value)
|
315
345
|
end
|
316
346
|
|
347
|
+
##
|
348
|
+
# Called on the body of a processing instruction.
|
349
|
+
#
|
350
|
+
# @param [String] value
|
351
|
+
#
|
352
|
+
def on_proc_ins_body(value)
|
353
|
+
add_token(:T_PROC_INS_BODY, value)
|
354
|
+
end
|
355
|
+
|
317
356
|
##
|
318
357
|
# Called on the end of a processing instruction.
|
319
358
|
#
|