oga 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/c/lexer.c +1234 -919
- data/ext/c/lexer.rl +12 -2
- data/ext/java/org/liboga/xml/Lexer.java +473 -317
- data/ext/java/org/liboga/xml/Lexer.rl +9 -2
- data/ext/ragel/base_lexer.rl +132 -17
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/document.rb +4 -2
- data/lib/oga/xml/element.rb +53 -14
- data/lib/oga/xml/lexer.rb +44 -5
- data/lib/oga/xml/node.rb +30 -14
- data/lib/oga/xml/parser.rb +153 -117
- data/lib/oga/xml/traversal.rb +3 -1
- metadata +2 -2
@@ -89,6 +89,8 @@ public class Lexer extends RubyObject
|
|
89
89
|
@JRubyMethod
|
90
90
|
public IRubyObject advance_native(ThreadContext context, RubyString rb_str)
|
91
91
|
{
|
92
|
+
Boolean html_p = this.callMethod(context, "html").isTrue();
|
93
|
+
|
92
94
|
Encoding encoding = rb_str.getEncoding();
|
93
95
|
|
94
96
|
byte[] data = rb_str.getBytes();
|
@@ -104,8 +106,12 @@ public class Lexer extends RubyObject
|
|
104
106
|
String id_advance_line = "advance_line";
|
105
107
|
String id_on_attribute = "on_attribute";
|
106
108
|
String id_on_attribute_ns = "on_attribute_ns";
|
107
|
-
String
|
108
|
-
String
|
109
|
+
String id_on_cdata_start = "on_cdata_start";
|
110
|
+
String id_on_cdata_body = "on_cdata_body";
|
111
|
+
String id_on_cdata_end = "on_cdata_end";
|
112
|
+
String id_on_comment_start = "on_comment_start";
|
113
|
+
String id_on_comment_body = "on_comment_body";
|
114
|
+
String id_on_comment_end = "on_comment_end";
|
109
115
|
String id_on_doctype_end = "on_doctype_end";
|
110
116
|
String id_on_doctype_inline = "on_doctype_inline";
|
111
117
|
String id_on_doctype_name = "on_doctype_name";
|
@@ -119,6 +125,7 @@ public class Lexer extends RubyObject
|
|
119
125
|
String id_on_proc_ins_end = "on_proc_ins_end";
|
120
126
|
String id_on_proc_ins_name = "on_proc_ins_name";
|
121
127
|
String id_on_proc_ins_start = "on_proc_ins_start";
|
128
|
+
String id_on_proc_ins_body = "on_proc_ins_body";
|
122
129
|
String id_on_string_body = "on_string_body";
|
123
130
|
String id_on_string_dquote = "on_string_dquote";
|
124
131
|
String id_on_string_squote = "on_string_squote";
|
data/ext/ragel/base_lexer.rl
CHANGED
@@ -67,12 +67,35 @@
|
|
67
67
|
|
68
68
|
comment_start = '<!--';
|
69
69
|
comment_end = '-->';
|
70
|
-
|
70
|
+
|
71
|
+
# Everything except "-" OR a single "-"
|
72
|
+
comment_allowed = (^'-'+ | '-') $count_newlines;
|
71
73
|
|
72
74
|
action start_comment {
|
73
|
-
|
75
|
+
callback_simple(id_on_comment_start);
|
76
|
+
|
77
|
+
fnext comment_body;
|
74
78
|
}
|
75
79
|
|
80
|
+
comment_body := |*
|
81
|
+
comment_allowed => {
|
82
|
+
callback(id_on_comment_body, data, encoding, ts, te);
|
83
|
+
|
84
|
+
if ( lines > 0 )
|
85
|
+
{
|
86
|
+
advance_line(lines);
|
87
|
+
|
88
|
+
lines = 0;
|
89
|
+
}
|
90
|
+
};
|
91
|
+
|
92
|
+
comment_end => {
|
93
|
+
callback_simple(id_on_comment_end);
|
94
|
+
|
95
|
+
fnext main;
|
96
|
+
};
|
97
|
+
*|;
|
98
|
+
|
76
99
|
# CDATA
|
77
100
|
#
|
78
101
|
# http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
|
@@ -83,12 +106,35 @@
|
|
83
106
|
|
84
107
|
cdata_start = '<![CDATA[';
|
85
108
|
cdata_end = ']]>';
|
86
|
-
|
109
|
+
|
110
|
+
# Everything except "]" OR a single "]"
|
111
|
+
cdata_allowed = (^']'+ | ']') $count_newlines;
|
87
112
|
|
88
113
|
action start_cdata {
|
89
|
-
|
114
|
+
callback_simple(id_on_cdata_start);
|
115
|
+
|
116
|
+
fnext cdata_body;
|
90
117
|
}
|
91
118
|
|
119
|
+
cdata_body := |*
|
120
|
+
cdata_allowed => {
|
121
|
+
callback(id_on_cdata_body, data, encoding, ts, te);
|
122
|
+
|
123
|
+
if ( lines > 0 )
|
124
|
+
{
|
125
|
+
advance_line(lines);
|
126
|
+
|
127
|
+
lines = 0;
|
128
|
+
}
|
129
|
+
};
|
130
|
+
|
131
|
+
cdata_end => {
|
132
|
+
callback_simple(id_on_cdata_end);
|
133
|
+
|
134
|
+
fnext main;
|
135
|
+
};
|
136
|
+
*|;
|
137
|
+
|
92
138
|
# Processing Instructions
|
93
139
|
#
|
94
140
|
# http://www.w3.org/TR/xpath/#section-Processing-Instruction-Nodes
|
@@ -103,26 +149,33 @@
|
|
103
149
|
proc_ins_start = '<?' identifier;
|
104
150
|
proc_ins_end = '?>';
|
105
151
|
|
152
|
+
# Everything except "?" OR a single "?"
|
153
|
+
proc_ins_allowed = (^'?'+ | '?') $count_newlines;
|
154
|
+
|
106
155
|
action start_proc_ins {
|
107
156
|
callback_simple(id_on_proc_ins_start);
|
108
157
|
callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
|
109
158
|
|
110
|
-
mark = te;
|
111
|
-
|
112
159
|
fnext proc_ins_body;
|
113
160
|
}
|
114
161
|
|
115
162
|
proc_ins_body := |*
|
163
|
+
proc_ins_allowed => {
|
164
|
+
callback(id_on_proc_ins_body, data, encoding, ts, te);
|
165
|
+
|
166
|
+
if ( lines > 0 )
|
167
|
+
{
|
168
|
+
advance_line(lines);
|
169
|
+
|
170
|
+
lines = 0;
|
171
|
+
}
|
172
|
+
};
|
173
|
+
|
116
174
|
proc_ins_end => {
|
117
|
-
callback(id_on_text, data, encoding, mark, ts);
|
118
175
|
callback_simple(id_on_proc_ins_end);
|
119
176
|
|
120
|
-
mark = 0;
|
121
|
-
|
122
177
|
fnext main;
|
123
178
|
};
|
124
|
-
|
125
|
-
any;
|
126
179
|
*|;
|
127
180
|
|
128
181
|
# Strings
|
@@ -253,19 +306,34 @@
|
|
253
306
|
# Machine that processes the contents of an XML declaration tag.
|
254
307
|
xml_decl := |*
|
255
308
|
xml_decl_end => {
|
309
|
+
if ( lines > 0 )
|
310
|
+
{
|
311
|
+
advance_line(lines);
|
312
|
+
|
313
|
+
lines = 0;
|
314
|
+
}
|
315
|
+
|
256
316
|
callback_simple(id_on_xml_decl_end);
|
317
|
+
|
257
318
|
fnext main;
|
258
319
|
};
|
259
320
|
|
260
321
|
# Attributes and their values (e.g. version="1.0").
|
261
322
|
identifier => {
|
323
|
+
if ( lines > 0 )
|
324
|
+
{
|
325
|
+
advance_line(lines);
|
326
|
+
|
327
|
+
lines = 0;
|
328
|
+
}
|
329
|
+
|
262
330
|
callback(id_on_attribute, data, encoding, ts, te);
|
263
331
|
};
|
264
332
|
|
265
333
|
squote => start_string_squote;
|
266
334
|
dquote => start_string_dquote;
|
267
335
|
|
268
|
-
any;
|
336
|
+
any $count_newlines;
|
269
337
|
*|;
|
270
338
|
|
271
339
|
# Elements
|
@@ -302,10 +370,49 @@
|
|
302
370
|
};
|
303
371
|
*|;
|
304
372
|
|
373
|
+
action hold_start_element_head {
|
374
|
+
fhold;
|
375
|
+
fnext element_head;
|
376
|
+
}
|
377
|
+
|
378
|
+
# Characters that can be used for unquoted HTML attribute values.
|
379
|
+
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
380
|
+
# for more info.
|
381
|
+
html_unquoted_value = ^(
|
382
|
+
squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
|
383
|
+
)+;
|
384
|
+
|
385
|
+
# Machine used for processing HTML attribute values.
|
386
|
+
html_attribute_value := |*
|
387
|
+
squote | dquote => {
|
388
|
+
fhold;
|
389
|
+
fnext xml_attribute_value;
|
390
|
+
};
|
391
|
+
|
392
|
+
# Unquoted attribute values are lexed as if they were single quoted
|
393
|
+
# strings.
|
394
|
+
html_unquoted_value => {
|
395
|
+
callback_simple(id_on_string_squote);
|
396
|
+
|
397
|
+
callback(id_on_string_body, data, encoding, ts, te);
|
398
|
+
|
399
|
+
callback_simple(id_on_string_squote);
|
400
|
+
};
|
401
|
+
|
402
|
+
any => hold_start_element_head;
|
403
|
+
*|;
|
404
|
+
|
405
|
+
# Machine used for processing XML attribute values.
|
406
|
+
xml_attribute_value := |*
|
407
|
+
squote => start_string_squote;
|
408
|
+
dquote => start_string_dquote;
|
409
|
+
any => hold_start_element_head;
|
410
|
+
*|;
|
411
|
+
|
305
412
|
# Machine used for processing the contents of an element's starting tag.
|
306
413
|
# This includes the name, namespace and attributes.
|
307
414
|
element_head := |*
|
308
|
-
whitespace
|
415
|
+
whitespace;
|
309
416
|
|
310
417
|
newline => {
|
311
418
|
callback_simple(id_advance_line);
|
@@ -321,8 +428,16 @@
|
|
321
428
|
};
|
322
429
|
|
323
430
|
# Attribute values.
|
324
|
-
|
325
|
-
|
431
|
+
'=' => {
|
432
|
+
if ( html_p )
|
433
|
+
{
|
434
|
+
fnext html_attribute_value;
|
435
|
+
}
|
436
|
+
else
|
437
|
+
{
|
438
|
+
fnext xml_attribute_value;
|
439
|
+
}
|
440
|
+
};
|
326
441
|
|
327
442
|
# We're done with the open tag of the element.
|
328
443
|
'>' => {
|
@@ -438,8 +553,8 @@
|
|
438
553
|
main := |*
|
439
554
|
doctype_start => start_doctype;
|
440
555
|
xml_decl_start => start_xml_decl;
|
441
|
-
|
442
|
-
|
556
|
+
comment_start => start_comment;
|
557
|
+
cdata_start => start_cdata;
|
443
558
|
proc_ins_start => start_proc_ins;
|
444
559
|
element_start => start_element;
|
445
560
|
element_end => close_element;
|
data/lib/oga/version.rb
CHANGED
data/lib/oga/xml/document.rb
CHANGED
@@ -12,7 +12,7 @@ module Oga
|
|
12
12
|
# The XML declaration of the document.
|
13
13
|
# @return [Oga::XML::XmlDeclaration]
|
14
14
|
#
|
15
|
-
# @!attribute [
|
15
|
+
# @!attribute [r] type
|
16
16
|
# The document type, either `:xml` or `:html`.
|
17
17
|
# @return [Symbol]
|
18
18
|
#
|
@@ -20,7 +20,9 @@ module Oga
|
|
20
20
|
include Querying
|
21
21
|
include Traversal
|
22
22
|
|
23
|
-
attr_accessor :doctype, :xml_declaration
|
23
|
+
attr_accessor :doctype, :xml_declaration
|
24
|
+
|
25
|
+
attr_reader :type
|
24
26
|
|
25
27
|
##
|
26
28
|
# @param [Hash] options
|
data/lib/oga/xml/element.rb
CHANGED
@@ -8,7 +8,7 @@ module Oga
|
|
8
8
|
# The name of the element.
|
9
9
|
# @return [String]
|
10
10
|
#
|
11
|
-
# @!attribute [
|
11
|
+
# @!attribute [r] namespace_name
|
12
12
|
# The name of the namespace.
|
13
13
|
# @return [String]
|
14
14
|
#
|
@@ -23,7 +23,9 @@ module Oga
|
|
23
23
|
class Element < Node
|
24
24
|
include Querying
|
25
25
|
|
26
|
-
|
26
|
+
attr_reader :namespace_name
|
27
|
+
|
28
|
+
attr_accessor :name, :attributes
|
27
29
|
|
28
30
|
attr_writer :namespaces
|
29
31
|
|
@@ -56,6 +58,14 @@ module Oga
|
|
56
58
|
register_namespaces_from_attributes
|
57
59
|
end
|
58
60
|
|
61
|
+
##
|
62
|
+
# @param [String] name
|
63
|
+
#
|
64
|
+
def namespace_name=(name)
|
65
|
+
@namespace_name = name
|
66
|
+
@namespace = nil
|
67
|
+
end
|
68
|
+
|
59
69
|
##
|
60
70
|
# Returns an attribute matching the given name (with or without the
|
61
71
|
# namespace).
|
@@ -289,14 +299,17 @@ module Oga
|
|
289
299
|
#
|
290
300
|
# @param [String] name
|
291
301
|
# @param [String] uri
|
302
|
+
# @param [TrueClass|FalseClass] flush
|
292
303
|
# @see [Oga::XML::Namespace#initialize]
|
293
304
|
#
|
294
|
-
def register_namespace(name, uri)
|
305
|
+
def register_namespace(name, uri, flush = true)
|
295
306
|
if namespaces[name]
|
296
307
|
raise ArgumentError, "The namespace #{name.inspect} already exists"
|
297
308
|
end
|
298
309
|
|
299
310
|
namespaces[name] = Namespace.new(:name => name, :uri => uri)
|
311
|
+
|
312
|
+
flush_namespaces_cache if flush
|
300
313
|
end
|
301
314
|
|
302
315
|
##
|
@@ -306,20 +319,25 @@ module Oga
|
|
306
319
|
# @return [Hash]
|
307
320
|
#
|
308
321
|
def available_namespaces
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
322
|
+
# HTML(5) completely ignores namespaces
|
323
|
+
if html?
|
324
|
+
return @available_namespaces ||= {}
|
325
|
+
elsif !@available_namespaces
|
326
|
+
merged = namespaces.dup
|
327
|
+
node = parent
|
328
|
+
|
329
|
+
while node && node.respond_to?(:namespaces)
|
330
|
+
node.namespaces.each do |prefix, ns|
|
331
|
+
merged[prefix] = ns unless merged[prefix]
|
332
|
+
end
|
333
|
+
|
334
|
+
node = node.parent
|
317
335
|
end
|
318
336
|
|
319
|
-
|
337
|
+
@available_namespaces = merged
|
320
338
|
end
|
321
339
|
|
322
|
-
return
|
340
|
+
return @available_namespaces
|
323
341
|
end
|
324
342
|
|
325
343
|
##
|
@@ -339,19 +357,40 @@ module Oga
|
|
339
357
|
return self_closing
|
340
358
|
end
|
341
359
|
|
360
|
+
##
|
361
|
+
# Flushes the namespaces cache of the current element and all its child
|
362
|
+
# elements.
|
363
|
+
#
|
364
|
+
def flush_namespaces_cache
|
365
|
+
@available_namespaces = nil
|
366
|
+
@namespace = nil
|
367
|
+
|
368
|
+
children.each do |child|
|
369
|
+
child.flush_namespaces_cache if child.is_a?(Element)
|
370
|
+
end
|
371
|
+
end
|
372
|
+
|
342
373
|
private
|
343
374
|
|
344
375
|
##
|
345
376
|
# Registers namespaces based on any "xmlns" attributes.
|
346
377
|
#
|
347
378
|
def register_namespaces_from_attributes
|
379
|
+
flush = false
|
380
|
+
|
348
381
|
attributes.each do |attr|
|
349
382
|
# We're using `namespace_name` as opposed to `namespace.name` as "xmlns"
|
350
383
|
# is not a registered namespace.
|
351
384
|
if attr.name == XMLNS_PREFIX or attr.namespace_name == XMLNS_PREFIX
|
352
|
-
|
385
|
+
flush = true
|
386
|
+
|
387
|
+
# Ensures we only flush the cache once instead of flushing it on
|
388
|
+
# every register_namespace call.
|
389
|
+
register_namespace(attr.name, attr.value, false)
|
353
390
|
end
|
354
391
|
end
|
392
|
+
|
393
|
+
flush_namespaces_cache if flush
|
355
394
|
end
|
356
395
|
|
357
396
|
##
|
data/lib/oga/xml/lexer.rb
CHANGED
@@ -262,10 +262,40 @@ module Oga
|
|
262
262
|
end
|
263
263
|
|
264
264
|
##
|
265
|
-
# Called on
|
265
|
+
# Called on the open CDATA tag.
|
266
266
|
#
|
267
|
-
def
|
268
|
-
add_token(:
|
267
|
+
def on_cdata_start
|
268
|
+
add_token(:T_CDATA_START)
|
269
|
+
end
|
270
|
+
|
271
|
+
##
|
272
|
+
# Called on the closing CDATA tag.
|
273
|
+
#
|
274
|
+
def on_cdata_end
|
275
|
+
add_token(:T_CDATA_END)
|
276
|
+
end
|
277
|
+
|
278
|
+
##
|
279
|
+
# Called for the body of a CDATA tag.
|
280
|
+
#
|
281
|
+
# @param [String] value
|
282
|
+
#
|
283
|
+
def on_cdata_body(value)
|
284
|
+
add_token(:T_CDATA_BODY, value)
|
285
|
+
end
|
286
|
+
|
287
|
+
##
|
288
|
+
# Called on the open comment tag.
|
289
|
+
#
|
290
|
+
def on_comment_start
|
291
|
+
add_token(:T_COMMENT_START)
|
292
|
+
end
|
293
|
+
|
294
|
+
##
|
295
|
+
# Called on the closing comment tag.
|
296
|
+
#
|
297
|
+
def on_comment_end
|
298
|
+
add_token(:T_COMMENT_END)
|
269
299
|
end
|
270
300
|
|
271
301
|
##
|
@@ -273,8 +303,8 @@ module Oga
|
|
273
303
|
#
|
274
304
|
# @param [String] value
|
275
305
|
#
|
276
|
-
def
|
277
|
-
add_token(:
|
306
|
+
def on_comment_body(value)
|
307
|
+
add_token(:T_COMMENT_BODY, value)
|
278
308
|
end
|
279
309
|
|
280
310
|
##
|
@@ -314,6 +344,15 @@ module Oga
|
|
314
344
|
add_token(:T_PROC_INS_NAME, value)
|
315
345
|
end
|
316
346
|
|
347
|
+
##
|
348
|
+
# Called on the body of a processing instruction.
|
349
|
+
#
|
350
|
+
# @param [String] value
|
351
|
+
#
|
352
|
+
def on_proc_ins_body(value)
|
353
|
+
add_token(:T_PROC_INS_BODY, value)
|
354
|
+
end
|
355
|
+
|
317
356
|
##
|
318
357
|
# Called on the end of a processing instruction.
|
319
358
|
#
|