oga 0.2.0-java → 0.2.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -0
- data/doc/changelog.md +47 -0
- data/ext/c/lexer.c +847 -639
- data/ext/c/lexer.rl +39 -14
- data/ext/java/org/liboga/xml/Lexer.java +369 -263
- data/ext/java/org/liboga/xml/Lexer.rl +36 -0
- data/ext/ragel/base_lexer.rl +93 -35
- data/lib/liboga.jar +0 -0
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/document.rb +7 -0
- data/lib/oga/xml/element.rb +3 -6
- data/lib/oga/xml/entities.rb +13 -4
- data/lib/oga/xml/lexer.rb +17 -0
- data/lib/oga/xml/parser.rb +95 -84
- data/lib/oga/xml/text.rb +9 -1
- metadata +2 -2
@@ -101,6 +101,31 @@ public class Lexer extends RubyObject
|
|
101
101
|
int pe = data.length;
|
102
102
|
int eof = data.length;
|
103
103
|
|
104
|
+
String id_advance_line = "advance_line";
|
105
|
+
String id_on_attribute = "on_attribute";
|
106
|
+
String id_on_attribute_ns = "on_attribute_ns";
|
107
|
+
String id_on_cdata = "on_cdata";
|
108
|
+
String id_on_comment = "on_comment";
|
109
|
+
String id_on_doctype_end = "on_doctype_end";
|
110
|
+
String id_on_doctype_inline = "on_doctype_inline";
|
111
|
+
String id_on_doctype_name = "on_doctype_name";
|
112
|
+
String id_on_doctype_start = "on_doctype_start";
|
113
|
+
String id_on_doctype_type = "on_doctype_type";
|
114
|
+
String id_on_element_end = "on_element_end";
|
115
|
+
String id_on_element_name = "on_element_name";
|
116
|
+
String id_on_element_ns = "on_element_ns";
|
117
|
+
String id_on_element_open_end = "on_element_open_end";
|
118
|
+
String id_on_element_start = "on_element_start";
|
119
|
+
String id_on_proc_ins_end = "on_proc_ins_end";
|
120
|
+
String id_on_proc_ins_name = "on_proc_ins_name";
|
121
|
+
String id_on_proc_ins_start = "on_proc_ins_start";
|
122
|
+
String id_on_string_body = "on_string_body";
|
123
|
+
String id_on_string_dquote = "on_string_dquote";
|
124
|
+
String id_on_string_squote = "on_string_squote";
|
125
|
+
String id_on_text = "on_text";
|
126
|
+
String id_on_xml_decl_end = "on_xml_decl_end";
|
127
|
+
String id_on_xml_decl_start = "on_xml_decl_start";
|
128
|
+
|
104
129
|
%% write exec;
|
105
130
|
|
106
131
|
this.lines = lines;
|
@@ -160,6 +185,17 @@ public class Lexer extends RubyObject
|
|
160
185
|
|
161
186
|
this.callMethod(context, "advance_line", lines);
|
162
187
|
}
|
188
|
+
|
189
|
+
/**
|
190
|
+
* Returns true if we're in an HTML script tag. See
|
191
|
+
* Oga::XML::Lexer#inside_html_script? for more information.
|
192
|
+
*/
|
193
|
+
public Boolean inside_html_script_p()
|
194
|
+
{
|
195
|
+
ThreadContext context = this.runtime.getCurrentContext();
|
196
|
+
|
197
|
+
return this.callMethod(context, "inside_html_script?").isTrue();
|
198
|
+
}
|
163
199
|
}
|
164
200
|
|
165
201
|
%%{
|
data/ext/ragel/base_lexer.rl
CHANGED
@@ -28,6 +28,17 @@
|
|
28
28
|
# When you call a method in Ruby make sure that said method is defined as
|
29
29
|
# an instance method in the `Oga::XML::Lexer` class.
|
30
30
|
#
|
31
|
+
# The name of the callback to invoke should be an identifier starting with
|
32
|
+
# "id_". The identifier should be defined in the associated C and Java code.
|
33
|
+
# In case of C code its value should be a Symbol as an ID object, for Java
|
34
|
+
# it should be a String. For example:
|
35
|
+
#
|
36
|
+
# ID id_foo = rb_intern("foo");
|
37
|
+
#
|
38
|
+
# And for Java:
|
39
|
+
#
|
40
|
+
# String id_foo = "foo";
|
41
|
+
#
|
31
42
|
# ## Machine Transitions
|
32
43
|
#
|
33
44
|
# To transition from one machine to another always use `fnext` instead of
|
@@ -59,7 +70,7 @@
|
|
59
70
|
comment = comment_start (any* -- comment_end) comment_end;
|
60
71
|
|
61
72
|
action start_comment {
|
62
|
-
callback(
|
73
|
+
callback(id_on_comment, data, encoding, ts + 4, te - 3);
|
63
74
|
}
|
64
75
|
|
65
76
|
# CDATA
|
@@ -75,7 +86,7 @@
|
|
75
86
|
cdata = cdata_start (any* -- cdata_end) cdata_end;
|
76
87
|
|
77
88
|
action start_cdata {
|
78
|
-
callback(
|
89
|
+
callback(id_on_cdata, data, encoding, ts + 9, te - 3);
|
79
90
|
}
|
80
91
|
|
81
92
|
# Processing Instructions
|
@@ -93,8 +104,8 @@
|
|
93
104
|
proc_ins_end = '?>';
|
94
105
|
|
95
106
|
action start_proc_ins {
|
96
|
-
callback_simple(
|
97
|
-
callback(
|
107
|
+
callback_simple(id_on_proc_ins_start);
|
108
|
+
callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
|
98
109
|
|
99
110
|
mark = te;
|
100
111
|
|
@@ -103,8 +114,8 @@
|
|
103
114
|
|
104
115
|
proc_ins_body := |*
|
105
116
|
proc_ins_end => {
|
106
|
-
callback(
|
107
|
-
callback_simple(
|
117
|
+
callback(id_on_text, data, encoding, mark, ts);
|
118
|
+
callback_simple(id_on_proc_ins_end);
|
108
119
|
|
109
120
|
mark = 0;
|
110
121
|
|
@@ -124,7 +135,7 @@
|
|
124
135
|
squote = "'";
|
125
136
|
|
126
137
|
action emit_string {
|
127
|
-
callback(
|
138
|
+
callback(id_on_string_body, data, encoding, ts, te);
|
128
139
|
|
129
140
|
if ( lines > 0 )
|
130
141
|
{
|
@@ -135,13 +146,13 @@
|
|
135
146
|
}
|
136
147
|
|
137
148
|
action start_string_squote {
|
138
|
-
callback_simple(
|
149
|
+
callback_simple(id_on_string_squote);
|
139
150
|
|
140
151
|
fcall string_squote;
|
141
152
|
}
|
142
153
|
|
143
154
|
action start_string_dquote {
|
144
|
-
callback_simple(
|
155
|
+
callback_simple(id_on_string_dquote);
|
145
156
|
|
146
157
|
fcall string_dquote;
|
147
158
|
}
|
@@ -150,7 +161,7 @@
|
|
150
161
|
^squote* $count_newlines => emit_string;
|
151
162
|
|
152
163
|
squote => {
|
153
|
-
callback_simple(
|
164
|
+
callback_simple(id_on_string_squote);
|
154
165
|
|
155
166
|
fret;
|
156
167
|
};
|
@@ -160,7 +171,7 @@
|
|
160
171
|
^dquote* $count_newlines => emit_string;
|
161
172
|
|
162
173
|
dquote => {
|
163
|
-
callback_simple(
|
174
|
+
callback_simple(id_on_string_dquote);
|
164
175
|
|
165
176
|
fret;
|
166
177
|
};
|
@@ -179,22 +190,35 @@
|
|
179
190
|
doctype_start = '<!DOCTYPE'i whitespace+;
|
180
191
|
|
181
192
|
action start_doctype {
|
182
|
-
callback_simple(
|
193
|
+
callback_simple(id_on_doctype_start);
|
183
194
|
fnext doctype;
|
184
195
|
}
|
185
196
|
|
197
|
+
# Machine for processing inline rules of a doctype.
|
198
|
+
doctype_inline := |*
|
199
|
+
^']'* $count_newlines => {
|
200
|
+
callback(id_on_doctype_inline, data, encoding, ts, te);
|
201
|
+
|
202
|
+
if ( lines > 0 )
|
203
|
+
{
|
204
|
+
advance_line(lines);
|
205
|
+
|
206
|
+
lines = 0;
|
207
|
+
}
|
208
|
+
};
|
209
|
+
|
210
|
+
']' => { fnext doctype; };
|
211
|
+
*|;
|
212
|
+
|
186
213
|
# Machine for processing doctypes. Doctype values such as the public
|
187
214
|
# and system IDs are treated as T_STRING tokens.
|
188
215
|
doctype := |*
|
189
216
|
'PUBLIC' | 'SYSTEM' => {
|
190
|
-
callback(
|
217
|
+
callback(id_on_doctype_type, data, encoding, ts, te);
|
191
218
|
};
|
192
219
|
|
193
|
-
#
|
194
|
-
|
195
|
-
'[' any+ :> ']' => {
|
196
|
-
callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
|
197
|
-
};
|
220
|
+
# Starts a set of inline doctype rules.
|
221
|
+
'[' => { fnext doctype_inline; };
|
198
222
|
|
199
223
|
# Lex the public/system IDs as regular strings.
|
200
224
|
squote => start_string_squote;
|
@@ -205,11 +229,11 @@
|
|
205
229
|
whitespace;
|
206
230
|
|
207
231
|
identifier => {
|
208
|
-
callback(
|
232
|
+
callback(id_on_doctype_name, data, encoding, ts, te);
|
209
233
|
};
|
210
234
|
|
211
235
|
'>' => {
|
212
|
-
callback_simple(
|
236
|
+
callback_simple(id_on_doctype_end);
|
213
237
|
fnext main;
|
214
238
|
};
|
215
239
|
*|;
|
@@ -222,20 +246,20 @@
|
|
222
246
|
xml_decl_end = '?>';
|
223
247
|
|
224
248
|
action start_xml_decl {
|
225
|
-
callback_simple(
|
249
|
+
callback_simple(id_on_xml_decl_start);
|
226
250
|
fnext xml_decl;
|
227
251
|
}
|
228
252
|
|
229
253
|
# Machine that processes the contents of an XML declaration tag.
|
230
254
|
xml_decl := |*
|
231
255
|
xml_decl_end => {
|
232
|
-
callback_simple(
|
256
|
+
callback_simple(id_on_xml_decl_end);
|
233
257
|
fnext main;
|
234
258
|
};
|
235
259
|
|
236
260
|
# Attributes and their values (e.g. version="1.0").
|
237
261
|
identifier => {
|
238
|
-
callback(
|
262
|
+
callback(id_on_attribute, data, encoding, ts, te);
|
239
263
|
};
|
240
264
|
|
241
265
|
squote => start_string_squote;
|
@@ -257,23 +281,23 @@
|
|
257
281
|
element_end = '</' identifier (':' identifier)* '>';
|
258
282
|
|
259
283
|
action start_element {
|
260
|
-
callback_simple(
|
284
|
+
callback_simple(id_on_element_start);
|
261
285
|
fhold;
|
262
286
|
fnext element_name;
|
263
287
|
}
|
264
288
|
|
265
289
|
action close_element {
|
266
|
-
callback_simple(
|
290
|
+
callback_simple(id_on_element_end);
|
267
291
|
}
|
268
292
|
|
269
293
|
# Machine used for lexing the name/namespace of an element.
|
270
294
|
element_name := |*
|
271
295
|
identifier ':' => {
|
272
|
-
callback(
|
296
|
+
callback(id_on_element_ns, data, encoding, ts, te - 1);
|
273
297
|
};
|
274
298
|
|
275
299
|
identifier => {
|
276
|
-
callback(
|
300
|
+
callback(id_on_element_name, data, encoding, ts, te);
|
277
301
|
fnext element_head;
|
278
302
|
};
|
279
303
|
*|;
|
@@ -284,16 +308,16 @@
|
|
284
308
|
whitespace | '=';
|
285
309
|
|
286
310
|
newline => {
|
287
|
-
callback_simple(
|
311
|
+
callback_simple(id_advance_line);
|
288
312
|
};
|
289
313
|
|
290
314
|
# Attribute names and namespaces.
|
291
315
|
identifier ':' => {
|
292
|
-
callback(
|
316
|
+
callback(id_on_attribute_ns, data, encoding, ts, te - 1);
|
293
317
|
};
|
294
318
|
|
295
319
|
identifier => {
|
296
|
-
callback(
|
320
|
+
callback(id_on_attribute, data, encoding, ts, te);
|
297
321
|
};
|
298
322
|
|
299
323
|
# Attribute values.
|
@@ -302,13 +326,23 @@
|
|
302
326
|
|
303
327
|
# We're done with the open tag of the element.
|
304
328
|
'>' => {
|
305
|
-
callback_simple(
|
306
|
-
|
329
|
+
callback_simple(id_on_element_open_end);
|
330
|
+
|
331
|
+
if ( inside_html_script_p() )
|
332
|
+
{
|
333
|
+
mark = ts + 1;
|
334
|
+
|
335
|
+
fnext script_text;
|
336
|
+
}
|
337
|
+
else
|
338
|
+
{
|
339
|
+
fnext main;
|
340
|
+
}
|
307
341
|
};
|
308
342
|
|
309
343
|
# Self closing tags.
|
310
344
|
'/>' => {
|
311
|
-
callback_simple(
|
345
|
+
callback_simple(id_on_element_end);
|
312
346
|
fnext main;
|
313
347
|
};
|
314
348
|
*|;
|
@@ -337,7 +371,7 @@
|
|
337
371
|
|
338
372
|
text := |*
|
339
373
|
terminate_text | allowed_text => {
|
340
|
-
callback(
|
374
|
+
callback(id_on_text, data, encoding, ts, te);
|
341
375
|
|
342
376
|
if ( lines > 0 )
|
343
377
|
{
|
@@ -351,7 +385,7 @@
|
|
351
385
|
|
352
386
|
# Text followed by a special tag, such as "foo<!--"
|
353
387
|
allowed_text %{ mark = p; } terminate_text => {
|
354
|
-
callback(
|
388
|
+
callback(id_on_text, data, encoding, ts, mark);
|
355
389
|
|
356
390
|
p = mark - 1;
|
357
391
|
mark = 0;
|
@@ -367,6 +401,30 @@
|
|
367
401
|
};
|
368
402
|
*|;
|
369
403
|
|
404
|
+
# <script> tags in HTML can contain basically anything except for the
|
405
|
+
# literal "</script>". As a result of this we can't use the regular text
|
406
|
+
# machine.
|
407
|
+
script_text := |*
|
408
|
+
'</script>' => {
|
409
|
+
callback(id_on_text, data, encoding, mark, ts);
|
410
|
+
|
411
|
+
mark = 0;
|
412
|
+
|
413
|
+
if ( lines > 0 )
|
414
|
+
{
|
415
|
+
advance_line(lines);
|
416
|
+
|
417
|
+
lines = 0;
|
418
|
+
}
|
419
|
+
|
420
|
+
callback_simple(id_on_element_end);
|
421
|
+
|
422
|
+
fnext main;
|
423
|
+
};
|
424
|
+
|
425
|
+
any $count_newlines;
|
426
|
+
*|;
|
427
|
+
|
370
428
|
# The main machine aka the entry point of Ragel.
|
371
429
|
main := |*
|
372
430
|
doctype_start => start_doctype;
|
data/lib/liboga.jar
CHANGED
Binary file
|
data/lib/oga/version.rb
CHANGED
data/lib/oga/xml/document.rb
CHANGED
data/lib/oga/xml/element.rb
CHANGED
@@ -212,11 +212,8 @@ module Oga
|
|
212
212
|
# @param [String] text
|
213
213
|
#
|
214
214
|
def inner_text=(text)
|
215
|
-
|
216
|
-
|
217
|
-
end
|
218
|
-
|
219
|
-
children << XML::Text.new(:text => text)
|
215
|
+
text_node = XML::Text.new(:text => text)
|
216
|
+
@children = NodeSet.new([text_node])
|
220
217
|
end
|
221
218
|
|
222
219
|
##
|
@@ -310,7 +307,7 @@ module Oga
|
|
310
307
|
self_closing = children.empty?
|
311
308
|
root = root_node
|
312
309
|
|
313
|
-
if root.is_a?(Document) and root.
|
310
|
+
if root.is_a?(Document) and root.html? \
|
314
311
|
and !HTML_VOID_ELEMENTS.include?(name)
|
315
312
|
self_closing = false
|
316
313
|
end
|
data/lib/oga/xml/entities.rb
CHANGED
@@ -10,9 +10,16 @@ module Oga
|
|
10
10
|
# @return [Hash]
|
11
11
|
#
|
12
12
|
DECODE_MAPPING = {
|
13
|
-
'<'
|
14
|
-
'
|
15
|
-
'&
|
13
|
+
'&lt;' => '<',
|
14
|
+
'&#60;' => '<',
|
15
|
+
'&gt;' => '>',
|
16
|
+
'&#62;' => '>',
|
17
|
+
'&apos;' => "'",
|
18
|
+
'&#39;' => "'",
|
19
|
+
'&quot;' => '"',
|
20
|
+
'&#34;' => '"',
|
21
|
+
'&amp;' => '&',
|
22
|
+
'&#38;' => '&',
|
16
23
|
}
|
17
24
|
|
18
25
|
##
|
@@ -22,8 +29,10 @@ module Oga
|
|
22
29
|
#
|
23
30
|
ENCODE_MAPPING = {
|
24
31
|
'&' => '&amp;',
|
32
|
+
'"' => '&quot;',
|
33
|
+
"'" => '&apos;',
|
25
34
|
'>' => '&gt;',
|
26
|
-
'<' => '&lt;'
|
35
|
+
'<' => '&lt;',
|
27
36
|
}
|
28
37
|
|
29
38
|
##
|
data/lib/oga/xml/lexer.rb
CHANGED
@@ -40,6 +40,14 @@ module Oga
|
|
40
40
|
class Lexer
|
41
41
|
attr_reader :html
|
42
42
|
|
43
|
+
##
|
44
|
+
# Element name used to determine if a tag being processed is a Javascript
|
45
|
+
# tag.
|
46
|
+
#
|
47
|
+
# @return [String]
|
48
|
+
#
|
49
|
+
SCRIPT_TAG = 'script'.freeze
|
50
|
+
|
43
51
|
##
|
44
52
|
# @param [String|IO] data The data to lex. This can either be a String or
|
45
53
|
# an IO instance.
|
@@ -181,6 +189,15 @@ module Oga
|
|
181
189
|
return @elements.last
|
182
190
|
end
|
183
191
|
|
192
|
+
##
|
193
|
+
# Returns true if the current element is the HTML `<script>` element.
|
194
|
+
#
|
195
|
+
# @return [TrueClass|FalseClass]
|
196
|
+
#
|
197
|
+
def inside_html_script?
|
198
|
+
return html? && current_element == SCRIPT_TAG
|
199
|
+
end
|
200
|
+
|
184
201
|
##
|
185
202
|
# Called when processing a single quote.
|
186
203
|
#
|