oga 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +10 -0
- data/doc/changelog.md +47 -0
- data/ext/c/lexer.c +847 -639
- data/ext/c/lexer.rl +39 -14
- data/ext/java/org/liboga/xml/Lexer.java +369 -263
- data/ext/java/org/liboga/xml/Lexer.rl +36 -0
- data/ext/ragel/base_lexer.rl +93 -35
- data/lib/oga/css/parser.rb +39 -37
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/document.rb +7 -0
- data/lib/oga/xml/element.rb +3 -6
- data/lib/oga/xml/entities.rb +13 -4
- data/lib/oga/xml/lexer.rb +17 -0
- data/lib/oga/xml/parser.rb +96 -85
- data/lib/oga/xml/text.rb +9 -1
- data/lib/oga/xpath/parser.rb +88 -90
- metadata +3 -3
@@ -101,6 +101,31 @@ public class Lexer extends RubyObject
|
|
101
101
|
int pe = data.length;
|
102
102
|
int eof = data.length;
|
103
103
|
|
104
|
+
String id_advance_line = "advance_line";
|
105
|
+
String id_on_attribute = "on_attribute";
|
106
|
+
String id_on_attribute_ns = "on_attribute_ns";
|
107
|
+
String id_on_cdata = "on_cdata";
|
108
|
+
String id_on_comment = "on_comment";
|
109
|
+
String id_on_doctype_end = "on_doctype_end";
|
110
|
+
String id_on_doctype_inline = "on_doctype_inline";
|
111
|
+
String id_on_doctype_name = "on_doctype_name";
|
112
|
+
String id_on_doctype_start = "on_doctype_start";
|
113
|
+
String id_on_doctype_type = "on_doctype_type";
|
114
|
+
String id_on_element_end = "on_element_end";
|
115
|
+
String id_on_element_name = "on_element_name";
|
116
|
+
String id_on_element_ns = "on_element_ns";
|
117
|
+
String id_on_element_open_end = "on_element_open_end";
|
118
|
+
String id_on_element_start = "on_element_start";
|
119
|
+
String id_on_proc_ins_end = "on_proc_ins_end";
|
120
|
+
String id_on_proc_ins_name = "on_proc_ins_name";
|
121
|
+
String id_on_proc_ins_start = "on_proc_ins_start";
|
122
|
+
String id_on_string_body = "on_string_body";
|
123
|
+
String id_on_string_dquote = "on_string_dquote";
|
124
|
+
String id_on_string_squote = "on_string_squote";
|
125
|
+
String id_on_text = "on_text";
|
126
|
+
String id_on_xml_decl_end = "on_xml_decl_end";
|
127
|
+
String id_on_xml_decl_start = "on_xml_decl_start";
|
128
|
+
|
104
129
|
%% write exec;
|
105
130
|
|
106
131
|
this.lines = lines;
|
@@ -160,6 +185,17 @@ public class Lexer extends RubyObject
|
|
160
185
|
|
161
186
|
this.callMethod(context, "advance_line", lines);
|
162
187
|
}
|
188
|
+
|
189
|
+
/**
|
190
|
+
* Returns true if we're in an HTML script tag. See
|
191
|
+
* Oga::XML::Lexer#inside_html_script? for more information.
|
192
|
+
*/
|
193
|
+
public Boolean inside_html_script_p()
|
194
|
+
{
|
195
|
+
ThreadContext context = this.runtime.getCurrentContext();
|
196
|
+
|
197
|
+
return this.callMethod(context, "inside_html_script?").isTrue();
|
198
|
+
}
|
163
199
|
}
|
164
200
|
|
165
201
|
%%{
|
data/ext/ragel/base_lexer.rl
CHANGED
@@ -28,6 +28,17 @@
|
|
28
28
|
# When you call a method in Ruby make sure that said method is defined as
|
29
29
|
# an instance method in the `Oga::XML::Lexer` class.
|
30
30
|
#
|
31
|
+
# The name of the callback to invoke should be an identifier starting with
|
32
|
+
# "id_". The identifier should be defined in the associated C and Java code.
|
33
|
+
# In case of C code its value should be a Symbol as an ID object, for Java
|
34
|
+
# it should be a String. For example:
|
35
|
+
#
|
36
|
+
# ID id_foo = rb_intern("foo");
|
37
|
+
#
|
38
|
+
# And for Java:
|
39
|
+
#
|
40
|
+
# String id_foo = "foo";
|
41
|
+
#
|
31
42
|
# ## Machine Transitions
|
32
43
|
#
|
33
44
|
# To transition from one machine to another always use `fnext` instead of
|
@@ -59,7 +70,7 @@
|
|
59
70
|
comment = comment_start (any* -- comment_end) comment_end;
|
60
71
|
|
61
72
|
action start_comment {
|
62
|
-
callback(
|
73
|
+
callback(id_on_comment, data, encoding, ts + 4, te - 3);
|
63
74
|
}
|
64
75
|
|
65
76
|
# CDATA
|
@@ -75,7 +86,7 @@
|
|
75
86
|
cdata = cdata_start (any* -- cdata_end) cdata_end;
|
76
87
|
|
77
88
|
action start_cdata {
|
78
|
-
callback(
|
89
|
+
callback(id_on_cdata, data, encoding, ts + 9, te - 3);
|
79
90
|
}
|
80
91
|
|
81
92
|
# Processing Instructions
|
@@ -93,8 +104,8 @@
|
|
93
104
|
proc_ins_end = '?>';
|
94
105
|
|
95
106
|
action start_proc_ins {
|
96
|
-
callback_simple(
|
97
|
-
callback(
|
107
|
+
callback_simple(id_on_proc_ins_start);
|
108
|
+
callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
|
98
109
|
|
99
110
|
mark = te;
|
100
111
|
|
@@ -103,8 +114,8 @@
|
|
103
114
|
|
104
115
|
proc_ins_body := |*
|
105
116
|
proc_ins_end => {
|
106
|
-
callback(
|
107
|
-
callback_simple(
|
117
|
+
callback(id_on_text, data, encoding, mark, ts);
|
118
|
+
callback_simple(id_on_proc_ins_end);
|
108
119
|
|
109
120
|
mark = 0;
|
110
121
|
|
@@ -124,7 +135,7 @@
|
|
124
135
|
squote = "'";
|
125
136
|
|
126
137
|
action emit_string {
|
127
|
-
callback(
|
138
|
+
callback(id_on_string_body, data, encoding, ts, te);
|
128
139
|
|
129
140
|
if ( lines > 0 )
|
130
141
|
{
|
@@ -135,13 +146,13 @@
|
|
135
146
|
}
|
136
147
|
|
137
148
|
action start_string_squote {
|
138
|
-
callback_simple(
|
149
|
+
callback_simple(id_on_string_squote);
|
139
150
|
|
140
151
|
fcall string_squote;
|
141
152
|
}
|
142
153
|
|
143
154
|
action start_string_dquote {
|
144
|
-
callback_simple(
|
155
|
+
callback_simple(id_on_string_dquote);
|
145
156
|
|
146
157
|
fcall string_dquote;
|
147
158
|
}
|
@@ -150,7 +161,7 @@
|
|
150
161
|
^squote* $count_newlines => emit_string;
|
151
162
|
|
152
163
|
squote => {
|
153
|
-
callback_simple(
|
164
|
+
callback_simple(id_on_string_squote);
|
154
165
|
|
155
166
|
fret;
|
156
167
|
};
|
@@ -160,7 +171,7 @@
|
|
160
171
|
^dquote* $count_newlines => emit_string;
|
161
172
|
|
162
173
|
dquote => {
|
163
|
-
callback_simple(
|
174
|
+
callback_simple(id_on_string_dquote);
|
164
175
|
|
165
176
|
fret;
|
166
177
|
};
|
@@ -179,22 +190,35 @@
|
|
179
190
|
doctype_start = '<!DOCTYPE'i whitespace+;
|
180
191
|
|
181
192
|
action start_doctype {
|
182
|
-
callback_simple(
|
193
|
+
callback_simple(id_on_doctype_start);
|
183
194
|
fnext doctype;
|
184
195
|
}
|
185
196
|
|
197
|
+
# Machine for processing inline rules of a doctype.
|
198
|
+
doctype_inline := |*
|
199
|
+
^']'* $count_newlines => {
|
200
|
+
callback(id_on_doctype_inline, data, encoding, ts, te);
|
201
|
+
|
202
|
+
if ( lines > 0 )
|
203
|
+
{
|
204
|
+
advance_line(lines);
|
205
|
+
|
206
|
+
lines = 0;
|
207
|
+
}
|
208
|
+
};
|
209
|
+
|
210
|
+
']' => { fnext doctype; };
|
211
|
+
*|;
|
212
|
+
|
186
213
|
# Machine for processing doctypes. Doctype values such as the public
|
187
214
|
# and system IDs are treated as T_STRING tokens.
|
188
215
|
doctype := |*
|
189
216
|
'PUBLIC' | 'SYSTEM' => {
|
190
|
-
callback(
|
217
|
+
callback(id_on_doctype_type, data, encoding, ts, te);
|
191
218
|
};
|
192
219
|
|
193
|
-
#
|
194
|
-
|
195
|
-
'[' any+ :> ']' => {
|
196
|
-
callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
|
197
|
-
};
|
220
|
+
# Starts a set of inline doctype rules.
|
221
|
+
'[' => { fnext doctype_inline; };
|
198
222
|
|
199
223
|
# Lex the public/system IDs as regular strings.
|
200
224
|
squote => start_string_squote;
|
@@ -205,11 +229,11 @@
|
|
205
229
|
whitespace;
|
206
230
|
|
207
231
|
identifier => {
|
208
|
-
callback(
|
232
|
+
callback(id_on_doctype_name, data, encoding, ts, te);
|
209
233
|
};
|
210
234
|
|
211
235
|
'>' => {
|
212
|
-
callback_simple(
|
236
|
+
callback_simple(id_on_doctype_end);
|
213
237
|
fnext main;
|
214
238
|
};
|
215
239
|
*|;
|
@@ -222,20 +246,20 @@
|
|
222
246
|
xml_decl_end = '?>';
|
223
247
|
|
224
248
|
action start_xml_decl {
|
225
|
-
callback_simple(
|
249
|
+
callback_simple(id_on_xml_decl_start);
|
226
250
|
fnext xml_decl;
|
227
251
|
}
|
228
252
|
|
229
253
|
# Machine that processes the contents of an XML declaration tag.
|
230
254
|
xml_decl := |*
|
231
255
|
xml_decl_end => {
|
232
|
-
callback_simple(
|
256
|
+
callback_simple(id_on_xml_decl_end);
|
233
257
|
fnext main;
|
234
258
|
};
|
235
259
|
|
236
260
|
# Attributes and their values (e.g. version="1.0").
|
237
261
|
identifier => {
|
238
|
-
callback(
|
262
|
+
callback(id_on_attribute, data, encoding, ts, te);
|
239
263
|
};
|
240
264
|
|
241
265
|
squote => start_string_squote;
|
@@ -257,23 +281,23 @@
|
|
257
281
|
element_end = '</' identifier (':' identifier)* '>';
|
258
282
|
|
259
283
|
action start_element {
|
260
|
-
callback_simple(
|
284
|
+
callback_simple(id_on_element_start);
|
261
285
|
fhold;
|
262
286
|
fnext element_name;
|
263
287
|
}
|
264
288
|
|
265
289
|
action close_element {
|
266
|
-
callback_simple(
|
290
|
+
callback_simple(id_on_element_end);
|
267
291
|
}
|
268
292
|
|
269
293
|
# Machine used for lexing the name/namespace of an element.
|
270
294
|
element_name := |*
|
271
295
|
identifier ':' => {
|
272
|
-
callback(
|
296
|
+
callback(id_on_element_ns, data, encoding, ts, te - 1);
|
273
297
|
};
|
274
298
|
|
275
299
|
identifier => {
|
276
|
-
callback(
|
300
|
+
callback(id_on_element_name, data, encoding, ts, te);
|
277
301
|
fnext element_head;
|
278
302
|
};
|
279
303
|
*|;
|
@@ -284,16 +308,16 @@
|
|
284
308
|
whitespace | '=';
|
285
309
|
|
286
310
|
newline => {
|
287
|
-
callback_simple(
|
311
|
+
callback_simple(id_advance_line);
|
288
312
|
};
|
289
313
|
|
290
314
|
# Attribute names and namespaces.
|
291
315
|
identifier ':' => {
|
292
|
-
callback(
|
316
|
+
callback(id_on_attribute_ns, data, encoding, ts, te - 1);
|
293
317
|
};
|
294
318
|
|
295
319
|
identifier => {
|
296
|
-
callback(
|
320
|
+
callback(id_on_attribute, data, encoding, ts, te);
|
297
321
|
};
|
298
322
|
|
299
323
|
# Attribute values.
|
@@ -302,13 +326,23 @@
|
|
302
326
|
|
303
327
|
# We're done with the open tag of the element.
|
304
328
|
'>' => {
|
305
|
-
callback_simple(
|
306
|
-
|
329
|
+
callback_simple(id_on_element_open_end);
|
330
|
+
|
331
|
+
if ( inside_html_script_p() )
|
332
|
+
{
|
333
|
+
mark = ts + 1;
|
334
|
+
|
335
|
+
fnext script_text;
|
336
|
+
}
|
337
|
+
else
|
338
|
+
{
|
339
|
+
fnext main;
|
340
|
+
}
|
307
341
|
};
|
308
342
|
|
309
343
|
# Self closing tags.
|
310
344
|
'/>' => {
|
311
|
-
callback_simple(
|
345
|
+
callback_simple(id_on_element_end);
|
312
346
|
fnext main;
|
313
347
|
};
|
314
348
|
*|;
|
@@ -337,7 +371,7 @@
|
|
337
371
|
|
338
372
|
text := |*
|
339
373
|
terminate_text | allowed_text => {
|
340
|
-
callback(
|
374
|
+
callback(id_on_text, data, encoding, ts, te);
|
341
375
|
|
342
376
|
if ( lines > 0 )
|
343
377
|
{
|
@@ -351,7 +385,7 @@
|
|
351
385
|
|
352
386
|
# Text followed by a special tag, such as "foo<!--"
|
353
387
|
allowed_text %{ mark = p; } terminate_text => {
|
354
|
-
callback(
|
388
|
+
callback(id_on_text, data, encoding, ts, mark);
|
355
389
|
|
356
390
|
p = mark - 1;
|
357
391
|
mark = 0;
|
@@ -367,6 +401,30 @@
|
|
367
401
|
};
|
368
402
|
*|;
|
369
403
|
|
404
|
+
# <script> tags in HTML can contain basically anything except for the
|
405
|
+
# literal "</script>". As a result of this we can't use the regular text
|
406
|
+
# machine.
|
407
|
+
script_text := |*
|
408
|
+
'</script>' => {
|
409
|
+
callback(id_on_text, data, encoding, mark, ts);
|
410
|
+
|
411
|
+
mark = 0;
|
412
|
+
|
413
|
+
if ( lines > 0 )
|
414
|
+
{
|
415
|
+
advance_line(lines);
|
416
|
+
|
417
|
+
lines = 0;
|
418
|
+
}
|
419
|
+
|
420
|
+
callback_simple(id_on_element_end);
|
421
|
+
|
422
|
+
fnext main;
|
423
|
+
};
|
424
|
+
|
425
|
+
any $count_newlines;
|
426
|
+
*|;
|
427
|
+
|
370
428
|
# The main machine aka the entry point of Ragel.
|
371
429
|
main := |*
|
372
430
|
doctype_start => start_doctype;
|
data/lib/oga/css/parser.rb
CHANGED
@@ -301,38 +301,40 @@ module Oga
|
|
301
301
|
##### State transition tables begin ###
|
302
302
|
|
303
303
|
racc_action_table = [
|
304
|
-
13,
|
305
|
-
13, 40, 19,
|
306
|
-
|
307
|
-
|
308
|
-
19,
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
20, 21,
|
304
|
+
13, 24, 19, 13, 23, 19, 25, 23, 26, 35,
|
305
|
+
13, 40, 19, 41, 23, 9, 10, 11, 9, 10,
|
306
|
+
11, 43, 44, 20, 21, 45, 20, 21, 13, 48,
|
307
|
+
19, 49, 23, 20, 21, 68, 68, 68, 13, 68,
|
308
|
+
19, 68, 23, 9, 10, 11, 64, 19, 68, 23,
|
309
|
+
74, 20, 21, 9, 10, 11, 63, 62, 75, 65,
|
310
|
+
66, 20, 21, 19, 13, 23, 62, 77, 20, 21,
|
311
|
+
19, 13, 23, 19, 62, 23, 62, nil, nil, 9,
|
312
|
+
10, 11, 13, nil, 20, 21, 9, 10, 11, nil,
|
313
|
+
nil, 20, 21, nil, 20, 21, nil, 9, 10, 11,
|
314
|
+
50, 51, 52, 53, 54, 55 ]
|
314
315
|
|
315
316
|
racc_action_check = [
|
316
|
-
0,
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
317
|
+
0, 1, 0, 25, 0, 25, 3, 25, 4, 13,
|
318
|
+
19, 20, 5, 21, 5, 0, 0, 0, 25, 25,
|
319
|
+
25, 22, 23, 0, 0, 24, 25, 25, 26, 35,
|
320
|
+
26, 36, 26, 5, 5, 50, 51, 52, 43, 53,
|
321
|
+
43, 54, 43, 26, 26, 26, 43, 6, 55, 6,
|
322
|
+
56, 26, 26, 43, 43, 43, 43, 43, 57, 43,
|
323
|
+
43, 43, 43, 7, 9, 7, 63, 64, 6, 6,
|
324
|
+
28, 10, 28, 29, 75, 29, 77, nil, nil, 9,
|
325
|
+
9, 9, 11, nil, 7, 7, 10, 10, 10, nil,
|
326
|
+
nil, 28, 28, nil, 29, 29, nil, 11, 11, 11,
|
327
|
+
37, 37, 37, 37, 37, 37 ]
|
326
328
|
|
327
329
|
racc_action_pointer = [
|
328
|
-
-2,
|
329
|
-
|
330
|
-
9,
|
331
|
-
nil, nil, nil, nil, nil,
|
332
|
-
nil, nil, nil,
|
333
|
-
|
334
|
-
nil, nil, nil,
|
335
|
-
nil, nil, nil, nil, nil,
|
330
|
+
-2, 1, nil, -1, 1, 8, 43, 59, nil, 62,
|
331
|
+
69, 80, nil, 6, nil, nil, nil, nil, nil, 8,
|
332
|
+
9, 11, 13, 20, 25, 1, 26, nil, 66, 69,
|
333
|
+
nil, nil, nil, nil, nil, 27, 26, 89, nil, nil,
|
334
|
+
nil, nil, nil, 36, nil, nil, nil, nil, nil, nil,
|
335
|
+
13, 14, 15, 17, 19, 26, 41, 38, nil, nil,
|
336
|
+
nil, nil, nil, 45, 47, nil, nil, nil, nil, nil,
|
337
|
+
nil, nil, nil, nil, nil, 53, nil, 55, nil, nil ]
|
336
338
|
|
337
339
|
racc_action_default = [
|
338
340
|
-2, -59, -1, -3, -4, -7, -8, -10, -12, -16,
|
@@ -346,22 +348,22 @@ racc_action_default = [
|
|
346
348
|
|
347
349
|
racc_goto_table = [
|
348
350
|
3, 57, 28, 29, 27, 67, 69, 70, 71, 72,
|
349
|
-
73,
|
350
|
-
|
351
|
-
|
351
|
+
73, 32, 32, 32, 31, 31, 31, 30, 33, 34,
|
352
|
+
1, 76, 2, 4, 39, 46, 47, 27, 27, 36,
|
353
|
+
37, 38, 42, 78, 56, 79, 58, 59, 60, nil,
|
352
354
|
nil, nil, nil, 61 ]
|
353
355
|
|
354
356
|
racc_goto_check = [
|
355
357
|
3, 23, 5, 5, 11, 19, 19, 19, 19, 19,
|
356
|
-
19,
|
357
|
-
|
358
|
-
|
358
|
+
19, 7, 7, 7, 8, 8, 8, 9, 9, 9,
|
359
|
+
1, 23, 2, 4, 10, 3, 3, 11, 11, 16,
|
360
|
+
17, 18, 21, 23, 22, 23, 24, 25, 26, nil,
|
359
361
|
nil, nil, nil, 3 ]
|
360
362
|
|
361
363
|
racc_goto_pointer = [
|
362
|
-
nil,
|
363
|
-
|
364
|
-
nil,
|
364
|
+
nil, 20, 22, 0, 23, -4, nil, 2, 5, 8,
|
365
|
+
5, -1, nil, nil, nil, nil, 10, 11, 12, -45,
|
366
|
+
nil, 10, -9, -42, -7, -6, -5 ]
|
365
367
|
|
366
368
|
racc_goto_default = [
|
367
369
|
nil, nil, nil, nil, nil, 5, 6, 7, 8, nil,
|
data/lib/oga/version.rb
CHANGED