oga 0.2.0 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -101,6 +101,31 @@ public class Lexer extends RubyObject
101
101
  int pe = data.length;
102
102
  int eof = data.length;
103
103
 
104
+ String id_advance_line = "advance_line";
105
+ String id_on_attribute = "on_attribute";
106
+ String id_on_attribute_ns = "on_attribute_ns";
107
+ String id_on_cdata = "on_cdata";
108
+ String id_on_comment = "on_comment";
109
+ String id_on_doctype_end = "on_doctype_end";
110
+ String id_on_doctype_inline = "on_doctype_inline";
111
+ String id_on_doctype_name = "on_doctype_name";
112
+ String id_on_doctype_start = "on_doctype_start";
113
+ String id_on_doctype_type = "on_doctype_type";
114
+ String id_on_element_end = "on_element_end";
115
+ String id_on_element_name = "on_element_name";
116
+ String id_on_element_ns = "on_element_ns";
117
+ String id_on_element_open_end = "on_element_open_end";
118
+ String id_on_element_start = "on_element_start";
119
+ String id_on_proc_ins_end = "on_proc_ins_end";
120
+ String id_on_proc_ins_name = "on_proc_ins_name";
121
+ String id_on_proc_ins_start = "on_proc_ins_start";
122
+ String id_on_string_body = "on_string_body";
123
+ String id_on_string_dquote = "on_string_dquote";
124
+ String id_on_string_squote = "on_string_squote";
125
+ String id_on_text = "on_text";
126
+ String id_on_xml_decl_end = "on_xml_decl_end";
127
+ String id_on_xml_decl_start = "on_xml_decl_start";
128
+
104
129
  %% write exec;
105
130
 
106
131
  this.lines = lines;
@@ -160,6 +185,17 @@ public class Lexer extends RubyObject
160
185
 
161
186
  this.callMethod(context, "advance_line", lines);
162
187
  }
188
+
189
+ /**
190
+ * Returns true if we're in an HTML script tag. See
191
+ * Oga::XML::Lexer#inside_html_script? for more information.
192
+ */
193
+ public Boolean inside_html_script_p()
194
+ {
195
+ ThreadContext context = this.runtime.getCurrentContext();
196
+
197
+ return this.callMethod(context, "inside_html_script?").isTrue();
198
+ }
163
199
  }
164
200
 
165
201
  %%{
@@ -28,6 +28,17 @@
28
28
  # When you call a method in Ruby make sure that said method is defined as
29
29
  # an instance method in the `Oga::XML::Lexer` class.
30
30
  #
31
+ # The name of the callback to invoke should be an identifier starting with
32
+ # "id_". The identifier should be defined in the associated C and Java code.
33
+ # In case of C code its value should be a Symbol as an ID object; for Java
34
+ # it should be a String. For example:
35
+ #
36
+ # ID id_foo = rb_intern("foo");
37
+ #
38
+ # And for Java:
39
+ #
40
+ # String id_foo = "foo";
41
+ #
31
42
  # ## Machine Transitions
32
43
  #
33
44
  # To transition from one machine to another always use `fnext` instead of
@@ -59,7 +70,7 @@
59
70
  comment = comment_start (any* -- comment_end) comment_end;
60
71
 
61
72
  action start_comment {
62
- callback("on_comment", data, encoding, ts + 4, te - 3);
73
+ callback(id_on_comment, data, encoding, ts + 4, te - 3);
63
74
  }
64
75
 
65
76
  # CDATA
@@ -75,7 +86,7 @@
75
86
  cdata = cdata_start (any* -- cdata_end) cdata_end;
76
87
 
77
88
  action start_cdata {
78
- callback("on_cdata", data, encoding, ts + 9, te - 3);
89
+ callback(id_on_cdata, data, encoding, ts + 9, te - 3);
79
90
  }
80
91
 
81
92
  # Processing Instructions
@@ -93,8 +104,8 @@
93
104
  proc_ins_end = '?>';
94
105
 
95
106
  action start_proc_ins {
96
- callback_simple("on_proc_ins_start");
97
- callback("on_proc_ins_name", data, encoding, ts + 2, te);
107
+ callback_simple(id_on_proc_ins_start);
108
+ callback(id_on_proc_ins_name, data, encoding, ts + 2, te);
98
109
 
99
110
  mark = te;
100
111
 
@@ -103,8 +114,8 @@
103
114
 
104
115
  proc_ins_body := |*
105
116
  proc_ins_end => {
106
- callback("on_text", data, encoding, mark, ts);
107
- callback_simple("on_proc_ins_end");
117
+ callback(id_on_text, data, encoding, mark, ts);
118
+ callback_simple(id_on_proc_ins_end);
108
119
 
109
120
  mark = 0;
110
121
 
@@ -124,7 +135,7 @@
124
135
  squote = "'";
125
136
 
126
137
  action emit_string {
127
- callback("on_string_body", data, encoding, ts, te);
138
+ callback(id_on_string_body, data, encoding, ts, te);
128
139
 
129
140
  if ( lines > 0 )
130
141
  {
@@ -135,13 +146,13 @@
135
146
  }
136
147
 
137
148
  action start_string_squote {
138
- callback_simple("on_string_squote");
149
+ callback_simple(id_on_string_squote);
139
150
 
140
151
  fcall string_squote;
141
152
  }
142
153
 
143
154
  action start_string_dquote {
144
- callback_simple("on_string_dquote");
155
+ callback_simple(id_on_string_dquote);
145
156
 
146
157
  fcall string_dquote;
147
158
  }
@@ -150,7 +161,7 @@
150
161
  ^squote* $count_newlines => emit_string;
151
162
 
152
163
  squote => {
153
- callback_simple("on_string_squote");
164
+ callback_simple(id_on_string_squote);
154
165
 
155
166
  fret;
156
167
  };
@@ -160,7 +171,7 @@
160
171
  ^dquote* $count_newlines => emit_string;
161
172
 
162
173
  dquote => {
163
- callback_simple("on_string_dquote");
174
+ callback_simple(id_on_string_dquote);
164
175
 
165
176
  fret;
166
177
  };
@@ -179,22 +190,35 @@
179
190
  doctype_start = '<!DOCTYPE'i whitespace+;
180
191
 
181
192
  action start_doctype {
182
- callback_simple("on_doctype_start");
193
+ callback_simple(id_on_doctype_start);
183
194
  fnext doctype;
184
195
  }
185
196
 
197
+ # Machine for processing inline rules of a doctype.
198
+ doctype_inline := |*
199
+ ^']'* $count_newlines => {
200
+ callback(id_on_doctype_inline, data, encoding, ts, te);
201
+
202
+ if ( lines > 0 )
203
+ {
204
+ advance_line(lines);
205
+
206
+ lines = 0;
207
+ }
208
+ };
209
+
210
+ ']' => { fnext doctype; };
211
+ *|;
212
+
186
213
  # Machine for processing doctypes. Doctype values such as the public
187
214
  # and system IDs are treated as T_STRING tokens.
188
215
  doctype := |*
189
216
  'PUBLIC' | 'SYSTEM' => {
190
- callback("on_doctype_type", data, encoding, ts, te);
217
+ callback(id_on_doctype_type, data, encoding, ts, te);
191
218
  };
192
219
 
193
- # Consumes everything between the [ and ]. Due to the use of :> the ]
194
- # is not consumed by any+.
195
- '[' any+ :> ']' => {
196
- callback("on_doctype_inline", data, encoding, ts + 1, te - 1);
197
- };
220
+ # Starts a set of inline doctype rules.
221
+ '[' => { fnext doctype_inline; };
198
222
 
199
223
  # Lex the public/system IDs as regular strings.
200
224
  squote => start_string_squote;
@@ -205,11 +229,11 @@
205
229
  whitespace;
206
230
 
207
231
  identifier => {
208
- callback("on_doctype_name", data, encoding, ts, te);
232
+ callback(id_on_doctype_name, data, encoding, ts, te);
209
233
  };
210
234
 
211
235
  '>' => {
212
- callback_simple("on_doctype_end");
236
+ callback_simple(id_on_doctype_end);
213
237
  fnext main;
214
238
  };
215
239
  *|;
@@ -222,20 +246,20 @@
222
246
  xml_decl_end = '?>';
223
247
 
224
248
  action start_xml_decl {
225
- callback_simple("on_xml_decl_start");
249
+ callback_simple(id_on_xml_decl_start);
226
250
  fnext xml_decl;
227
251
  }
228
252
 
229
253
  # Machine that processes the contents of an XML declaration tag.
230
254
  xml_decl := |*
231
255
  xml_decl_end => {
232
- callback_simple("on_xml_decl_end");
256
+ callback_simple(id_on_xml_decl_end);
233
257
  fnext main;
234
258
  };
235
259
 
236
260
  # Attributes and their values (e.g. version="1.0").
237
261
  identifier => {
238
- callback("on_attribute", data, encoding, ts, te);
262
+ callback(id_on_attribute, data, encoding, ts, te);
239
263
  };
240
264
 
241
265
  squote => start_string_squote;
@@ -257,23 +281,23 @@
257
281
  element_end = '</' identifier (':' identifier)* '>';
258
282
 
259
283
  action start_element {
260
- callback_simple("on_element_start");
284
+ callback_simple(id_on_element_start);
261
285
  fhold;
262
286
  fnext element_name;
263
287
  }
264
288
 
265
289
  action close_element {
266
- callback_simple("on_element_end");
290
+ callback_simple(id_on_element_end);
267
291
  }
268
292
 
269
293
  # Machine used for lexing the name/namespace of an element.
270
294
  element_name := |*
271
295
  identifier ':' => {
272
- callback("on_element_ns", data, encoding, ts, te - 1);
296
+ callback(id_on_element_ns, data, encoding, ts, te - 1);
273
297
  };
274
298
 
275
299
  identifier => {
276
- callback("on_element_name", data, encoding, ts, te);
300
+ callback(id_on_element_name, data, encoding, ts, te);
277
301
  fnext element_head;
278
302
  };
279
303
  *|;
@@ -284,16 +308,16 @@
284
308
  whitespace | '=';
285
309
 
286
310
  newline => {
287
- callback_simple("advance_line");
311
+ callback_simple(id_advance_line);
288
312
  };
289
313
 
290
314
  # Attribute names and namespaces.
291
315
  identifier ':' => {
292
- callback("on_attribute_ns", data, encoding, ts, te - 1);
316
+ callback(id_on_attribute_ns, data, encoding, ts, te - 1);
293
317
  };
294
318
 
295
319
  identifier => {
296
- callback("on_attribute", data, encoding, ts, te);
320
+ callback(id_on_attribute, data, encoding, ts, te);
297
321
  };
298
322
 
299
323
  # Attribute values.
@@ -302,13 +326,23 @@
302
326
 
303
327
  # We're done with the open tag of the element.
304
328
  '>' => {
305
- callback_simple("on_element_open_end");
306
- fnext main;
329
+ callback_simple(id_on_element_open_end);
330
+
331
+ if ( inside_html_script_p() )
332
+ {
333
+ mark = ts + 1;
334
+
335
+ fnext script_text;
336
+ }
337
+ else
338
+ {
339
+ fnext main;
340
+ }
307
341
  };
308
342
 
309
343
  # Self closing tags.
310
344
  '/>' => {
311
- callback_simple("on_element_end");
345
+ callback_simple(id_on_element_end);
312
346
  fnext main;
313
347
  };
314
348
  *|;
@@ -337,7 +371,7 @@
337
371
 
338
372
  text := |*
339
373
  terminate_text | allowed_text => {
340
- callback("on_text", data, encoding, ts, te);
374
+ callback(id_on_text, data, encoding, ts, te);
341
375
 
342
376
  if ( lines > 0 )
343
377
  {
@@ -351,7 +385,7 @@
351
385
 
352
386
  # Text followed by a special tag, such as "foo<!--"
353
387
  allowed_text %{ mark = p; } terminate_text => {
354
- callback("on_text", data, encoding, ts, mark);
388
+ callback(id_on_text, data, encoding, ts, mark);
355
389
 
356
390
  p = mark - 1;
357
391
  mark = 0;
@@ -367,6 +401,30 @@
367
401
  };
368
402
  *|;
369
403
 
404
+ # <script> tags in HTML can contain basically anything except for the
405
+ # literal "</script>". As a result of this we can't use the regular text
406
+ # machine.
407
+ script_text := |*
408
+ '</script>' => {
409
+ callback(id_on_text, data, encoding, mark, ts);
410
+
411
+ mark = 0;
412
+
413
+ if ( lines > 0 )
414
+ {
415
+ advance_line(lines);
416
+
417
+ lines = 0;
418
+ }
419
+
420
+ callback_simple(id_on_element_end);
421
+
422
+ fnext main;
423
+ };
424
+
425
+ any $count_newlines;
426
+ *|;
427
+
370
428
  # The main machine aka the entry point of Ragel.
371
429
  main := |*
372
430
  doctype_start => start_doctype;
@@ -301,38 +301,40 @@ module Oga
301
301
  ##### State transition tables begin ###
302
302
 
303
303
  racc_action_table = [
304
- 13, 41, 19, 43, 23, 13, 44, 45, 26, 25,
305
- 13, 40, 19, 13, 23, 9, 10, 11, 64, 48,
306
- 9, 10, 11, 20, 21, 9, 10, 11, 63, 62,
307
- 49, 65, 66, 20, 21, 13, 35, 19, 13, 23,
308
- 19, 24, 23, 19, 68, 23, 19, 68, 23, 68,
309
- 9, 10, 11, 9, 10, 11, 68, 68, 20, 21,
310
- 68, 20, 21, 13, 20, 21, 13, 20, 21, 19,
311
- 74, 23, 19, 75, 23, 19, 62, 23, 9, 10,
312
- 11, 9, 10, 11, 50, 51, 52, 53, 54, 55,
313
- 20, 21, 77, 20, 21, 62, 20, 21, 62 ]
304
+ 13, 24, 19, 13, 23, 19, 25, 23, 26, 35,
305
+ 13, 40, 19, 41, 23, 9, 10, 11, 9, 10,
306
+ 11, 43, 44, 20, 21, 45, 20, 21, 13, 48,
307
+ 19, 49, 23, 20, 21, 68, 68, 68, 13, 68,
308
+ 19, 68, 23, 9, 10, 11, 64, 19, 68, 23,
309
+ 74, 20, 21, 9, 10, 11, 63, 62, 75, 65,
310
+ 66, 20, 21, 19, 13, 23, 62, 77, 20, 21,
311
+ 19, 13, 23, 19, 62, 23, 62, nil, nil, 9,
312
+ 10, 11, 13, nil, 20, 21, 9, 10, 11, nil,
313
+ nil, 20, 21, nil, 20, 21, nil, 9, 10, 11,
314
+ 50, 51, 52, 53, 54, 55 ]
314
315
 
315
316
  racc_action_check = [
316
- 0, 21, 0, 22, 0, 11, 23, 24, 4, 3,
317
- 43, 20, 43, 19, 43, 0, 0, 0, 43, 35,
318
- 11, 11, 11, 0, 0, 43, 43, 43, 43, 43,
319
- 36, 43, 43, 43, 43, 25, 13, 25, 26, 25,
320
- 26, 1, 26, 5, 50, 5, 6, 51, 6, 52,
321
- 25, 25, 25, 26, 26, 26, 53, 54, 25, 25,
322
- 55, 26, 26, 10, 5, 5, 9, 6, 6, 7,
323
- 56, 7, 28, 57, 28, 29, 63, 29, 10, 10,
324
- 10, 9, 9, 9, 37, 37, 37, 37, 37, 37,
325
- 7, 7, 64, 28, 28, 75, 29, 29, 77 ]
317
+ 0, 1, 0, 25, 0, 25, 3, 25, 4, 13,
318
+ 19, 20, 5, 21, 5, 0, 0, 0, 25, 25,
319
+ 25, 22, 23, 0, 0, 24, 25, 25, 26, 35,
320
+ 26, 36, 26, 5, 5, 50, 51, 52, 43, 53,
321
+ 43, 54, 43, 26, 26, 26, 43, 6, 55, 6,
322
+ 56, 26, 26, 43, 43, 43, 43, 43, 57, 43,
323
+ 43, 43, 43, 7, 9, 7, 63, 64, 6, 6,
324
+ 28, 10, 28, 29, 75, 29, 77, nil, nil, 9,
325
+ 9, 9, 11, nil, 7, 7, 10, 10, 10, nil,
326
+ nil, 28, 28, nil, 29, 29, nil, 11, 11, 11,
327
+ 37, 37, 37, 37, 37, 37 ]
326
328
 
327
329
  racc_action_pointer = [
328
- -2, 41, nil, 2, 1, 39, 42, 65, nil, 64,
329
- 61, 3, nil, 33, nil, nil, nil, nil, nil, 11,
330
- 9, -1, -5, 4, 7, 33, 36, nil, 68, 71,
331
- nil, nil, nil, nil, nil, 17, 25, 73, nil, nil,
332
- nil, nil, nil, 8, nil, nil, nil, nil, nil, nil,
333
- 22, 25, 27, 34, 35, 38, 61, 53, nil, nil,
334
- nil, nil, nil, 55, 72, nil, nil, nil, nil, nil,
335
- nil, nil, nil, nil, nil, 74, nil, 77, nil, nil ]
330
+ -2, 1, nil, -1, 1, 8, 43, 59, nil, 62,
331
+ 69, 80, nil, 6, nil, nil, nil, nil, nil, 8,
332
+ 9, 11, 13, 20, 25, 1, 26, nil, 66, 69,
333
+ nil, nil, nil, nil, nil, 27, 26, 89, nil, nil,
334
+ nil, nil, nil, 36, nil, nil, nil, nil, nil, nil,
335
+ 13, 14, 15, 17, 19, 26, 41, 38, nil, nil,
336
+ nil, nil, nil, 45, 47, nil, nil, nil, nil, nil,
337
+ nil, nil, nil, nil, nil, 53, nil, 55, nil, nil ]
336
338
 
337
339
  racc_action_default = [
338
340
  -2, -59, -1, -3, -4, -7, -8, -10, -12, -16,
@@ -346,22 +348,22 @@ racc_action_default = [
346
348
 
347
349
  racc_goto_table = [
348
350
  3, 57, 28, 29, 27, 67, 69, 70, 71, 72,
349
- 73, 31, 31, 31, 32, 32, 32, 30, 33, 34,
350
- 36, 76, 4, 39, 37, 46, 47, 27, 27, 38,
351
- 1, 42, 56, 78, 2, 79, 58, 59, 60, nil,
351
+ 73, 32, 32, 32, 31, 31, 31, 30, 33, 34,
352
+ 1, 76, 2, 4, 39, 46, 47, 27, 27, 36,
353
+ 37, 38, 42, 78, 56, 79, 58, 59, 60, nil,
352
354
  nil, nil, nil, 61 ]
353
355
 
354
356
  racc_goto_check = [
355
357
  3, 23, 5, 5, 11, 19, 19, 19, 19, 19,
356
- 19, 8, 8, 8, 7, 7, 7, 9, 9, 9,
357
- 16, 23, 4, 10, 17, 3, 3, 11, 11, 18,
358
- 1, 21, 22, 23, 2, 23, 24, 25, 26, nil,
358
+ 19, 7, 7, 7, 8, 8, 8, 9, 9, 9,
359
+ 1, 23, 2, 4, 10, 3, 3, 11, 11, 16,
360
+ 17, 18, 21, 23, 22, 23, 24, 25, 26, nil,
359
361
  nil, nil, nil, 3 ]
360
362
 
361
363
  racc_goto_pointer = [
362
- nil, 30, 34, 0, 22, -4, nil, 5, 2, 8,
363
- 4, -1, nil, nil, nil, nil, 1, 5, 10, -45,
364
- nil, 9, -11, -42, -7, -6, -5 ]
364
+ nil, 20, 22, 0, 23, -4, nil, 2, 5, 8,
365
+ 5, -1, nil, nil, nil, nil, 10, 11, 12, -45,
366
+ nil, 10, -9, -42, -7, -6, -5 ]
365
367
 
366
368
  racc_goto_default = [
367
369
  nil, nil, nil, nil, nil, 5, 6, 7, 8, nil,
data/lib/oga/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '0.2.0'
2
+ VERSION = '0.2.1'
3
3
  end # Oga