oga 0.3.2-java → 0.3.3-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -194,13 +194,23 @@ public class Lexer extends RubyObject
194
194
  }
195
195
 
196
196
  /**
197
- * See * Oga::XML::Lexer#literal_html_element? for more information.
197
+ * @see Oga::XML::Lexer#html_script?
198
198
  */
199
- public Boolean literal_html_element_p()
199
+ public Boolean html_script_p()
200
200
  {
201
201
  ThreadContext context = this.runtime.getCurrentContext();
202
202
 
203
- return this.callMethod(context, "literal_html_element?").isTrue();
203
+ return this.callMethod(context, "html_script?").isTrue();
204
+ }
205
+
206
+ /**
207
+ * @see Oga::XML::Lexer#html_style?
208
+ */
209
+ public Boolean html_style_p()
210
+ {
211
+ ThreadContext context = this.runtime.getCurrentContext();
212
+
213
+ return this.callMethod(context, "html_style?").isTrue();
204
214
  }
205
215
  }
206
216
 
@@ -46,21 +46,31 @@
46
46
  # stack.
47
47
  #
48
48
 
49
- newline = '\r\n' | '\n' | '\r';
49
+ newline = '\r\n' | '\n' | '\r';
50
+ whitespace = [ \t];
51
+ ident_char = [a-zA-Z0-9\-_];
52
+ identifier = ident_char+;
53
+
54
+ whitespace_or_newline = whitespace | newline;
50
55
 
51
56
  action count_newlines {
52
57
  if ( fc == '\n' ) lines++;
53
58
  }
54
59
 
55
- whitespace = [ \t];
56
- ident_char = [a-zA-Z0-9\-_];
57
- identifier = ident_char+;
60
+ action advance_newline {
61
+ advance_line(1);
62
+ }
63
+
64
+ action hold_and_return {
65
+ fhold;
66
+ fret;
67
+ }
58
68
 
59
69
  # Comments
60
70
  #
61
- # http://www.w3.org/TR/html-markup/syntax.html#comments
71
+ # http://www.w3.org/TR/html/syntax.html#comments
62
72
  #
63
- # Unlike the W3 specification these rules *do* allow character sequences
73
+ # Unlike the W3C specification these rules *do* allow character sequences
64
74
  # such as `--` and `->`. Putting extra checks in for these sequences would
65
75
  # actually make the rules/actions more complex.
66
76
  #
@@ -98,7 +108,7 @@
98
108
 
99
109
  # CDATA
100
110
  #
101
- # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
111
+ # http://www.w3.org/TR/html/syntax.html#cdata-sections
102
112
  #
103
113
  # In HTML CDATA tags have no meaning/are not supported. Oga does
104
114
  # support them but treats their contents as plain text.
@@ -232,7 +242,7 @@
232
242
 
233
243
  # DOCTYPES
234
244
  #
235
- # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
245
+ # http://www.w3.org/TR/html/syntax.html#the-doctype
236
246
  #
237
247
  # These rules support the 3 flavours of doctypes:
238
248
  #
@@ -240,10 +250,18 @@
240
250
  # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
241
251
  # 3. Legacy doctypes
242
252
  #
243
- doctype_start = '<!DOCTYPE'i whitespace+;
253
+ doctype_start = '<!DOCTYPE'i (whitespace_or_newline+ $count_newlines);
244
254
 
245
255
  action start_doctype {
246
256
  callback_simple(id_on_doctype_start);
257
+
258
+ if ( lines > 0 )
259
+ {
260
+ advance_line(lines);
261
+
262
+ lines = 0;
263
+ }
264
+
247
265
  fnext doctype;
248
266
  }
249
267
 
@@ -277,10 +295,6 @@
277
295
  squote => start_string_squote;
278
296
  dquote => start_string_dquote;
279
297
 
280
- # Whitespace inside doctypes is ignored since there's no point in
281
- # including it.
282
- whitespace;
283
-
284
298
  identifier => {
285
299
  callback(id_on_doctype_name, data, encoding, ts, te);
286
300
  };
@@ -289,6 +303,10 @@
289
303
  callback_simple(id_on_doctype_end);
290
304
  fnext main;
291
305
  };
306
+
307
+ newline => advance_newline;
308
+
309
+ whitespace;
292
310
  *|;
293
311
 
294
312
  # XML declaration tags
@@ -338,7 +356,7 @@
338
356
 
339
357
  # Elements
340
358
  #
341
- # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
359
+ # http://www.w3.org/TR/html/syntax.html#syntax-elements
342
360
  #
343
361
  # Lexing of elements is broken up into different machines that handle the
344
362
  # name/namespace, contents of the open tag and the body of an element. The
@@ -358,6 +376,12 @@
358
376
  callback_simple(id_on_element_end);
359
377
  }
360
378
 
379
+ action close_element_fnext_main {
380
+ callback_simple(id_on_element_end);
381
+
382
+ fnext main;
383
+ }
384
+
361
385
  # Machine used for lexing the name/namespace of an element.
362
386
  element_name := |*
363
387
  identifier ':' => {
@@ -370,16 +394,11 @@
370
394
  };
371
395
  *|;
372
396
 
373
- action hold_start_element_head {
374
- fhold;
375
- fnext element_head;
376
- }
377
-
378
397
  # Characters that can be used for unquoted HTML attribute values.
379
398
  # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
380
399
  # for more info.
381
400
  html_unquoted_value = ^(
382
- squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
401
+ squote | dquote | '`' | '=' | '<' | '>' | whitespace_or_newline
383
402
  )+;
384
403
 
385
404
  # Machine used for processing HTML attribute values.
@@ -399,24 +418,33 @@
399
418
  callback_simple(id_on_string_squote);
400
419
  };
401
420
 
402
- any => hold_start_element_head;
421
+ any => hold_and_return;
403
422
  *|;
404
423
 
405
424
  # Machine used for processing XML attribute values.
406
425
  xml_attribute_value := |*
407
- squote => start_string_squote;
408
- dquote => start_string_dquote;
409
- any => hold_start_element_head;
426
+ # The following two actions use "fnext" instead of "fcall". Combined
427
+ # with "element_head" using "fcall" to jump to this machine this means
428
+ # we can return back to "element_head" after processing a single string.
429
+ squote => {
430
+ callback_simple(id_on_string_squote);
431
+
432
+ fnext string_squote;
433
+ };
434
+
435
+ dquote => {
436
+ callback_simple(id_on_string_dquote);
437
+
438
+ fnext string_dquote;
439
+ };
440
+
441
+ any => hold_and_return;
410
442
  *|;
411
443
 
412
444
  # Machine used for processing the contents of an element's starting tag.
413
445
  # This includes the name, namespace and attributes.
414
446
  element_head := |*
415
- whitespace;
416
-
417
- newline => {
418
- callback_simple(id_advance_line);
419
- };
447
+ newline => advance_newline;
420
448
 
421
449
  # Attribute names and namespaces.
422
450
  identifier ':' => {
@@ -431,11 +459,11 @@
431
459
  '=' => {
432
460
  if ( html_p )
433
461
  {
434
- fnext html_attribute_value;
462
+ fcall html_attribute_value;
435
463
  }
436
464
  else
437
465
  {
438
- fnext xml_attribute_value;
466
+ fcall xml_attribute_value;
439
467
  }
440
468
  };
441
469
 
@@ -443,9 +471,13 @@
443
471
  '>' => {
444
472
  callback_simple(id_on_element_open_end);
445
473
 
446
- if ( literal_html_element_p() )
474
+ if ( html_script_p() )
447
475
  {
448
- fnext literal_html_element;
476
+ fnext html_script;
477
+ }
478
+ else if ( html_style_p() )
479
+ {
480
+ fnext html_style;
449
481
  }
450
482
  else
451
483
  {
@@ -458,12 +490,14 @@
458
490
  callback_simple(id_on_element_end);
459
491
  fnext main;
460
492
  };
493
+
494
+ any;
461
495
  *|;
462
496
 
463
497
  # Text
464
498
  #
465
499
  # http://www.w3.org/TR/xml/#syntax
466
- # http://www.w3.org/TR/html-markup/syntax.html#text-syntax
500
+ # http://www.w3.org/TR/html/syntax.html#text
467
501
  #
468
502
  # Text content is everything leading up to certain special tags such as "</"
469
503
  # and "<?".
@@ -482,6 +516,17 @@
482
516
  terminate_text = '</' | '<!' | '<?' | element_start;
483
517
  allowed_text = (any* -- terminate_text) $count_newlines;
484
518
 
519
+ action emit_text {
520
+ callback(id_on_text, data, encoding, ts, te);
521
+
522
+ if ( lines > 0 )
523
+ {
524
+ advance_line(lines);
525
+
526
+ lines = 0;
527
+ }
528
+ }
529
+
485
530
  text := |*
486
531
  terminate_text | allowed_text => {
487
532
  callback(id_on_text, data, encoding, ts, te);
@@ -517,36 +562,17 @@
517
562
  # Certain tags in HTML can contain basically anything except for the literal
518
563
  # closing tag. Two examples are script and style tags. As a result of this
519
564
  # we can't use the regular text machine.
520
- literal_html_closing_tags = '</script>' | '</style>';
521
- literal_html_allowed = (any* -- literal_html_closing_tags) $count_newlines;
522
-
523
- literal_html_element := |*
524
- literal_html_allowed => {
525
- callback(id_on_text, data, encoding, ts, te);
526
-
527
- if ( lines > 0 )
528
- {
529
- advance_line(lines);
530
-
531
- lines = 0;
532
- }
533
- };
534
-
535
- literal_html_allowed %{ mark = p; } literal_html_closing_tags => {
536
- callback(id_on_text, data, encoding, ts, mark);
537
-
538
- p = mark - 1;
539
- mark = 0;
540
565
 
541
- if ( lines > 0 )
542
- {
543
- advance_line(lines);
566
+ literal_html_allowed = (^'<'+ | '<'+) $count_newlines;
544
567
 
545
- lines = 0;
546
- }
568
+ html_script := |*
569
+ literal_html_allowed => emit_text;
570
+ '</script>' => close_element_fnext_main;
571
+ *|;
547
572
 
548
- fnext main;
549
- };
573
+ html_style := |*
574
+ literal_html_allowed => emit_text;
575
+ '</style>' => close_element_fnext_main;
550
576
  *|;
551
577
 
552
578
  # The main machine aka the entry point of Ragel.
data/lib/liboga.jar CHANGED
Binary file
data/lib/oga/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '0.3.2'
2
+ VERSION = '0.3.3'
3
3
  end # Oga
data/lib/oga/xml/lexer.rb CHANGED
@@ -40,12 +40,18 @@ module Oga
40
40
  class Lexer
41
41
  attr_reader :html
42
42
 
43
+ # @return [String]
44
+ HTML_SCRIPT = 'script'
45
+
46
+ # @return [String]
47
+ HTML_STYLE = 'style'
48
+
43
49
  ##
44
50
  # Names of HTML tags of which the content should be lexed as-is.
45
51
  #
46
52
  # @return [Array]
47
53
  #
48
- LITERAL_HTML_ELEMENTS = %w{script style}
54
+ LITERAL_HTML_ELEMENTS = [HTML_SCRIPT, HTML_STYLE]
49
55
 
50
56
  ##
51
57
  # @param [String|IO] data The data to lex. This can either be a String or
@@ -189,12 +195,17 @@ module Oga
189
195
  end
190
196
 
191
197
  ##
192
- # Returns true if the current element's content should be lexed as-is.
198
+ # @return [TrueClass|FalseClass]
193
199
  #
200
+ def html_script?
201
+ return html? && current_element == HTML_SCRIPT
202
+ end
203
+
204
+ ##
194
205
  # @return [TrueClass|FalseClass]
195
206
  #
196
- def literal_html_element?
197
- return html? && LITERAL_HTML_ELEMENTS.include?(current_element)
207
+ def html_style?
208
+ return html? && current_element == HTML_STYLE
198
209
  end
199
210
 
200
211
  ##
@@ -59,7 +59,7 @@ class Parser < LL::Driver
59
59
 
60
60
  CONFIG.rules = [
61
61
  [3, 0, 0, 1], # 0
62
- [3, 1, 4, 22, 6, 0], # 1
62
+ [3, 1, 4, 23, 6, 0], # 1
63
63
  [3, 2, 0, 3], # 2
64
64
  [3, 3, 0, 7], # 3
65
65
  [3, 4, 0, 9], # 4
@@ -71,8 +71,8 @@ class Parser < LL::Driver
71
71
  [3, 10, 1, 6], # 10
72
72
  [3, 11, 0, 6, 1, 7], # 11
73
73
  [3, 12, 1, 6, 0, 5], # 12
74
- [3, 13, 5, 23, 6, 0], # 13
75
- [3, 14, 1, 6, 8, 24, 0, 20], # 14
74
+ [3, 13, 5, 24, 6, 0], # 13
75
+ [3, 14, 1, 6, 8, 25, 0, 21], # 14
76
76
  [3, 15, 1, 6], # 15
77
77
  [3, 16, 1, 15, 0, 8, 1, 13], # 16
78
78
  [3, 17, 0, 8, 1, 14], # 17
@@ -87,21 +87,23 @@ class Parser < LL::Driver
87
87
  [3, 26, 1, 17, 1, 18], # 26
88
88
  [3, 27, 0, 16, 0, 13, 1, 16], # 27
89
89
  [3, 28, 1, 19, 0, 1, 0, 14], # 28
90
- [3, 29, 4, 25, 6, 0], # 29
91
- [3, 30, 8, 26, 1, 20, 1, 21], # 30
92
- [3, 31, 8, 27, 1, 20], # 31
90
+ [3, 29, 4, 26, 6, 0], # 29
91
+ [3, 30, 8, 27, 1, 20, 1, 21], # 30
92
+ [3, 31, 8, 28, 1, 20], # 31
93
93
  [3, 32, 1, 23, 0, 16, 1, 22], # 32
94
- [3, 33, 1, 1], # 33
95
- [3, 34, 1, 3, 0, 21, 1, 3], # 34
96
- [3, 35, 1, 2, 0, 21, 1, 2], # 35
97
- [3, 36, 0, 21, 1, 4], # 36
98
- [3, 37, 2, 0], # 37
99
- [3, 38, 0, 2], # 38
100
- [3, 39, 1, 9], # 39
101
- [3, 40, 0, 20], # 40
102
- [3, 41, 0, 17], # 41
103
- [3, 42, 0, 20], # 42
104
- [3, 43, 0, 20], # 43
94
+ [3, 33, 0, 20, 1, 1], # 33
95
+ [3, 34, 0, 20, 1, 1], # 34
96
+ [3, 35, 2, 0], # 35
97
+ [3, 36, 1, 3, 0, 22, 1, 3], # 36
98
+ [3, 37, 1, 2, 0, 22, 1, 2], # 37
99
+ [3, 38, 0, 22, 1, 4], # 38
100
+ [3, 39, 2, 0], # 39
101
+ [3, 40, 0, 2], # 40
102
+ [3, 41, 1, 9], # 41
103
+ [3, 42, 0, 21], # 42
104
+ [3, 43, 0, 17], # 43
105
+ [3, 44, 0, 21], # 44
106
+ [3, 45, 0, 21], # 45
105
107
  ].freeze
106
108
 
107
109
  CONFIG.table = [
@@ -125,14 +127,15 @@ class Parser < LL::Driver
125
127
  [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 31, 30, -1, -1, -1, -1, -1, -1], # 17
126
128
  [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 32, -1, -1, -1, -1, -1], # 18
127
129
  [-1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 19
128
- [-1, -1, 35, 34, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 20
129
- [37, 37, 37, 37, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37], # 21
130
- [-1, 38, -1, -1, -1, 38, -1, -1, -1, -1, 38, -1, -1, 38, -1, -1, 38, -1, -1, -1, -1, -1, 38, -1, 38, -1, -1, -1], # 22
131
- [-1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 23
132
- [-1, -1, 40, 40, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 24
133
- [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 41, 41, -1, -1, -1, -1, -1, -1], # 25
134
- [-1, -1, 42, 42, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 26
135
- [-1, -1, 43, 43, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 27
130
+ [35, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35], # 20
131
+ [-1, -1, 37, 36, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 21
132
+ [39, 39, 39, 39, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39], # 22
133
+ [-1, 40, -1, -1, -1, 40, -1, -1, -1, -1, 40, -1, -1, 40, -1, -1, 40, -1, -1, -1, -1, -1, 40, -1, 40, -1, -1, -1], # 23
134
+ [-1, -1, -1, -1, -1, -1, -1, -1, -1, 41, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 24
135
+ [-1, -1, 42, 42, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 25
136
+ [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 43, 43, -1, -1, -1, -1, -1, -1], # 26
137
+ [-1, -1, 44, 44, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 27
138
+ [-1, -1, 45, 45, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 28
136
139
  ].freeze
137
140
 
138
141
  CONFIG.actions = [
@@ -169,17 +172,19 @@ class Parser < LL::Driver
169
172
  [:_rule_30, 3], # 30
170
173
  [:_rule_31, 2], # 31
171
174
  [:_rule_32, 3], # 32
172
- [:_rule_33, 1], # 33
173
- [:_rule_34, 3], # 34
174
- [:_rule_35, 3], # 35
175
- [:_rule_36, 2], # 36
176
- [:_rule_37, 0], # 37
177
- [:_rule_38, 1], # 38
178
- [:_rule_39, 1], # 39
175
+ [:_rule_33, 2], # 33
176
+ [:_rule_34, 2], # 34
177
+ [:_rule_35, 0], # 35
178
+ [:_rule_36, 3], # 36
179
+ [:_rule_37, 3], # 37
180
+ [:_rule_38, 2], # 38
181
+ [:_rule_39, 0], # 39
179
182
  [:_rule_40, 1], # 40
180
183
  [:_rule_41, 1], # 41
181
184
  [:_rule_42, 1], # 42
182
185
  [:_rule_43, 1], # 43
186
+ [:_rule_44, 1], # 44
187
+ [:_rule_45, 1], # 45
183
188
  ].freeze
184
189
 
185
190
  ##
@@ -568,31 +573,35 @@ class Parser < LL::Driver
568
573
  end
569
574
 
570
575
  def _rule_33(val)
571
- on_text(val[0])
576
+
577
+ text = val[1] ? val[0] + val[1] : val[0]
578
+
579
+ on_text(text)
580
+
572
581
  end
573
582
 
574
583
  def _rule_34(val)
575
- val[1]
584
+ val[1] ? val[0] + val[1] : val[0]
576
585
  end
577
586
 
578
587
  def _rule_35(val)
579
- val[1]
588
+ nil
580
589
  end
581
590
 
582
591
  def _rule_36(val)
583
- val[0] + val[1]
592
+ val[1]
584
593
  end
585
594
 
586
595
  def _rule_37(val)
587
- ''
596
+ val[1]
588
597
  end
589
598
 
590
599
  def _rule_38(val)
591
- val[0]
600
+ val[0] + val[1]
592
601
  end
593
602
 
594
603
  def _rule_39(val)
595
- val[0]
604
+ ''
596
605
  end
597
606
 
598
607
  def _rule_40(val)
@@ -610,6 +619,14 @@ class Parser < LL::Driver
610
619
  def _rule_43(val)
611
620
  val[0]
612
621
  end
622
+
623
+ def _rule_44(val)
624
+ val[0]
625
+ end
626
+
627
+ def _rule_45(val)
628
+ val[0]
629
+ end
613
630
  end
614
631
  end
615
632
  end