oga 0.3.2-java → 0.3.3-java

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -194,13 +194,23 @@ public class Lexer extends RubyObject
194
194
  }
195
195
 
196
196
  /**
197
- * See * Oga::XML::Lexer#literal_html_element? for more information.
197
+ * @see Oga::XML::Lexer#html_script?
198
198
  */
199
- public Boolean literal_html_element_p()
199
+ public Boolean html_script_p()
200
200
  {
201
201
  ThreadContext context = this.runtime.getCurrentContext();
202
202
 
203
- return this.callMethod(context, "literal_html_element?").isTrue();
203
+ return this.callMethod(context, "html_script?").isTrue();
204
+ }
205
+
206
+ /**
207
+ * @see Oga::XML::Lexer#html_style?
208
+ */
209
+ public Boolean html_style_p()
210
+ {
211
+ ThreadContext context = this.runtime.getCurrentContext();
212
+
213
+ return this.callMethod(context, "html_style?").isTrue();
204
214
  }
205
215
  }
206
216
 
@@ -46,21 +46,31 @@
46
46
  # stack.
47
47
  #
48
48
 
49
- newline = '\r\n' | '\n' | '\r';
49
+ newline = '\r\n' | '\n' | '\r';
50
+ whitespace = [ \t];
51
+ ident_char = [a-zA-Z0-9\-_];
52
+ identifier = ident_char+;
53
+
54
+ whitespace_or_newline = whitespace | newline;
50
55
 
51
56
  action count_newlines {
52
57
  if ( fc == '\n' ) lines++;
53
58
  }
54
59
 
55
- whitespace = [ \t];
56
- ident_char = [a-zA-Z0-9\-_];
57
- identifier = ident_char+;
60
+ action advance_newline {
61
+ advance_line(1);
62
+ }
63
+
64
+ action hold_and_return {
65
+ fhold;
66
+ fret;
67
+ }
58
68
 
59
69
  # Comments
60
70
  #
61
- # http://www.w3.org/TR/html-markup/syntax.html#comments
71
+ # http://www.w3.org/TR/html/syntax.html#comments
62
72
  #
63
- # Unlike the W3 specification these rules *do* allow character sequences
73
+ # Unlike the W3C specification these rules *do* allow character sequences
64
74
  # such as `--` and `->`. Putting extra checks in for these sequences would
65
75
  # actually make the rules/actions more complex.
66
76
  #
@@ -98,7 +108,7 @@
98
108
 
99
109
  # CDATA
100
110
  #
101
- # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections
111
+ # http://www.w3.org/TR/html/syntax.html#cdata-sections
102
112
  #
103
113
  # In HTML CDATA tags have no meaning/are not supported. Oga does
104
114
  # support them but treats their contents as plain text.
@@ -232,7 +242,7 @@
232
242
 
233
243
  # DOCTYPES
234
244
  #
235
- # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax
245
+ # http://www.w3.org/TR/html/syntax.html#the-doctype
236
246
  #
237
247
  # These rules support the 3 flavours of doctypes:
238
248
  #
@@ -240,10 +250,18 @@
240
250
  # 2. Deprecated doctypes, the more verbose ones used prior to HTML5.
241
251
  # 3. Legacy doctypes
242
252
  #
243
- doctype_start = '<!DOCTYPE'i whitespace+;
253
+ doctype_start = '<!DOCTYPE'i (whitespace_or_newline+ $count_newlines);
244
254
 
245
255
  action start_doctype {
246
256
  callback_simple(id_on_doctype_start);
257
+
258
+ if ( lines > 0 )
259
+ {
260
+ advance_line(lines);
261
+
262
+ lines = 0;
263
+ }
264
+
247
265
  fnext doctype;
248
266
  }
249
267
 
@@ -277,10 +295,6 @@
277
295
  squote => start_string_squote;
278
296
  dquote => start_string_dquote;
279
297
 
280
- # Whitespace inside doctypes is ignored since there's no point in
281
- # including it.
282
- whitespace;
283
-
284
298
  identifier => {
285
299
  callback(id_on_doctype_name, data, encoding, ts, te);
286
300
  };
@@ -289,6 +303,10 @@
289
303
  callback_simple(id_on_doctype_end);
290
304
  fnext main;
291
305
  };
306
+
307
+ newline => advance_newline;
308
+
309
+ whitespace;
292
310
  *|;
293
311
 
294
312
  # XML declaration tags
@@ -338,7 +356,7 @@
338
356
 
339
357
  # Elements
340
358
  #
341
- # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements
359
+ # http://www.w3.org/TR/html/syntax.html#syntax-elements
342
360
  #
343
361
  # Lexing of elements is broken up into different machines that handle the
344
362
  # name/namespace, contents of the open tag and the body of an element. The
@@ -358,6 +376,12 @@
358
376
  callback_simple(id_on_element_end);
359
377
  }
360
378
 
379
+ action close_element_fnext_main {
380
+ callback_simple(id_on_element_end);
381
+
382
+ fnext main;
383
+ }
384
+
361
385
  # Machine used for lexing the name/namespace of an element.
362
386
  element_name := |*
363
387
  identifier ':' => {
@@ -370,16 +394,11 @@
370
394
  };
371
395
  *|;
372
396
 
373
- action hold_start_element_head {
374
- fhold;
375
- fnext element_head;
376
- }
377
-
378
397
  # Characters that can be used for unquoted HTML attribute values.
379
398
  # See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
380
399
  # for more info.
381
400
  html_unquoted_value = ^(
382
- squote | dquote | '`' | '=' | '<' | '>' | whitespace | newline
401
+ squote | dquote | '`' | '=' | '<' | '>' | whitespace_or_newline
383
402
  )+;
384
403
 
385
404
  # Machine used for processing HTML attribute values.
@@ -399,24 +418,33 @@
399
418
  callback_simple(id_on_string_squote);
400
419
  };
401
420
 
402
- any => hold_start_element_head;
421
+ any => hold_and_return;
403
422
  *|;
404
423
 
405
424
  # Machine used for processing XML attribute values.
406
425
  xml_attribute_value := |*
407
- squote => start_string_squote;
408
- dquote => start_string_dquote;
409
- any => hold_start_element_head;
426
+ # The following two actions use "fnext" instead of "fcall". Combined
427
+ # with "element_head" using "fcall" to jump to this machine this means
428
+ # we can return back to "element_head" after processing a single string.
429
+ squote => {
430
+ callback_simple(id_on_string_squote);
431
+
432
+ fnext string_squote;
433
+ };
434
+
435
+ dquote => {
436
+ callback_simple(id_on_string_dquote);
437
+
438
+ fnext string_dquote;
439
+ };
440
+
441
+ any => hold_and_return;
410
442
  *|;
411
443
 
412
444
  # Machine used for processing the contents of an element's starting tag.
413
445
  # This includes the name, namespace and attributes.
414
446
  element_head := |*
415
- whitespace;
416
-
417
- newline => {
418
- callback_simple(id_advance_line);
419
- };
447
+ newline => advance_newline;
420
448
 
421
449
  # Attribute names and namespaces.
422
450
  identifier ':' => {
@@ -431,11 +459,11 @@
431
459
  '=' => {
432
460
  if ( html_p )
433
461
  {
434
- fnext html_attribute_value;
462
+ fcall html_attribute_value;
435
463
  }
436
464
  else
437
465
  {
438
- fnext xml_attribute_value;
466
+ fcall xml_attribute_value;
439
467
  }
440
468
  };
441
469
 
@@ -443,9 +471,13 @@
443
471
  '>' => {
444
472
  callback_simple(id_on_element_open_end);
445
473
 
446
- if ( literal_html_element_p() )
474
+ if ( html_script_p() )
447
475
  {
448
- fnext literal_html_element;
476
+ fnext html_script;
477
+ }
478
+ else if ( html_style_p() )
479
+ {
480
+ fnext html_style;
449
481
  }
450
482
  else
451
483
  {
@@ -458,12 +490,14 @@
458
490
  callback_simple(id_on_element_end);
459
491
  fnext main;
460
492
  };
493
+
494
+ any;
461
495
  *|;
462
496
 
463
497
  # Text
464
498
  #
465
499
  # http://www.w3.org/TR/xml/#syntax
466
- # http://www.w3.org/TR/html-markup/syntax.html#text-syntax
500
+ # http://www.w3.org/TR/html/syntax.html#text
467
501
  #
468
502
  # Text content is everything leading up to certain special tags such as "</"
469
503
  # and "<?".
@@ -482,6 +516,17 @@
482
516
  terminate_text = '</' | '<!' | '<?' | element_start;
483
517
  allowed_text = (any* -- terminate_text) $count_newlines;
484
518
 
519
+ action emit_text {
520
+ callback(id_on_text, data, encoding, ts, te);
521
+
522
+ if ( lines > 0 )
523
+ {
524
+ advance_line(lines);
525
+
526
+ lines = 0;
527
+ }
528
+ }
529
+
485
530
  text := |*
486
531
  terminate_text | allowed_text => {
487
532
  callback(id_on_text, data, encoding, ts, te);
@@ -517,36 +562,17 @@
517
562
  # Certain tags in HTML can contain basically anything except for the literal
518
563
  # closing tag. Two examples are script and style tags. As a result of this
519
564
  # we can't use the regular text machine.
520
- literal_html_closing_tags = '</script>' | '</style>';
521
- literal_html_allowed = (any* -- literal_html_closing_tags) $count_newlines;
522
-
523
- literal_html_element := |*
524
- literal_html_allowed => {
525
- callback(id_on_text, data, encoding, ts, te);
526
-
527
- if ( lines > 0 )
528
- {
529
- advance_line(lines);
530
-
531
- lines = 0;
532
- }
533
- };
534
-
535
- literal_html_allowed %{ mark = p; } literal_html_closing_tags => {
536
- callback(id_on_text, data, encoding, ts, mark);
537
-
538
- p = mark - 1;
539
- mark = 0;
540
565
 
541
- if ( lines > 0 )
542
- {
543
- advance_line(lines);
566
+ literal_html_allowed = (^'<'+ | '<'+) $count_newlines;
544
567
 
545
- lines = 0;
546
- }
568
+ html_script := |*
569
+ literal_html_allowed => emit_text;
570
+ '</script>' => close_element_fnext_main;
571
+ *|;
547
572
 
548
- fnext main;
549
- };
573
+ html_style := |*
574
+ literal_html_allowed => emit_text;
575
+ '</style>' => close_element_fnext_main;
550
576
  *|;
551
577
 
552
578
  # The main machine aka the entry point of Ragel.
data/lib/liboga.jar CHANGED
Binary file
data/lib/oga/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Oga
2
- VERSION = '0.3.2'
2
+ VERSION = '0.3.3'
3
3
  end # Oga
data/lib/oga/xml/lexer.rb CHANGED
@@ -40,12 +40,18 @@ module Oga
40
40
  class Lexer
41
41
  attr_reader :html
42
42
 
43
+ # @return [String]
44
+ HTML_SCRIPT = 'script'
45
+
46
+ # @return [String]
47
+ HTML_STYLE = 'style'
48
+
43
49
  ##
44
50
  # Names of HTML tags of which the content should be lexed as-is.
45
51
  #
46
52
  # @return [Array]
47
53
  #
48
- LITERAL_HTML_ELEMENTS = %w{script style}
54
+ LITERAL_HTML_ELEMENTS = [HTML_SCRIPT, HTML_STYLE]
49
55
 
50
56
  ##
51
57
  # @param [String|IO] data The data to lex. This can either be a String or
@@ -189,12 +195,17 @@ module Oga
189
195
  end
190
196
 
191
197
  ##
192
- # Returns true if the current element's content should be lexed as-is.
198
+ # @return [TrueClass|FalseClass]
193
199
  #
200
+ def html_script?
201
+ return html? && current_element == HTML_SCRIPT
202
+ end
203
+
204
+ ##
194
205
  # @return [TrueClass|FalseClass]
195
206
  #
196
- def literal_html_element?
197
- return html? && LITERAL_HTML_ELEMENTS.include?(current_element)
207
+ def html_style?
208
+ return html? && current_element == HTML_STYLE
198
209
  end
199
210
 
200
211
  ##
@@ -59,7 +59,7 @@ class Parser < LL::Driver
59
59
 
60
60
  CONFIG.rules = [
61
61
  [3, 0, 0, 1], # 0
62
- [3, 1, 4, 22, 6, 0], # 1
62
+ [3, 1, 4, 23, 6, 0], # 1
63
63
  [3, 2, 0, 3], # 2
64
64
  [3, 3, 0, 7], # 3
65
65
  [3, 4, 0, 9], # 4
@@ -71,8 +71,8 @@ class Parser < LL::Driver
71
71
  [3, 10, 1, 6], # 10
72
72
  [3, 11, 0, 6, 1, 7], # 11
73
73
  [3, 12, 1, 6, 0, 5], # 12
74
- [3, 13, 5, 23, 6, 0], # 13
75
- [3, 14, 1, 6, 8, 24, 0, 20], # 14
74
+ [3, 13, 5, 24, 6, 0], # 13
75
+ [3, 14, 1, 6, 8, 25, 0, 21], # 14
76
76
  [3, 15, 1, 6], # 15
77
77
  [3, 16, 1, 15, 0, 8, 1, 13], # 16
78
78
  [3, 17, 0, 8, 1, 14], # 17
@@ -87,21 +87,23 @@ class Parser < LL::Driver
87
87
  [3, 26, 1, 17, 1, 18], # 26
88
88
  [3, 27, 0, 16, 0, 13, 1, 16], # 27
89
89
  [3, 28, 1, 19, 0, 1, 0, 14], # 28
90
- [3, 29, 4, 25, 6, 0], # 29
91
- [3, 30, 8, 26, 1, 20, 1, 21], # 30
92
- [3, 31, 8, 27, 1, 20], # 31
90
+ [3, 29, 4, 26, 6, 0], # 29
91
+ [3, 30, 8, 27, 1, 20, 1, 21], # 30
92
+ [3, 31, 8, 28, 1, 20], # 31
93
93
  [3, 32, 1, 23, 0, 16, 1, 22], # 32
94
- [3, 33, 1, 1], # 33
95
- [3, 34, 1, 3, 0, 21, 1, 3], # 34
96
- [3, 35, 1, 2, 0, 21, 1, 2], # 35
97
- [3, 36, 0, 21, 1, 4], # 36
98
- [3, 37, 2, 0], # 37
99
- [3, 38, 0, 2], # 38
100
- [3, 39, 1, 9], # 39
101
- [3, 40, 0, 20], # 40
102
- [3, 41, 0, 17], # 41
103
- [3, 42, 0, 20], # 42
104
- [3, 43, 0, 20], # 43
94
+ [3, 33, 0, 20, 1, 1], # 33
95
+ [3, 34, 0, 20, 1, 1], # 34
96
+ [3, 35, 2, 0], # 35
97
+ [3, 36, 1, 3, 0, 22, 1, 3], # 36
98
+ [3, 37, 1, 2, 0, 22, 1, 2], # 37
99
+ [3, 38, 0, 22, 1, 4], # 38
100
+ [3, 39, 2, 0], # 39
101
+ [3, 40, 0, 2], # 40
102
+ [3, 41, 1, 9], # 41
103
+ [3, 42, 0, 21], # 42
104
+ [3, 43, 0, 17], # 43
105
+ [3, 44, 0, 21], # 44
106
+ [3, 45, 0, 21], # 45
105
107
  ].freeze
106
108
 
107
109
  CONFIG.table = [
@@ -125,14 +127,15 @@ class Parser < LL::Driver
125
127
  [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 31, 30, -1, -1, -1, -1, -1, -1], # 17
126
128
  [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 32, -1, -1, -1, -1, -1], # 18
127
129
  [-1, 33, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 19
128
- [-1, -1, 35, 34, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 20
129
- [37, 37, 37, 37, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37], # 21
130
- [-1, 38, -1, -1, -1, 38, -1, -1, -1, -1, 38, -1, -1, 38, -1, -1, 38, -1, -1, -1, -1, -1, 38, -1, 38, -1, -1, -1], # 22
131
- [-1, -1, -1, -1, -1, -1, -1, -1, -1, 39, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 23
132
- [-1, -1, 40, 40, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 24
133
- [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 41, 41, -1, -1, -1, -1, -1, -1], # 25
134
- [-1, -1, 42, 42, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 26
135
- [-1, -1, 43, 43, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 27
130
+ [35, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35], # 20
131
+ [-1, -1, 37, 36, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 21
132
+ [39, 39, 39, 39, 38, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39], # 22
133
+ [-1, 40, -1, -1, -1, 40, -1, -1, -1, -1, 40, -1, -1, 40, -1, -1, 40, -1, -1, -1, -1, -1, 40, -1, 40, -1, -1, -1], # 23
134
+ [-1, -1, -1, -1, -1, -1, -1, -1, -1, 41, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 24
135
+ [-1, -1, 42, 42, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 25
136
+ [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 43, 43, -1, -1, -1, -1, -1, -1], # 26
137
+ [-1, -1, 44, 44, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 27
138
+ [-1, -1, 45, 45, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], # 28
136
139
  ].freeze
137
140
 
138
141
  CONFIG.actions = [
@@ -169,17 +172,19 @@ class Parser < LL::Driver
169
172
  [:_rule_30, 3], # 30
170
173
  [:_rule_31, 2], # 31
171
174
  [:_rule_32, 3], # 32
172
- [:_rule_33, 1], # 33
173
- [:_rule_34, 3], # 34
174
- [:_rule_35, 3], # 35
175
- [:_rule_36, 2], # 36
176
- [:_rule_37, 0], # 37
177
- [:_rule_38, 1], # 38
178
- [:_rule_39, 1], # 39
175
+ [:_rule_33, 2], # 33
176
+ [:_rule_34, 2], # 34
177
+ [:_rule_35, 0], # 35
178
+ [:_rule_36, 3], # 36
179
+ [:_rule_37, 3], # 37
180
+ [:_rule_38, 2], # 38
181
+ [:_rule_39, 0], # 39
179
182
  [:_rule_40, 1], # 40
180
183
  [:_rule_41, 1], # 41
181
184
  [:_rule_42, 1], # 42
182
185
  [:_rule_43, 1], # 43
186
+ [:_rule_44, 1], # 44
187
+ [:_rule_45, 1], # 45
183
188
  ].freeze
184
189
 
185
190
  ##
@@ -568,31 +573,35 @@ class Parser < LL::Driver
568
573
  end
569
574
 
570
575
  def _rule_33(val)
571
- on_text(val[0])
576
+
577
+ text = val[1] ? val[0] + val[1] : val[0]
578
+
579
+ on_text(text)
580
+
572
581
  end
573
582
 
574
583
  def _rule_34(val)
575
- val[1]
584
+ val[1] ? val[0] + val[1] : val[0]
576
585
  end
577
586
 
578
587
  def _rule_35(val)
579
- val[1]
588
+ nil
580
589
  end
581
590
 
582
591
  def _rule_36(val)
583
- val[0] + val[1]
592
+ val[1]
584
593
  end
585
594
 
586
595
  def _rule_37(val)
587
- ''
596
+ val[1]
588
597
  end
589
598
 
590
599
  def _rule_38(val)
591
- val[0]
600
+ val[0] + val[1]
592
601
  end
593
602
 
594
603
  def _rule_39(val)
595
- val[0]
604
+ ''
596
605
  end
597
606
 
598
607
  def _rule_40(val)
@@ -610,6 +619,14 @@ class Parser < LL::Driver
610
619
  def _rule_43(val)
611
620
  val[0]
612
621
  end
622
+
623
+ def _rule_44(val)
624
+ val[0]
625
+ end
626
+
627
+ def _rule_45(val)
628
+ val[0]
629
+ end
613
630
  end
614
631
  end
615
632
  end