oga 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/c/lexer.c +834 -785
- data/ext/java/org/liboga/xml/Lexer.java +273 -244
- data/ext/ragel/base_lexer.rl +31 -5
- data/lib/oga.rb +41 -41
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/lexer.rb +23 -8
- metadata +3 -3
data/ext/ragel/base_lexer.rl
CHANGED
@@ -363,16 +363,17 @@
|
|
363
363
|
# body of an element is lexed using the `main` machine.
|
364
364
|
#
|
365
365
|
|
366
|
-
element_start = '<' ident_char;
|
367
|
-
element_end = '</' identifier (':' identifier)* '>';
|
368
|
-
|
369
366
|
action start_element {
|
370
367
|
fhold;
|
371
368
|
fnext element_name;
|
372
369
|
}
|
373
370
|
|
371
|
+
action start_close_element {
|
372
|
+
fnext element_close;
|
373
|
+
}
|
374
|
+
|
374
375
|
action close_element {
|
375
|
-
|
376
|
+
callback(id_on_element_end, data, encoding, ts, te);
|
376
377
|
}
|
377
378
|
|
378
379
|
action close_element_fnext_main {
|
@@ -381,6 +382,9 @@
|
|
381
382
|
fnext main;
|
382
383
|
}
|
383
384
|
|
385
|
+
element_start = '<' ident_char;
|
386
|
+
element_end = '</';
|
387
|
+
|
384
388
|
# Machine used for lexing the name/namespace of an element.
|
385
389
|
element_name := |*
|
386
390
|
identifier ':' => {
|
@@ -393,6 +397,28 @@
|
|
393
397
|
};
|
394
398
|
*|;
|
395
399
|
|
400
|
+
# Machine used for lexing the closing tag of an element
|
401
|
+
element_close := |*
|
402
|
+
# namespace prefixes, currently not used but allows the rule below it
|
403
|
+
# to be used for the actual element name.
|
404
|
+
identifier ':';
|
405
|
+
|
406
|
+
identifier => close_element;
|
407
|
+
|
408
|
+
'>' => {
|
409
|
+
if ( lines > 0 )
|
410
|
+
{
|
411
|
+
advance_line(lines);
|
412
|
+
|
413
|
+
lines = 0;
|
414
|
+
}
|
415
|
+
|
416
|
+
fnext main;
|
417
|
+
};
|
418
|
+
|
419
|
+
any $count_newlines;
|
420
|
+
*|;
|
421
|
+
|
396
422
|
# Characters that can be used for unquoted HTML attribute values.
|
397
423
|
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
398
424
|
# for more info.
|
@@ -582,7 +608,7 @@
|
|
582
608
|
cdata_start => start_cdata;
|
583
609
|
proc_ins_start => start_proc_ins;
|
584
610
|
element_start => start_element;
|
585
|
-
element_end =>
|
611
|
+
element_end => start_close_element;
|
586
612
|
any => start_text;
|
587
613
|
*|;
|
588
614
|
}%%
|
data/lib/oga.rb
CHANGED
@@ -3,19 +3,19 @@ require 'set'
|
|
3
3
|
require 'stringio'
|
4
4
|
require 'thread'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
6
|
+
require 'oga/version'
|
7
|
+
require 'oga/oga'
|
8
|
+
require 'oga/lru'
|
9
|
+
require 'oga/entity_decoder'
|
10
|
+
require 'oga/blacklist'
|
11
|
+
require 'oga/whitelist'
|
12
12
|
|
13
13
|
# Load these first so that the native extensions don't have to define the
|
14
14
|
# Oga::XML namespace.
|
15
|
-
|
16
|
-
|
15
|
+
require 'oga/xml/lexer'
|
16
|
+
require 'oga/xml/parser'
|
17
17
|
|
18
|
-
|
18
|
+
require 'liboga'
|
19
19
|
|
20
20
|
#:nocov:
|
21
21
|
if RUBY_PLATFORM == 'java'
|
@@ -23,35 +23,35 @@ if RUBY_PLATFORM == 'java'
|
|
23
23
|
end
|
24
24
|
#:nocov:
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
26
|
+
require 'oga/xml/html_void_elements'
|
27
|
+
require 'oga/xml/entities'
|
28
|
+
require 'oga/xml/querying'
|
29
|
+
require 'oga/xml/traversal'
|
30
|
+
require 'oga/xml/node'
|
31
|
+
require 'oga/xml/document'
|
32
|
+
require 'oga/xml/character_node'
|
33
|
+
require 'oga/xml/text'
|
34
|
+
require 'oga/xml/comment'
|
35
|
+
require 'oga/xml/cdata'
|
36
|
+
require 'oga/xml/xml_declaration'
|
37
|
+
require 'oga/xml/processing_instruction'
|
38
|
+
require 'oga/xml/doctype'
|
39
|
+
require 'oga/xml/namespace'
|
40
|
+
require 'oga/xml/default_namespace'
|
41
|
+
require 'oga/xml/attribute'
|
42
|
+
require 'oga/xml/element'
|
43
|
+
require 'oga/xml/node_set'
|
44
|
+
|
45
|
+
require 'oga/xml/sax_parser'
|
46
|
+
require 'oga/xml/pull_parser'
|
47
|
+
|
48
|
+
require 'oga/html/parser'
|
49
|
+
require 'oga/html/sax_parser'
|
50
|
+
require 'oga/html/entities'
|
51
|
+
|
52
|
+
require 'oga/xpath/lexer'
|
53
|
+
require 'oga/xpath/parser'
|
54
|
+
require 'oga/xpath/evaluator'
|
55
|
+
|
56
|
+
require 'oga/css/lexer'
|
57
|
+
require 'oga/css/parser'
|
data/lib/oga/version.rb
CHANGED
data/lib/oga/xml/lexer.rb
CHANGED
@@ -50,6 +50,10 @@ module Oga
|
|
50
50
|
%w{thead tbody tfoot tr caption colgroup col}
|
51
51
|
)
|
52
52
|
|
53
|
+
HTML_SCRIPT_ELEMENTS = Whitelist.new(%w{script template})
|
54
|
+
|
55
|
+
HTML_TABLE_ROW_ELEMENTS = Whitelist.new(%w{tr}) + HTML_SCRIPT_ELEMENTS
|
56
|
+
|
53
57
|
# Elements that should be closed automatically before a new opening tag is
|
54
58
|
# processed.
|
55
59
|
HTML_CLOSE_SELF = {
|
@@ -59,8 +63,9 @@ module Oga
|
|
59
63
|
'dt' => Blacklist.new(%w{dt dd}),
|
60
64
|
'dd' => Blacklist.new(%w{dt dd}),
|
61
65
|
'p' => Blacklist.new(%w{
|
62
|
-
address article aside blockquote div dl fieldset
|
63
|
-
h4 h5 h6 header hgroup hr main nav
|
66
|
+
address article aside blockquote details div dl fieldset figcaption
|
67
|
+
figure footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav
|
68
|
+
ol p pre section table ul
|
64
69
|
}),
|
65
70
|
'rb' => Blacklist.new(%w{rb rt rtc rp}),
|
66
71
|
'rt' => Blacklist.new(%w{rb rt rtc rp}),
|
@@ -70,11 +75,11 @@ module Oga
|
|
70
75
|
'option' => Blacklist.new(%w{optgroup option}),
|
71
76
|
'colgroup' => Whitelist.new(%w{col template}),
|
72
77
|
'caption' => HTML_TABLE_ALLOWED.to_blacklist,
|
73
|
-
'table' => HTML_TABLE_ALLOWED,
|
74
|
-
'thead' =>
|
75
|
-
'tbody' =>
|
76
|
-
'tfoot' =>
|
77
|
-
'tr' => Whitelist.new(%w{td th}),
|
78
|
+
'table' => HTML_TABLE_ALLOWED + HTML_SCRIPT_ELEMENTS,
|
79
|
+
'thead' => HTML_TABLE_ROW_ELEMENTS,
|
80
|
+
'tbody' => HTML_TABLE_ROW_ELEMENTS,
|
81
|
+
'tfoot' => HTML_TABLE_ROW_ELEMENTS,
|
82
|
+
'tr' => Whitelist.new(%w{td th}) + HTML_SCRIPT_ELEMENTS,
|
78
83
|
'td' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED,
|
79
84
|
'th' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED
|
80
85
|
}
|
@@ -475,9 +480,19 @@ module Oga
|
|
475
480
|
##
|
476
481
|
# Called on the closing tag of an element.
|
477
482
|
#
|
478
|
-
|
483
|
+
# @param [String] ns_name The name of the element (minus namespace
|
484
|
+
# prefix). This is not set for self closing tags.
|
485
|
+
#
|
486
|
+
def on_element_end(name = nil)
|
479
487
|
return if @elements.empty?
|
480
488
|
|
489
|
+
if html? and name and @elements.include?(name)
|
490
|
+
while current_element != name
|
491
|
+
add_token(:T_ELEM_END)
|
492
|
+
@elements.pop
|
493
|
+
end
|
494
|
+
end
|
495
|
+
|
481
496
|
add_token(:T_ELEM_END)
|
482
497
|
|
483
498
|
@elements.pop
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: oga
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yorick Peterse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: ast
|
@@ -219,7 +219,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
219
219
|
version: '0'
|
220
220
|
requirements: []
|
221
221
|
rubyforge_project:
|
222
|
-
rubygems_version: 2.4.
|
222
|
+
rubygems_version: 2.4.7
|
223
223
|
signing_key:
|
224
224
|
specification_version: 4
|
225
225
|
summary: Oga is an XML/HTML parser written in Ruby.
|