oga 1.0.1-java → 1.0.2-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/c/lexer.c +834 -785
- data/ext/java/org/liboga/xml/Lexer.java +273 -244
- data/ext/ragel/base_lexer.rl +31 -5
- data/lib/liboga.jar +0 -0
- data/lib/oga.rb +41 -41
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/lexer.rb +23 -8
- metadata +66 -66
data/ext/ragel/base_lexer.rl
CHANGED
@@ -363,16 +363,17 @@
|
|
363
363
|
# body of an element is lexed using the `main` machine.
|
364
364
|
#
|
365
365
|
|
366
|
-
element_start = '<' ident_char;
|
367
|
-
element_end = '</' identifier (':' identifier)* '>';
|
368
|
-
|
369
366
|
action start_element {
|
370
367
|
fhold;
|
371
368
|
fnext element_name;
|
372
369
|
}
|
373
370
|
|
371
|
+
action start_close_element {
|
372
|
+
fnext element_close;
|
373
|
+
}
|
374
|
+
|
374
375
|
action close_element {
|
375
|
-
|
376
|
+
callback(id_on_element_end, data, encoding, ts, te);
|
376
377
|
}
|
377
378
|
|
378
379
|
action close_element_fnext_main {
|
@@ -381,6 +382,9 @@
|
|
381
382
|
fnext main;
|
382
383
|
}
|
383
384
|
|
385
|
+
element_start = '<' ident_char;
|
386
|
+
element_end = '</';
|
387
|
+
|
384
388
|
# Machine used for lexing the name/namespace of an element.
|
385
389
|
element_name := |*
|
386
390
|
identifier ':' => {
|
@@ -393,6 +397,28 @@
|
|
393
397
|
};
|
394
398
|
*|;
|
395
399
|
|
400
|
+
# Machine used for lexing the closing tag of an element.
|
401
|
+
element_close := |*
|
402
|
+
# namespace prefixes, currently not used but allows the rule below it
|
403
|
+
# to be used for the actual element name.
|
404
|
+
identifier ':';
|
405
|
+
|
406
|
+
identifier => close_element;
|
407
|
+
|
408
|
+
'>' => {
|
409
|
+
if ( lines > 0 )
|
410
|
+
{
|
411
|
+
advance_line(lines);
|
412
|
+
|
413
|
+
lines = 0;
|
414
|
+
}
|
415
|
+
|
416
|
+
fnext main;
|
417
|
+
};
|
418
|
+
|
419
|
+
any $count_newlines;
|
420
|
+
*|;
|
421
|
+
|
396
422
|
# Characters that can be used for unquoted HTML attribute values.
|
397
423
|
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
398
424
|
# for more info.
|
@@ -582,7 +608,7 @@
|
|
582
608
|
cdata_start => start_cdata;
|
583
609
|
proc_ins_start => start_proc_ins;
|
584
610
|
element_start => start_element;
|
585
|
-
element_end =>
|
611
|
+
element_end => start_close_element;
|
586
612
|
any => start_text;
|
587
613
|
*|;
|
588
614
|
}%%
|
data/lib/liboga.jar
CHANGED
Binary file
|
data/lib/oga.rb
CHANGED
@@ -3,19 +3,19 @@ require 'set'
|
|
3
3
|
require 'stringio'
|
4
4
|
require 'thread'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
6
|
+
require 'oga/version'
|
7
|
+
require 'oga/oga'
|
8
|
+
require 'oga/lru'
|
9
|
+
require 'oga/entity_decoder'
|
10
|
+
require 'oga/blacklist'
|
11
|
+
require 'oga/whitelist'
|
12
12
|
|
13
13
|
# Load these first so that the native extensions don't have to define the
|
14
14
|
# Oga::XML namespace.
|
15
|
-
|
16
|
-
|
15
|
+
require 'oga/xml/lexer'
|
16
|
+
require 'oga/xml/parser'
|
17
17
|
|
18
|
-
|
18
|
+
require 'liboga'
|
19
19
|
|
20
20
|
#:nocov:
|
21
21
|
if RUBY_PLATFORM == 'java'
|
@@ -23,35 +23,35 @@ if RUBY_PLATFORM == 'java'
|
|
23
23
|
end
|
24
24
|
#:nocov:
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
26
|
+
require 'oga/xml/html_void_elements'
|
27
|
+
require 'oga/xml/entities'
|
28
|
+
require 'oga/xml/querying'
|
29
|
+
require 'oga/xml/traversal'
|
30
|
+
require 'oga/xml/node'
|
31
|
+
require 'oga/xml/document'
|
32
|
+
require 'oga/xml/character_node'
|
33
|
+
require 'oga/xml/text'
|
34
|
+
require 'oga/xml/comment'
|
35
|
+
require 'oga/xml/cdata'
|
36
|
+
require 'oga/xml/xml_declaration'
|
37
|
+
require 'oga/xml/processing_instruction'
|
38
|
+
require 'oga/xml/doctype'
|
39
|
+
require 'oga/xml/namespace'
|
40
|
+
require 'oga/xml/default_namespace'
|
41
|
+
require 'oga/xml/attribute'
|
42
|
+
require 'oga/xml/element'
|
43
|
+
require 'oga/xml/node_set'
|
44
|
+
|
45
|
+
require 'oga/xml/sax_parser'
|
46
|
+
require 'oga/xml/pull_parser'
|
47
|
+
|
48
|
+
require 'oga/html/parser'
|
49
|
+
require 'oga/html/sax_parser'
|
50
|
+
require 'oga/html/entities'
|
51
|
+
|
52
|
+
require 'oga/xpath/lexer'
|
53
|
+
require 'oga/xpath/parser'
|
54
|
+
require 'oga/xpath/evaluator'
|
55
|
+
|
56
|
+
require 'oga/css/lexer'
|
57
|
+
require 'oga/css/parser'
|
data/lib/oga/version.rb
CHANGED
data/lib/oga/xml/lexer.rb
CHANGED
@@ -50,6 +50,10 @@ module Oga
|
|
50
50
|
%w{thead tbody tfoot tr caption colgroup col}
|
51
51
|
)
|
52
52
|
|
53
|
+
HTML_SCRIPT_ELEMENTS = Whitelist.new(%w{script template})
|
54
|
+
|
55
|
+
HTML_TABLE_ROW_ELEMENTS = Whitelist.new(%w{tr}) + HTML_SCRIPT_ELEMENTS
|
56
|
+
|
53
57
|
# Elements that should be closed automatically before a new opening tag is
|
54
58
|
# processed.
|
55
59
|
HTML_CLOSE_SELF = {
|
@@ -59,8 +63,9 @@ module Oga
|
|
59
63
|
'dt' => Blacklist.new(%w{dt dd}),
|
60
64
|
'dd' => Blacklist.new(%w{dt dd}),
|
61
65
|
'p' => Blacklist.new(%w{
|
62
|
-
address article aside blockquote div dl fieldset
|
63
|
-
h4 h5 h6 header hgroup hr main nav
|
66
|
+
address article aside blockquote details div dl fieldset figcaption
|
67
|
+
figure footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav
|
68
|
+
ol p pre section table ul
|
64
69
|
}),
|
65
70
|
'rb' => Blacklist.new(%w{rb rt rtc rp}),
|
66
71
|
'rt' => Blacklist.new(%w{rb rt rtc rp}),
|
@@ -70,11 +75,11 @@ module Oga
|
|
70
75
|
'option' => Blacklist.new(%w{optgroup option}),
|
71
76
|
'colgroup' => Whitelist.new(%w{col template}),
|
72
77
|
'caption' => HTML_TABLE_ALLOWED.to_blacklist,
|
73
|
-
'table' => HTML_TABLE_ALLOWED,
|
74
|
-
'thead' =>
|
75
|
-
'tbody' =>
|
76
|
-
'tfoot' =>
|
77
|
-
'tr' => Whitelist.new(%w{td th}),
|
78
|
+
'table' => HTML_TABLE_ALLOWED + HTML_SCRIPT_ELEMENTS,
|
79
|
+
'thead' => HTML_TABLE_ROW_ELEMENTS,
|
80
|
+
'tbody' => HTML_TABLE_ROW_ELEMENTS,
|
81
|
+
'tfoot' => HTML_TABLE_ROW_ELEMENTS,
|
82
|
+
'tr' => Whitelist.new(%w{td th}) + HTML_SCRIPT_ELEMENTS,
|
78
83
|
'td' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED,
|
79
84
|
'th' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED
|
80
85
|
}
|
@@ -475,9 +480,19 @@ module Oga
|
|
475
480
|
##
|
476
481
|
# Called on the closing tag of an element.
|
477
482
|
#
|
478
|
-
|
483
|
+
# @param [String] name The name of the element (minus namespace
|
484
|
+
# prefix). This is not set for self closing tags.
|
485
|
+
#
|
486
|
+
def on_element_end(name = nil)
|
479
487
|
return if @elements.empty?
|
480
488
|
|
489
|
+
if html? and name and @elements.include?(name)
|
490
|
+
while current_element != name
|
491
|
+
add_token(:T_ELEM_END)
|
492
|
+
@elements.pop
|
493
|
+
end
|
494
|
+
end
|
495
|
+
|
481
496
|
add_token(:T_ELEM_END)
|
482
497
|
|
483
498
|
@elements.pop
|
metadata
CHANGED
@@ -1,199 +1,199 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: oga
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Yorick Peterse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: ast
|
15
|
-
version_requirements: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - '>='
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
14
|
requirement: !ruby/object:Gem::Requirement
|
21
15
|
requirements:
|
22
16
|
- - '>='
|
23
17
|
- !ruby/object:Gem::Version
|
24
18
|
version: '0'
|
19
|
+
name: ast
|
25
20
|
prerelease: false
|
26
21
|
type: :runtime
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: ruby-ll
|
29
22
|
version_requirements: !ruby/object:Gem::Requirement
|
30
23
|
requirements:
|
31
|
-
- -
|
24
|
+
- - '>='
|
32
25
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
34
28
|
requirement: !ruby/object:Gem::Requirement
|
35
29
|
requirements:
|
36
30
|
- - ~>
|
37
31
|
- !ruby/object:Gem::Version
|
38
32
|
version: '2.1'
|
33
|
+
name: ruby-ll
|
39
34
|
prerelease: false
|
40
35
|
type: :runtime
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rake
|
43
36
|
version_requirements: !ruby/object:Gem::Requirement
|
44
37
|
requirements:
|
45
|
-
- -
|
38
|
+
- - ~>
|
46
39
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
40
|
+
version: '2.1'
|
41
|
+
- !ruby/object:Gem::Dependency
|
48
42
|
requirement: !ruby/object:Gem::Requirement
|
49
43
|
requirements:
|
50
44
|
- - '>='
|
51
45
|
- !ruby/object:Gem::Version
|
52
46
|
version: '0'
|
47
|
+
name: rake
|
53
48
|
prerelease: false
|
54
49
|
type: :development
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rspec
|
57
50
|
version_requirements: !ruby/object:Gem::Requirement
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - '>='
|
60
53
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
62
56
|
requirement: !ruby/object:Gem::Requirement
|
63
57
|
requirements:
|
64
58
|
- - ~>
|
65
59
|
- !ruby/object:Gem::Version
|
66
60
|
version: '3.0'
|
61
|
+
name: rspec
|
67
62
|
prerelease: false
|
68
63
|
type: :development
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: yard
|
71
64
|
version_requirements: !ruby/object:Gem::Requirement
|
72
65
|
requirements:
|
73
|
-
- -
|
66
|
+
- - ~>
|
74
67
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
68
|
+
version: '3.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
76
70
|
requirement: !ruby/object:Gem::Requirement
|
77
71
|
requirements:
|
78
72
|
- - '>='
|
79
73
|
- !ruby/object:Gem::Version
|
80
74
|
version: '0'
|
75
|
+
name: yard
|
81
76
|
prerelease: false
|
82
77
|
type: :development
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: simplecov
|
85
78
|
version_requirements: !ruby/object:Gem::Requirement
|
86
79
|
requirements:
|
87
80
|
- - '>='
|
88
81
|
- !ruby/object:Gem::Version
|
89
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
90
84
|
requirement: !ruby/object:Gem::Requirement
|
91
85
|
requirements:
|
92
86
|
- - '>='
|
93
87
|
- !ruby/object:Gem::Version
|
94
88
|
version: '0'
|
89
|
+
name: simplecov
|
95
90
|
prerelease: false
|
96
91
|
type: :development
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: kramdown
|
99
92
|
version_requirements: !ruby/object:Gem::Requirement
|
100
93
|
requirements:
|
101
94
|
- - '>='
|
102
95
|
- !ruby/object:Gem::Version
|
103
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
104
98
|
requirement: !ruby/object:Gem::Requirement
|
105
99
|
requirements:
|
106
100
|
- - '>='
|
107
101
|
- !ruby/object:Gem::Version
|
108
102
|
version: '0'
|
103
|
+
name: kramdown
|
109
104
|
prerelease: false
|
110
105
|
type: :development
|
111
|
-
- !ruby/object:Gem::Dependency
|
112
|
-
name: benchmark-ips
|
113
106
|
version_requirements: !ruby/object:Gem::Requirement
|
114
107
|
requirements:
|
115
|
-
- -
|
108
|
+
- - '>='
|
116
109
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
118
112
|
requirement: !ruby/object:Gem::Requirement
|
119
113
|
requirements:
|
120
114
|
- - ~>
|
121
115
|
- !ruby/object:Gem::Version
|
122
116
|
version: '2.0'
|
117
|
+
name: benchmark-ips
|
123
118
|
prerelease: false
|
124
119
|
type: :development
|
125
|
-
- !ruby/object:Gem::Dependency
|
126
|
-
name: rake-compiler
|
127
120
|
version_requirements: !ruby/object:Gem::Requirement
|
128
121
|
requirements:
|
129
|
-
- -
|
122
|
+
- - ~>
|
130
123
|
- !ruby/object:Gem::Version
|
131
|
-
version: '0'
|
124
|
+
version: '2.0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
132
126
|
requirement: !ruby/object:Gem::Requirement
|
133
127
|
requirements:
|
134
128
|
- - '>='
|
135
129
|
- !ruby/object:Gem::Version
|
136
130
|
version: '0'
|
131
|
+
name: rake-compiler
|
137
132
|
prerelease: false
|
138
133
|
type: :development
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
139
|
description: Oga is an XML/HTML parser written in Ruby.
|
140
140
|
email: yorickpeterse@gmail.com
|
141
141
|
executables: []
|
142
142
|
extensions: []
|
143
143
|
extra_rdoc_files: []
|
144
144
|
files:
|
145
|
-
- doc/manually_creating_documents.md
|
146
145
|
- doc/xml_namespaces.md
|
146
|
+
- doc/manually_creating_documents.md
|
147
147
|
- doc/css_selectors.md
|
148
148
|
- doc/migrating_from_nokogiri.md
|
149
149
|
- doc/css/common.css
|
150
150
|
- lib/oga.rb
|
151
|
-
- lib/oga/
|
152
|
-
- lib/oga/version.rb
|
151
|
+
- lib/oga/whitelist.rb
|
153
152
|
- lib/oga/blacklist.rb
|
154
|
-
- lib/oga/
|
153
|
+
- lib/oga/version.rb
|
155
154
|
- lib/oga/entity_decoder.rb
|
156
|
-
- lib/oga/
|
155
|
+
- lib/oga/lru.rb
|
156
|
+
- lib/oga/oga.rb
|
157
157
|
- lib/oga/css/lexer.rb
|
158
158
|
- lib/oga/css/parser.rb
|
159
|
-
- lib/oga/html/sax_parser.rb
|
160
|
-
- lib/oga/html/parser.rb
|
161
|
-
- lib/oga/html/entities.rb
|
162
|
-
- lib/oga/xml/lexer.rb
|
163
159
|
- lib/oga/xml/namespace.rb
|
164
|
-
- lib/oga/xml/
|
165
|
-
- lib/oga/xml/character_node.rb
|
166
|
-
- lib/oga/xml/sax_parser.rb
|
167
|
-
- lib/oga/xml/doctype.rb
|
168
|
-
- lib/oga/xml/document.rb
|
169
|
-
- lib/oga/xml/comment.rb
|
170
|
-
- lib/oga/xml/default_namespace.rb
|
171
|
-
- lib/oga/xml/text.rb
|
160
|
+
- lib/oga/xml/lexer.rb
|
172
161
|
- lib/oga/xml/querying.rb
|
173
|
-
- lib/oga/xml/attribute.rb
|
174
|
-
- lib/oga/xml/pull_parser.rb
|
175
162
|
- lib/oga/xml/parser.rb
|
176
|
-
- lib/oga/xml/
|
177
|
-
- lib/oga/xml/
|
163
|
+
- lib/oga/xml/traversal.rb
|
164
|
+
- lib/oga/xml/text.rb
|
178
165
|
- lib/oga/xml/node.rb
|
166
|
+
- lib/oga/xml/document.rb
|
167
|
+
- lib/oga/xml/pull_parser.rb
|
179
168
|
- lib/oga/xml/node_set.rb
|
169
|
+
- lib/oga/xml/sax_parser.rb
|
170
|
+
- lib/oga/xml/cdata.rb
|
180
171
|
- lib/oga/xml/element.rb
|
172
|
+
- lib/oga/xml/character_node.rb
|
173
|
+
- lib/oga/xml/doctype.rb
|
174
|
+
- lib/oga/xml/html_void_elements.rb
|
175
|
+
- lib/oga/xml/entities.rb
|
176
|
+
- lib/oga/xml/default_namespace.rb
|
177
|
+
- lib/oga/xml/attribute.rb
|
181
178
|
- lib/oga/xml/xml_declaration.rb
|
182
|
-
- lib/oga/xml/
|
183
|
-
- lib/oga/xml/
|
179
|
+
- lib/oga/xml/processing_instruction.rb
|
180
|
+
- lib/oga/xml/comment.rb
|
181
|
+
- lib/oga/html/parser.rb
|
182
|
+
- lib/oga/html/sax_parser.rb
|
183
|
+
- lib/oga/html/entities.rb
|
184
184
|
- lib/oga/xpath/lexer.rb
|
185
|
-
- lib/oga/xpath/evaluator.rb
|
186
185
|
- lib/oga/xpath/parser.rb
|
187
|
-
-
|
188
|
-
- ext/c/lexer.rl
|
189
|
-
- ext/c/lexer.h
|
190
|
-
- ext/c/liboga.c
|
191
|
-
- ext/c/extconf.rb
|
192
|
-
- ext/c/liboga.h
|
186
|
+
- lib/oga/xpath/evaluator.rb
|
193
187
|
- ext/ragel/base_lexer.rl
|
194
188
|
- ext/java/Liboga.java
|
195
189
|
- ext/java/org/liboga/xml/Lexer.java
|
196
190
|
- ext/java/org/liboga/xml/Lexer.rl
|
191
|
+
- ext/c/extconf.rb
|
192
|
+
- ext/c/lexer.rl
|
193
|
+
- ext/c/lexer.h
|
194
|
+
- ext/c/liboga.c
|
195
|
+
- ext/c/lexer.c
|
196
|
+
- ext/c/liboga.h
|
197
197
|
- README.md
|
198
198
|
- LICENSE
|
199
199
|
- oga.gemspec
|