oga 1.0.1-java → 1.0.2-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/c/lexer.c +834 -785
- data/ext/java/org/liboga/xml/Lexer.java +273 -244
- data/ext/ragel/base_lexer.rl +31 -5
- data/lib/liboga.jar +0 -0
- data/lib/oga.rb +41 -41
- data/lib/oga/version.rb +1 -1
- data/lib/oga/xml/lexer.rb +23 -8
- metadata +66 -66
data/ext/ragel/base_lexer.rl
CHANGED
@@ -363,16 +363,17 @@
|
|
363
363
|
# body of an element is lexed using the `main` machine.
|
364
364
|
#
|
365
365
|
|
366
|
-
element_start = '<' ident_char;
|
367
|
-
element_end = '</' identifier (':' identifier)* '>';
|
368
|
-
|
369
366
|
action start_element {
|
370
367
|
fhold;
|
371
368
|
fnext element_name;
|
372
369
|
}
|
373
370
|
|
371
|
+
action start_close_element {
|
372
|
+
fnext element_close;
|
373
|
+
}
|
374
|
+
|
374
375
|
action close_element {
|
375
|
-
|
376
|
+
callback(id_on_element_end, data, encoding, ts, te);
|
376
377
|
}
|
377
378
|
|
378
379
|
action close_element_fnext_main {
|
@@ -381,6 +382,9 @@
|
|
381
382
|
fnext main;
|
382
383
|
}
|
383
384
|
|
385
|
+
element_start = '<' ident_char;
|
386
|
+
element_end = '</';
|
387
|
+
|
384
388
|
# Machine used for lexing the name/namespace of an element.
|
385
389
|
element_name := |*
|
386
390
|
identifier ':' => {
|
@@ -393,6 +397,28 @@
|
|
393
397
|
};
|
394
398
|
*|;
|
395
399
|
|
400
|
+
# Machine used for lexing the closing tag of an element
|
401
|
+
element_close := |*
|
402
|
+
# namespace prefixes, currently not used but allows the rule below it
|
403
|
+
# to be used for the actual element name.
|
404
|
+
identifier ':';
|
405
|
+
|
406
|
+
identifier => close_element;
|
407
|
+
|
408
|
+
'>' => {
|
409
|
+
if ( lines > 0 )
|
410
|
+
{
|
411
|
+
advance_line(lines);
|
412
|
+
|
413
|
+
lines = 0;
|
414
|
+
}
|
415
|
+
|
416
|
+
fnext main;
|
417
|
+
};
|
418
|
+
|
419
|
+
any $count_newlines;
|
420
|
+
*|;
|
421
|
+
|
396
422
|
# Characters that can be used for unquoted HTML attribute values.
|
397
423
|
# See https://html.spec.whatwg.org/multipage/introduction.html#intro-early-example
|
398
424
|
# for more info.
|
@@ -582,7 +608,7 @@
|
|
582
608
|
cdata_start => start_cdata;
|
583
609
|
proc_ins_start => start_proc_ins;
|
584
610
|
element_start => start_element;
|
585
|
-
element_end =>
|
611
|
+
element_end => start_close_element;
|
586
612
|
any => start_text;
|
587
613
|
*|;
|
588
614
|
}%%
|
data/lib/liboga.jar
CHANGED
Binary file
|
data/lib/oga.rb
CHANGED
@@ -3,19 +3,19 @@ require 'set'
|
|
3
3
|
require 'stringio'
|
4
4
|
require 'thread'
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
6
|
+
require 'oga/version'
|
7
|
+
require 'oga/oga'
|
8
|
+
require 'oga/lru'
|
9
|
+
require 'oga/entity_decoder'
|
10
|
+
require 'oga/blacklist'
|
11
|
+
require 'oga/whitelist'
|
12
12
|
|
13
13
|
# Load these first so that the native extensions don't have to define the
|
14
14
|
# Oga::XML namespace.
|
15
|
-
|
16
|
-
|
15
|
+
require 'oga/xml/lexer'
|
16
|
+
require 'oga/xml/parser'
|
17
17
|
|
18
|
-
|
18
|
+
require 'liboga'
|
19
19
|
|
20
20
|
#:nocov:
|
21
21
|
if RUBY_PLATFORM == 'java'
|
@@ -23,35 +23,35 @@ if RUBY_PLATFORM == 'java'
|
|
23
23
|
end
|
24
24
|
#:nocov:
|
25
25
|
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
26
|
+
require 'oga/xml/html_void_elements'
|
27
|
+
require 'oga/xml/entities'
|
28
|
+
require 'oga/xml/querying'
|
29
|
+
require 'oga/xml/traversal'
|
30
|
+
require 'oga/xml/node'
|
31
|
+
require 'oga/xml/document'
|
32
|
+
require 'oga/xml/character_node'
|
33
|
+
require 'oga/xml/text'
|
34
|
+
require 'oga/xml/comment'
|
35
|
+
require 'oga/xml/cdata'
|
36
|
+
require 'oga/xml/xml_declaration'
|
37
|
+
require 'oga/xml/processing_instruction'
|
38
|
+
require 'oga/xml/doctype'
|
39
|
+
require 'oga/xml/namespace'
|
40
|
+
require 'oga/xml/default_namespace'
|
41
|
+
require 'oga/xml/attribute'
|
42
|
+
require 'oga/xml/element'
|
43
|
+
require 'oga/xml/node_set'
|
44
|
+
|
45
|
+
require 'oga/xml/sax_parser'
|
46
|
+
require 'oga/xml/pull_parser'
|
47
|
+
|
48
|
+
require 'oga/html/parser'
|
49
|
+
require 'oga/html/sax_parser'
|
50
|
+
require 'oga/html/entities'
|
51
|
+
|
52
|
+
require 'oga/xpath/lexer'
|
53
|
+
require 'oga/xpath/parser'
|
54
|
+
require 'oga/xpath/evaluator'
|
55
|
+
|
56
|
+
require 'oga/css/lexer'
|
57
|
+
require 'oga/css/parser'
|
data/lib/oga/version.rb
CHANGED
data/lib/oga/xml/lexer.rb
CHANGED
@@ -50,6 +50,10 @@ module Oga
|
|
50
50
|
%w{thead tbody tfoot tr caption colgroup col}
|
51
51
|
)
|
52
52
|
|
53
|
+
HTML_SCRIPT_ELEMENTS = Whitelist.new(%w{script template})
|
54
|
+
|
55
|
+
HTML_TABLE_ROW_ELEMENTS = Whitelist.new(%w{tr}) + HTML_SCRIPT_ELEMENTS
|
56
|
+
|
53
57
|
# Elements that should be closed automatically before a new opening tag is
|
54
58
|
# processed.
|
55
59
|
HTML_CLOSE_SELF = {
|
@@ -59,8 +63,9 @@ module Oga
|
|
59
63
|
'dt' => Blacklist.new(%w{dt dd}),
|
60
64
|
'dd' => Blacklist.new(%w{dt dd}),
|
61
65
|
'p' => Blacklist.new(%w{
|
62
|
-
address article aside blockquote div dl fieldset
|
63
|
-
h4 h5 h6 header hgroup hr main nav
|
66
|
+
address article aside blockquote details div dl fieldset figcaption
|
67
|
+
figure footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav
|
68
|
+
ol p pre section table ul
|
64
69
|
}),
|
65
70
|
'rb' => Blacklist.new(%w{rb rt rtc rp}),
|
66
71
|
'rt' => Blacklist.new(%w{rb rt rtc rp}),
|
@@ -70,11 +75,11 @@ module Oga
|
|
70
75
|
'option' => Blacklist.new(%w{optgroup option}),
|
71
76
|
'colgroup' => Whitelist.new(%w{col template}),
|
72
77
|
'caption' => HTML_TABLE_ALLOWED.to_blacklist,
|
73
|
-
'table' => HTML_TABLE_ALLOWED,
|
74
|
-
'thead' =>
|
75
|
-
'tbody' =>
|
76
|
-
'tfoot' =>
|
77
|
-
'tr' => Whitelist.new(%w{td th}),
|
78
|
+
'table' => HTML_TABLE_ALLOWED + HTML_SCRIPT_ELEMENTS,
|
79
|
+
'thead' => HTML_TABLE_ROW_ELEMENTS,
|
80
|
+
'tbody' => HTML_TABLE_ROW_ELEMENTS,
|
81
|
+
'tfoot' => HTML_TABLE_ROW_ELEMENTS,
|
82
|
+
'tr' => Whitelist.new(%w{td th}) + HTML_SCRIPT_ELEMENTS,
|
78
83
|
'td' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED,
|
79
84
|
'th' => Blacklist.new(%w{td th}) + HTML_TABLE_ALLOWED
|
80
85
|
}
|
@@ -475,9 +480,19 @@ module Oga
|
|
475
480
|
##
|
476
481
|
# Called on the closing tag of an element.
|
477
482
|
#
|
478
|
-
|
483
|
+
# @param [String] name The name of the element (minus namespace
|
484
|
+
# prefix). This is not set for self closing tags.
|
485
|
+
#
|
486
|
+
def on_element_end(name = nil)
|
479
487
|
return if @elements.empty?
|
480
488
|
|
489
|
+
if html? and name and @elements.include?(name)
|
490
|
+
while current_element != name
|
491
|
+
add_token(:T_ELEM_END)
|
492
|
+
@elements.pop
|
493
|
+
end
|
494
|
+
end
|
495
|
+
|
481
496
|
add_token(:T_ELEM_END)
|
482
497
|
|
483
498
|
@elements.pop
|
metadata
CHANGED
@@ -1,199 +1,199 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: oga
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Yorick Peterse
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name: ast
|
15
|
-
version_requirements: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - '>='
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
20
14
|
requirement: !ruby/object:Gem::Requirement
|
21
15
|
requirements:
|
22
16
|
- - '>='
|
23
17
|
- !ruby/object:Gem::Version
|
24
18
|
version: '0'
|
19
|
+
name: ast
|
25
20
|
prerelease: false
|
26
21
|
type: :runtime
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: ruby-ll
|
29
22
|
version_requirements: !ruby/object:Gem::Requirement
|
30
23
|
requirements:
|
31
|
-
- -
|
24
|
+
- - '>='
|
32
25
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
34
28
|
requirement: !ruby/object:Gem::Requirement
|
35
29
|
requirements:
|
36
30
|
- - ~>
|
37
31
|
- !ruby/object:Gem::Version
|
38
32
|
version: '2.1'
|
33
|
+
name: ruby-ll
|
39
34
|
prerelease: false
|
40
35
|
type: :runtime
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rake
|
43
36
|
version_requirements: !ruby/object:Gem::Requirement
|
44
37
|
requirements:
|
45
|
-
- -
|
38
|
+
- - ~>
|
46
39
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
40
|
+
version: '2.1'
|
41
|
+
- !ruby/object:Gem::Dependency
|
48
42
|
requirement: !ruby/object:Gem::Requirement
|
49
43
|
requirements:
|
50
44
|
- - '>='
|
51
45
|
- !ruby/object:Gem::Version
|
52
46
|
version: '0'
|
47
|
+
name: rake
|
53
48
|
prerelease: false
|
54
49
|
type: :development
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rspec
|
57
50
|
version_requirements: !ruby/object:Gem::Requirement
|
58
51
|
requirements:
|
59
|
-
- -
|
52
|
+
- - '>='
|
60
53
|
- !ruby/object:Gem::Version
|
61
|
-
version: '
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
62
56
|
requirement: !ruby/object:Gem::Requirement
|
63
57
|
requirements:
|
64
58
|
- - ~>
|
65
59
|
- !ruby/object:Gem::Version
|
66
60
|
version: '3.0'
|
61
|
+
name: rspec
|
67
62
|
prerelease: false
|
68
63
|
type: :development
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: yard
|
71
64
|
version_requirements: !ruby/object:Gem::Requirement
|
72
65
|
requirements:
|
73
|
-
- -
|
66
|
+
- - ~>
|
74
67
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
68
|
+
version: '3.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
76
70
|
requirement: !ruby/object:Gem::Requirement
|
77
71
|
requirements:
|
78
72
|
- - '>='
|
79
73
|
- !ruby/object:Gem::Version
|
80
74
|
version: '0'
|
75
|
+
name: yard
|
81
76
|
prerelease: false
|
82
77
|
type: :development
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: simplecov
|
85
78
|
version_requirements: !ruby/object:Gem::Requirement
|
86
79
|
requirements:
|
87
80
|
- - '>='
|
88
81
|
- !ruby/object:Gem::Version
|
89
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
90
84
|
requirement: !ruby/object:Gem::Requirement
|
91
85
|
requirements:
|
92
86
|
- - '>='
|
93
87
|
- !ruby/object:Gem::Version
|
94
88
|
version: '0'
|
89
|
+
name: simplecov
|
95
90
|
prerelease: false
|
96
91
|
type: :development
|
97
|
-
- !ruby/object:Gem::Dependency
|
98
|
-
name: kramdown
|
99
92
|
version_requirements: !ruby/object:Gem::Requirement
|
100
93
|
requirements:
|
101
94
|
- - '>='
|
102
95
|
- !ruby/object:Gem::Version
|
103
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
104
98
|
requirement: !ruby/object:Gem::Requirement
|
105
99
|
requirements:
|
106
100
|
- - '>='
|
107
101
|
- !ruby/object:Gem::Version
|
108
102
|
version: '0'
|
103
|
+
name: kramdown
|
109
104
|
prerelease: false
|
110
105
|
type: :development
|
111
|
-
- !ruby/object:Gem::Dependency
|
112
|
-
name: benchmark-ips
|
113
106
|
version_requirements: !ruby/object:Gem::Requirement
|
114
107
|
requirements:
|
115
|
-
- -
|
108
|
+
- - '>='
|
116
109
|
- !ruby/object:Gem::Version
|
117
|
-
version: '
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
118
112
|
requirement: !ruby/object:Gem::Requirement
|
119
113
|
requirements:
|
120
114
|
- - ~>
|
121
115
|
- !ruby/object:Gem::Version
|
122
116
|
version: '2.0'
|
117
|
+
name: benchmark-ips
|
123
118
|
prerelease: false
|
124
119
|
type: :development
|
125
|
-
- !ruby/object:Gem::Dependency
|
126
|
-
name: rake-compiler
|
127
120
|
version_requirements: !ruby/object:Gem::Requirement
|
128
121
|
requirements:
|
129
|
-
- -
|
122
|
+
- - ~>
|
130
123
|
- !ruby/object:Gem::Version
|
131
|
-
version: '0'
|
124
|
+
version: '2.0'
|
125
|
+
- !ruby/object:Gem::Dependency
|
132
126
|
requirement: !ruby/object:Gem::Requirement
|
133
127
|
requirements:
|
134
128
|
- - '>='
|
135
129
|
- !ruby/object:Gem::Version
|
136
130
|
version: '0'
|
131
|
+
name: rake-compiler
|
137
132
|
prerelease: false
|
138
133
|
type: :development
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - '>='
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
139
|
description: Oga is an XML/HTML parser written in Ruby.
|
140
140
|
email: yorickpeterse@gmail.com
|
141
141
|
executables: []
|
142
142
|
extensions: []
|
143
143
|
extra_rdoc_files: []
|
144
144
|
files:
|
145
|
-
- doc/manually_creating_documents.md
|
146
145
|
- doc/xml_namespaces.md
|
146
|
+
- doc/manually_creating_documents.md
|
147
147
|
- doc/css_selectors.md
|
148
148
|
- doc/migrating_from_nokogiri.md
|
149
149
|
- doc/css/common.css
|
150
150
|
- lib/oga.rb
|
151
|
-
- lib/oga/
|
152
|
-
- lib/oga/version.rb
|
151
|
+
- lib/oga/whitelist.rb
|
153
152
|
- lib/oga/blacklist.rb
|
154
|
-
- lib/oga/
|
153
|
+
- lib/oga/version.rb
|
155
154
|
- lib/oga/entity_decoder.rb
|
156
|
-
- lib/oga/
|
155
|
+
- lib/oga/lru.rb
|
156
|
+
- lib/oga/oga.rb
|
157
157
|
- lib/oga/css/lexer.rb
|
158
158
|
- lib/oga/css/parser.rb
|
159
|
-
- lib/oga/html/sax_parser.rb
|
160
|
-
- lib/oga/html/parser.rb
|
161
|
-
- lib/oga/html/entities.rb
|
162
|
-
- lib/oga/xml/lexer.rb
|
163
159
|
- lib/oga/xml/namespace.rb
|
164
|
-
- lib/oga/xml/
|
165
|
-
- lib/oga/xml/character_node.rb
|
166
|
-
- lib/oga/xml/sax_parser.rb
|
167
|
-
- lib/oga/xml/doctype.rb
|
168
|
-
- lib/oga/xml/document.rb
|
169
|
-
- lib/oga/xml/comment.rb
|
170
|
-
- lib/oga/xml/default_namespace.rb
|
171
|
-
- lib/oga/xml/text.rb
|
160
|
+
- lib/oga/xml/lexer.rb
|
172
161
|
- lib/oga/xml/querying.rb
|
173
|
-
- lib/oga/xml/attribute.rb
|
174
|
-
- lib/oga/xml/pull_parser.rb
|
175
162
|
- lib/oga/xml/parser.rb
|
176
|
-
- lib/oga/xml/
|
177
|
-
- lib/oga/xml/
|
163
|
+
- lib/oga/xml/traversal.rb
|
164
|
+
- lib/oga/xml/text.rb
|
178
165
|
- lib/oga/xml/node.rb
|
166
|
+
- lib/oga/xml/document.rb
|
167
|
+
- lib/oga/xml/pull_parser.rb
|
179
168
|
- lib/oga/xml/node_set.rb
|
169
|
+
- lib/oga/xml/sax_parser.rb
|
170
|
+
- lib/oga/xml/cdata.rb
|
180
171
|
- lib/oga/xml/element.rb
|
172
|
+
- lib/oga/xml/character_node.rb
|
173
|
+
- lib/oga/xml/doctype.rb
|
174
|
+
- lib/oga/xml/html_void_elements.rb
|
175
|
+
- lib/oga/xml/entities.rb
|
176
|
+
- lib/oga/xml/default_namespace.rb
|
177
|
+
- lib/oga/xml/attribute.rb
|
181
178
|
- lib/oga/xml/xml_declaration.rb
|
182
|
-
- lib/oga/xml/
|
183
|
-
- lib/oga/xml/
|
179
|
+
- lib/oga/xml/processing_instruction.rb
|
180
|
+
- lib/oga/xml/comment.rb
|
181
|
+
- lib/oga/html/parser.rb
|
182
|
+
- lib/oga/html/sax_parser.rb
|
183
|
+
- lib/oga/html/entities.rb
|
184
184
|
- lib/oga/xpath/lexer.rb
|
185
|
-
- lib/oga/xpath/evaluator.rb
|
186
185
|
- lib/oga/xpath/parser.rb
|
187
|
-
-
|
188
|
-
- ext/c/lexer.rl
|
189
|
-
- ext/c/lexer.h
|
190
|
-
- ext/c/liboga.c
|
191
|
-
- ext/c/extconf.rb
|
192
|
-
- ext/c/liboga.h
|
186
|
+
- lib/oga/xpath/evaluator.rb
|
193
187
|
- ext/ragel/base_lexer.rl
|
194
188
|
- ext/java/Liboga.java
|
195
189
|
- ext/java/org/liboga/xml/Lexer.java
|
196
190
|
- ext/java/org/liboga/xml/Lexer.rl
|
191
|
+
- ext/c/extconf.rb
|
192
|
+
- ext/c/lexer.rl
|
193
|
+
- ext/c/lexer.h
|
194
|
+
- ext/c/liboga.c
|
195
|
+
- ext/c/lexer.c
|
196
|
+
- ext/c/liboga.h
|
197
197
|
- README.md
|
198
198
|
- LICENSE
|
199
199
|
- oga.gemspec
|