gammo 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.travis.yml +6 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +27 -0
- data/LICENSE.txt +21 -0
- data/README.md +177 -0
- data/Rakefile +25 -0
- data/gammo.gemspec +23 -0
- data/lib/gammo.rb +15 -0
- data/lib/gammo/attribute.rb +17 -0
- data/lib/gammo/fragment_parser.rb +65 -0
- data/lib/gammo/node.rb +157 -0
- data/lib/gammo/parser.rb +524 -0
- data/lib/gammo/parser/constants.rb +94 -0
- data/lib/gammo/parser/foreign.rb +307 -0
- data/lib/gammo/parser/insertion_mode.rb +74 -0
- data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
- data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
- data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
- data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
- data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
- data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
- data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
- data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
- data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
- data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
- data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
- data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
- data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
- data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
- data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
- data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
- data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
- data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
- data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
- data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
- data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
- data/lib/gammo/parser/insertion_mode/text.rb +32 -0
- data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
- data/lib/gammo/parser/node_stack.rb +24 -0
- data/lib/gammo/tags.rb +9 -0
- data/lib/gammo/tags/table.rb +744 -0
- data/lib/gammo/tokenizer.rb +373 -0
- data/lib/gammo/tokenizer/debug.rb +34 -0
- data/lib/gammo/tokenizer/entity.rb +2240 -0
- data/lib/gammo/tokenizer/escape.rb +174 -0
- data/lib/gammo/tokenizer/script_scanner.rb +229 -0
- data/lib/gammo/tokenizer/tokens.rb +66 -0
- data/lib/gammo/version.rb +3 -0
- data/misc/html.yaml +384 -0
- data/misc/table.erubi +14 -0
- metadata +97 -0
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'gammo/tokenizer/entity'
|
2
|
+
|
3
|
+
module Gammo
|
4
|
+
class Tokenizer
|
5
|
+
module Escape
|
6
|
+
# Upper bound on the prefix length tried by unescape_entity when it looks for
# a named entity that is not terminated by a semicolon.
LONGEST_ENTITY_WITHOUT_SEMICOLON = 6

# Character => character reference used by #escape. The values must be the
# escaped forms; mapping a character to itself would make #escape a no-op.
ESCAPE_REPLACEMENT_TABLE = {
  ?&  => '&amp;',
  ?'  => '&#39;',
  ?<  => '&lt;',
  ?>  => '&gt;',
  ?"  => '&#34;',
  ?\r => '&#13;',
}.freeze
|
15
|
+
|
16
|
+
# Replacement characters for numeric character references in the range
# 0x80..0x9F; unescape_sharp_entity indexes this table with (code point - 0x80).
# The first entry therefore stands in for 0x80 and the last one for 0x9F.
REPLACEMENT_TABLE = [
  0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
  0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
  0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
  0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
].map { |codepoint| codepoint.chr(Encoding::UTF_8) }.freeze
|
50
|
+
|
51
|
+
# Escapes given string according to {ESCAPE_REPLACEMENT_TABLE}.
# The string is rewritten in place, as before, but it is now always returned:
# String#gsub! yields nil when nothing matched, which made this method return
# nil for any input that did not need escaping.
# @param [String] s string to escape (mutated)
# @return [String] the same string object
def escape(s)
  s.gsub!(/[&'<>"\r]/, ESCAPE_REPLACEMENT_TABLE)
  s
end
|
55
|
+
|
56
|
+
# Unescapes given data.
# Character references are resolved by unescape_entity; every other byte is
# copied through. The work is done in place on the bytes of +data+ (a frozen
# string is duplicated first), and the unescaped prefix is returned.
# @param [String] data
# @param [Hash] options keyword options forwarded to unescape_entity
# @return [String, nil] nil when data is nil; data itself when it has no "&"
def unescape(data, **options)
  return unless data
  first = data.b.index(?&) # byte offset of the first ampersand
  return data unless first
  data = data.dup if data.frozen?
  dst, src = unescape_entity(data, first, first, **options)
  # bytesize is O(1); the previous data.bytes.length built an Array per pass.
  while src < data.bytesize
    byte = data.getbyte(src)
    if byte == 0x26 # "&"
      dst, src = unescape_entity(data, dst, src, **options)
    else
      data.setbyte(dst, byte)
      dst += 1
      src += 1
    end
  end
  data.byteslice(0, dst)
end
|
76
|
+
|
77
|
+
private
|
78
|
+
|
79
|
+
# Resolves the character reference that starts at byte offset +src+ of +data+
# (data's byte at src is "&") and writes the result at byte offset +dst+.
# Two defects are fixed here: the guard for a trailing lone "&" did not
# return, and the fallback copy used character-indexed String#[]= with the end
# offsets as lengths, which corrupted the text that followed an unknown
# reference.
# @param [String] data buffer that is rewritten in place
# @param [Integer] dst byte offset to write to (never greater than src)
# @param [Integer] src byte offset to read from
# @param [Boolean] in_attribute when true, "&name=" is left alone and
#   entity prefixes without a semicolon are not matched
# @return [Array<Integer>] the advanced [dst, src] byte offsets
def unescape_entity(data, dst, src, in_attribute: false)
  # i starts at 1: there is no need to count "&".
  i, s = 1, data.byteslice(src..-1)
  return swap(data, dst, src) if s.length <= 1
  return unescape_sharp_entity(data, s, dst, src, i) if s[i] == ?#
  i = consume_entity_chars(s, i)
  name = s.byteslice(1, i - 1)
  unless name.empty? || (in_attribute && like_query_params?(name, s, i))
    key = name.to_sym
    entities = Entity::CODEPOINT[key]
    entities = entities ? [entities] : Entity::TWO_CODEPOINTS[key]
    return replace_entity(entities, data, dst, src, i) if entities
    unless in_attribute
      # Try progressively shorter prefixes of the name.
      max = [name.length - 1, LONGEST_ENTITY_WITHOUT_SEMICOLON].min
      max.downto(1) do |n|
        entity = Entity::CODEPOINT[name.byteslice(0, n).to_sym]
        return replace_entity([entity], data, dst, src, n + 1) if entity
      end
    end
  end
  # Not a known reference: move the i consumed bytes down unchanged. dst never
  # exceeds src, so a forward byte-by-byte copy cannot overwrite unread input.
  i.times { |k| data.setbyte(dst + k, data.getbyte(src + k)) }
  [dst + i, src + i]
end
|
104
|
+
|
105
|
+
# Resolves a numeric character reference ("&#123;" or "&#x7B;").
# +s+ is the tail of +data+ starting at the "&" and +i+ is the index of "#".
# The accumulator is now an Integer from the start: it used to begin as the
# String "\0", so a reference without digits such as "&#x;" reached the
# Integer/String comparison below and raised. Such a reference is now left
# untouched. The value is also capped just above 0x10FFFF so that very long
# digit runs do not build huge integers.
# @param [String] data buffer that is rewritten in place
# @param [String] s bytes of data from src to the end
# @param [Integer] dst byte offset to write to
# @param [Integer] src byte offset of the "&"
# @param [Integer] i index of "#" within s
# @return [Array<Integer>] the advanced [dst, src] byte offsets
def unescape_sharp_entity(data, s, dst, src, i)
  # At least "&#" and two more characters are needed.
  return swap(data, dst, src) if s.length <= 3
  i += 1
  hex = s[i] == ?x || s[i] == ?X
  i += 1 if hex
  base = hex ? 16 : 10
  x = 0
  digits = 0
  while i < s.length
    ch = s[i]
    i += 1
    digit =
      if ?0 <= ch && ch <= ?9 then ch.ord - ?0.ord
      elsif hex && ?a <= ch && ch <= ?f then ch.ord - ?a.ord + 10
      elsif hex && ?A <= ch && ch <= ?F then ch.ord - ?A.ord + 10
      end
    if digit
      # Anything above 0x10FFFF maps to U+FFFD, so saturate there.
      x = [base * x + digit, 0x110000].min
      digits += 1
      next
    end
    # A terminating ";" belongs to the reference; anything else does not.
    i -= 1 if ch != ?;
    break
  end
  return swap(data, dst, src) if i <= 3 || digits.zero?
  if 0x80 <= x && x <= 0x9F
    x = REPLACEMENT_TABLE[x - 0x80].ord
  elsif x.zero? || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF
    x = 0xFFFD
  end
  encoded = x.chr(Encoding::UTF_8)
  encoded.each_byte.with_index { |byte, j| data.setbyte(dst + j, byte) }
  [dst + encoded.bytesize, src + i]
end
|
145
|
+
|
146
|
+
# Copies the single byte at byte offset +src+ to byte offset +dst+.
# Callers pass byte offsets, so getbyte/setbyte are used; String#[] and
# String#[]= index by character and pick the wrong position (or raise) once
# the buffer holds a multibyte character.
# @param [String] data buffer that is rewritten in place
# @param [Integer] dst byte offset to write to
# @param [Integer] src byte offset to read from
# @return [Array<Integer>] [dst + 1, src + 1]
def swap(data, dst, src)
  data.setbyte(dst, data.getbyte(src))
  [dst + 1, src + 1]
end
|
150
|
+
|
151
|
+
# Advances past the ASCII letters and digits of +s+ starting at index +i+.
# A ";" that directly follows them is included; any other character stops the
# scan without being consumed.
# @param [String] s
# @param [Integer] i index to start from
# @return [Integer] index just past the consumed characters
def consume_entity_chars(s, i)
  limit = s.length
  while i < limit
    ch = s[i]
    alphanumeric = ('a'..'z').cover?(ch) || ('A'..'Z').cover?(ch) || ('0'..'9').cover?(ch)
    return ch == ?; ? i + 1 : i unless alphanumeric
    i += 1
  end
  i
end
|
161
|
+
|
162
|
+
# Tells whether an entity-like name looks like a query-string key: it does
# not end with ";" and the character of +s+ at index +i+ is "=".
# @param [String] name candidate entity name
# @param [String] s text the name was taken from
# @param [Integer] i index in s just past the name
# @return [Boolean]
def like_query_params?(name, s, i)
  return false if name.end_with?(';')
  return false unless i < s.length
  s[i] == ?=
end
|
165
|
+
|
166
|
+
# Writes the bytes of every string in +entities+ into +t+, one after another,
# starting at byte offset +dst+.
# @param [Array<String>] entities replacement strings
# @param [String] t buffer that is rewritten in place
# @param [Integer] dst byte offset to write to
# @param [Integer] src byte offset the reference started at
# @param [Integer] i number of source bytes the reference occupied
# @return [Array<Integer>] [offset just past the written bytes, src + i]
def replace_entity(entities, t, dst, src, i)
  cursor = dst
  entities.each do |entity|
    entity.each_byte do |byte|
      t.setbyte(cursor, byte)
      cursor += 1
    end
  end
  [cursor, src + i]
end
|
172
|
+
end
|
173
|
+
end
|
174
|
+
end
|
@@ -0,0 +1,229 @@
|
|
1
|
+
require 'gammo/tokenizer/debug'
|
2
|
+
|
3
|
+
module Gammo
|
4
|
+
class Tokenizer
|
5
|
+
class ScriptScanner
|
6
|
+
include Debug
|
7
|
+
|
8
|
+
attr_reader :scanner, :buffer, :raw_tag
|
9
|
+
|
10
|
+
alias_method :debug?, :debug
|
11
|
+
|
12
|
+
# Stores the collaborators and starts with an empty output buffer.
# @param [Object] scanner scanner to read bytes from (get_byte, unscan, eos?
#   and pos are called on it by the state methods)
# @param [String] raw_tag name of the raw-text element whose end tag stops
#   the scan
# @param [Boolean] debug stored in @debug
def initialize(scanner, raw_tag:, debug: false)
  @scanner, @raw_tag, @debug = scanner, raw_tag, debug
  @buffer = ''
end
|
18
|
+
|
19
|
+
# Runs the script-data scan and hands back what was collected.
# The buffer is read only after scanning because the state methods may
# replace it with a new string.
# @return [String] the collected buffer
def scan
  scan_script_data
  return buffer
end
|
23
|
+
|
24
|
+
private
|
25
|
+
|
26
|
+
# Drives the script-data states until the matching raw end tag is found or
# the input runs out.
#
# Each state method below handles one step and returns the name of the next
# state (a Symbol) or nil to stop. The states used to call each other
# directly, one nested call per input byte, so a long script exhausted the
# stack and depended on rescuing SystemStackError and continuing in a Fiber.
# Running them from this loop keeps the call depth constant; the
# with_extendable_stack helper is therefore gone.
# @return [nil]
def scan_script_data
  state = :scan_script_data_content
  state = send(state) while state
  nil
end

# Plain script data: "<" may start an end tag or an escape sequence.
def scan_script_data_content
  consume { |byte| byte == ?< ? :scan_script_data_less_than_sign : :scan_script_data_content }
end

# After "<" in script data.
def scan_script_data_less_than_sign
  consume do |byte|
    case byte
    when ?/ then :scan_script_data_end_tag_open
    when ?! then :scan_script_data_escape_start
    else
      revert_byte
      :scan_script_data_content
    end
  end
end

# After "</" in script data: stop on the raw end tag or at end of input.
def scan_script_data_end_tag_open
  return if scan_raw_end_tag? || scanner.eos?
  :scan_script_data_content
end

# After "<!".
def scan_script_data_escape_start
  consume do |byte|
    next :scan_script_data_escape_start_dash if byte == ?-
    revert_byte
    :scan_script_data_content
  end
end

# After "<!-".
def scan_script_data_escape_start_dash
  consume do |byte|
    next :scan_script_data_escaped_dash_dash if byte == ?-
    revert_byte
    :scan_script_data_content
  end
end

# Inside "<!-- ... -->".
def scan_script_data_escaped
  consume do |byte|
    case byte
    when ?- then :scan_script_data_escaped_dash
    when ?< then :scan_script_data_escaped_less_than_sign
    else :scan_script_data_escaped
    end
  end
end

# Inside the escaped section, after one "-".
def scan_script_data_escaped_dash
  consume do |byte|
    case byte
    when ?- then :scan_script_data_escaped_dash_dash
    when ?< then :scan_script_data_escaped_less_than_sign
    else :scan_script_data_escaped
    end
  end
end

# Inside the escaped section, after "--"; ">" leaves the escaped section.
def scan_script_data_escaped_dash_dash
  consume do |byte|
    case byte
    when ?- then :scan_script_data_escaped_dash_dash
    when ?< then :scan_script_data_escaped_less_than_sign
    when ?> then :scan_script_data_content
    else :scan_script_data_escaped
    end
  end
end

# Inside the escaped section, after "<".
def scan_script_data_escaped_less_than_sign
  consume do |byte|
    next :scan_script_data_escaped_end_tag_open if byte == ?/
    next :scan_script_data_double_escape_start if ascii_letter?(byte)
    revert_byte
    :scan_script_data_content
  end
end

# Inside the escaped section, after "</".
def scan_script_data_escaped_end_tag_open
  return if scan_raw_end_tag? || scanner.eos?
  :scan_script_data_escaped
end

# Inside the escaped section, after "<" and a letter: checks for "<script"
# followed by a tag-name terminator.
def scan_script_data_double_escape_start
  # Give the letter back so that the whole word can be compared.
  revert_byte
  'script'.each_char do |expected|
    ch = scanner.get_byte
    buffer << ch
    return if scanner.eos?
    next if same_letter?(ch, expected)
    revert_byte
    return :scan_script_data_escaped
  end
  byte = scanner.get_byte
  buffer << byte
  return if scanner.eos?
  return :scan_script_data_double_escaped if tag_name_terminator?(byte)
  revert_byte
  :scan_script_data_escaped
end

# Inside "<!-- <script ...".
def scan_script_data_double_escaped
  consume do |byte|
    case byte
    when ?- then :scan_script_data_double_escaped_dash
    when ?< then :scan_script_data_double_escaped_less_than_sign
    else :scan_script_data_double_escaped
    end
  end
end

# Inside the double-escaped section, after one "-".
def scan_script_data_double_escaped_dash
  consume do |byte|
    case byte
    when ?- then :scan_script_data_double_escaped_dash_dash
    when ?< then :scan_script_data_double_escaped_less_than_sign
    else :scan_script_data_double_escaped
    end
  end
end

# Inside the double-escaped section, after "--".
def scan_script_data_double_escaped_dash_dash
  consume do |byte|
    case byte
    when ?- then :scan_script_data_double_escaped_dash_dash
    when ?< then :scan_script_data_double_escaped_less_than_sign
    when ?> then :scan_script_data_content
    else :scan_script_data_double_escaped
    end
  end
end

# Inside the double-escaped section, after "<".
def scan_script_data_double_escaped_less_than_sign
  consume do |byte|
    next :scan_script_data_double_escape_end if byte == ?/
    revert_byte
    :scan_script_data_double_escaped
  end
end

# Inside the double-escaped section, after "</": a raw end tag here is kept
# as script text and only ends the double-escaped section.
def scan_script_data_double_escape_end
  if scan_raw_end_tag?
    # "</" + tag name + the terminating character.
    end_tag_size = raw_tag.bytesize + 3
    # scanner.pos is a byte offset, so the tag is taken with byteslice;
    # String#slice counts characters and read the wrong span after any
    # multibyte character.
    buffer << scanner.string.byteslice(scanner.pos, end_tag_size)
    scanner.pos += end_tag_size
    return :scan_script_data_escaped
  end
  return if scanner.eos?
  :scan_script_data_double_escaped
end

# Reads one byte, appends it to the buffer and yields it.
# @return [Object, nil] the block's value, or nil at end of input
def consume
  return unless (byte = scanner.get_byte)
  buffer << byte
  yield byte
end

# Takes back the byte read last: drops it from the buffer and rewinds the
# scanner to where it was before that read.
def revert_byte
  @buffer = buffer.slice(0, buffer.length - 1)
  scanner.unscan
end

# Tries to read the raw tag name followed by a terminator, assuming "</" has
# just been consumed. On success the scanner is rewound to just before "</"
# and "</" plus the tag name are removed from the buffer. On failure the
# matched part of the name stays consumed and buffered, and the byte that
# did not match is left unread.
# @return [Boolean]
def scan_raw_end_tag?
  raw_tag.each_char do |expected|
    return false unless (byte = scanner.get_byte)
    unless same_letter?(byte, expected)
      scanner.unscan
      return false
    end
    buffer << byte
  end
  byte = scanner.get_byte
  return false unless byte
  if tag_name_terminator?(byte)
    # 2 for the leading "</" plus 1 for the terminator just read.
    desired = 3 + raw_tag.length
    scanner.pos -= desired
    # The terminator was never buffered, hence the + 1.
    @buffer = buffer.slice(0, buffer.length - desired + 1)
    return true
  end
  scanner.unscan
  false
end

# Characters that end a tag name: whitespace, "/" and ">". Tab, LF, FF and CR
# are accepted as well as the space, so "</script\n>" also closes the element.
def tag_name_terminator?(byte)
  case byte
  when ?\s, ?\n, ?\r, ?\t, ?\f, ?/, ?> then true
  else false
  end
end

# ASCII case-insensitive comparison of a scanned byte with a lowercase
# letter. The byte may be one half of a multibyte character; case-mapping it
# as UTF-8 text can raise, so it is compared as binary.
def same_letter?(byte, expected)
  byte.b.downcase == expected
end

# True when the scanned byte is an ASCII letter. Checked numerically because
# matching a regexp against an invalid byte sequence raises.
def ascii_letter?(byte)
  code = byte.getbyte(0) | 0x20
  code >= 0x61 && code <= 0x7A
end
|
227
|
+
end
|
228
|
+
end
|
229
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'gammo/tokenizer/escape'
|
2
|
+
|
3
|
+
module Gammo
|
4
|
+
class Tokenizer
|
5
|
+
# Plain token: text data plus optional attributes and tag.
class BaseToken
  attr_accessor :attributes, :data, :tag

  # @param [String] data token text
  # @param [Array] attributes attributes attached to the token
  # @param [Object, nil] tag tag attached to the token
  def initialize(data = '', attributes: [], tag: nil)
    @data, @attributes, @tag = data, attributes, tag
  end

  # Appends +s+ to the token data in place.
  # @param [String] s
  # @return [String] the token data
  def concat(s)
    data << s
  end
end
|
18
|
+
|
19
|
+
# Token whose data is normalised on load: newlines are converted, NUL
# characters are optionally replaced and, unless the token is raw, character
# references are unescaped.
class EscapedToken < BaseToken
  include Escape

  NULL = ?\x00.freeze
  REPLACEMENT = "\ufffd".freeze

  attr_accessor :convert_null, :raw

  # @param [String, nil] data unprocessed token text
  # @param [Boolean] raw when true the text is not unescaped
  # @param [Boolean] convert_null when true NUL characters are replaced
  # @param [Hash] options forwarded to BaseToken#initialize
  def initialize(data = nil, raw: false, convert_null: false, **options)
    super(data, **options)
    @raw, @convert_null = raw, convert_null
    load_data(data)
  end

  # Normalises +raw_data+ and stores the result as the token data.
  # @param [String, nil] raw_data
  # @return [String, nil]
  def load_data(raw_data)
    return @data = nil unless raw_data
    text = convert_newlines(raw_data).force_encoding(Encoding::UTF_8)
    text = text.gsub(%r{#{NULL}}, REPLACEMENT) if should_convert_null?(text)
    @data = if require_raw_data?
              text
            else
              unescape(text, in_attribute: false)
            end
  end

  private

  alias_method :convert_null?, :convert_null
  alias_method :require_raw_data?, :raw

  # NUL is replaced when asked for explicitly, and always for comment tokens.
  def should_convert_null?(data)
    return data unless data
    wanted = convert_null? || self.class == CommentToken
    wanted && data.include?(NULL)
  end

  # Turns "\r\n" and lone "\r" into "\n"; returns a new string.
  def convert_newlines(s)
    s.gsub(/(\r\n|\r)/, ?\n)
  end
end
|
57
|
+
|
58
|
+
# Concrete token types. Text, comment and doctype tokens normalise their data
# through EscapedToken; the others keep it as given.
class ErrorToken < BaseToken; end
class TextToken < EscapedToken; end
class StartTagToken < BaseToken; end
class EndTagToken < BaseToken; end
class SelfClosingTagToken < BaseToken; end
class CommentToken < EscapedToken; end
class DoctypeToken < EscapedToken; end
|
65
|
+
end
|
66
|
+
end
|