gammo 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.travis.yml +6 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +27 -0
- data/LICENSE.txt +21 -0
- data/README.md +177 -0
- data/Rakefile +25 -0
- data/gammo.gemspec +23 -0
- data/lib/gammo.rb +15 -0
- data/lib/gammo/attribute.rb +17 -0
- data/lib/gammo/fragment_parser.rb +65 -0
- data/lib/gammo/node.rb +157 -0
- data/lib/gammo/parser.rb +524 -0
- data/lib/gammo/parser/constants.rb +94 -0
- data/lib/gammo/parser/foreign.rb +307 -0
- data/lib/gammo/parser/insertion_mode.rb +74 -0
- data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
- data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
- data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
- data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
- data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
- data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
- data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
- data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
- data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
- data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
- data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
- data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
- data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
- data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
- data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
- data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
- data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
- data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
- data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
- data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
- data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
- data/lib/gammo/parser/insertion_mode/text.rb +32 -0
- data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
- data/lib/gammo/parser/node_stack.rb +24 -0
- data/lib/gammo/tags.rb +9 -0
- data/lib/gammo/tags/table.rb +744 -0
- data/lib/gammo/tokenizer.rb +373 -0
- data/lib/gammo/tokenizer/debug.rb +34 -0
- data/lib/gammo/tokenizer/entity.rb +2240 -0
- data/lib/gammo/tokenizer/escape.rb +174 -0
- data/lib/gammo/tokenizer/script_scanner.rb +229 -0
- data/lib/gammo/tokenizer/tokens.rb +66 -0
- data/lib/gammo/version.rb +3 -0
- data/misc/html.yaml +384 -0
- data/misc/table.erubi +14 -0
- metadata +97 -0
@@ -0,0 +1,174 @@
|
|
1
|
+
require 'gammo/tokenizer/entity'
|
2
|
+
|
3
|
+
module Gammo
|
4
|
+
class Tokenizer
|
5
|
+
module Escape
  # Upper bound on the length of the entity-name prefixes tried when a name
  # is not found as a whole (see #unescape_entity).
  LONGEST_ENTITY_WITHOUT_SEMICOLON = 6

  # Characters rewritten by {#escape} and the character references that
  # replace them.
  # NOTE(review): the numeric forms (&#39; &#34; &#13;) were restored from a
  # listing in which the references had been decoded; confirm against the
  # released gem.
  ESCAPE_REPLACEMENT_TABLE = {
    '&'  => '&amp;',
    "'"  => '&#39;',
    '<'  => '&lt;',
    '>'  => '&gt;',
    '"'  => '&#34;',
    "\r" => '&#13;',
  }.freeze

  # Substitutes for numeric character references in the range 0x80..0x9F,
  # indexed by (codepoint - 0x80).
  REPLACEMENT_TABLE = [
    "\u{20AC}",
    "\u{0081}",
    "\u{201A}",
    "\u{0192}",
    "\u{201E}",
    "\u{2026}",
    "\u{2020}",
    "\u{2021}",
    "\u{02C6}",
    "\u{2030}",
    "\u{0160}",
    "\u{2039}",
    "\u{0152}",
    "\u{008D}",
    "\u{017D}",
    "\u{008F}",
    "\u{0090}",
    "\u{2018}",
    "\u{2019}",
    "\u{201C}",
    "\u{201D}",
    "\u{2022}",
    "\u{2013}",
    "\u{2014}",
    "\u{02DC}",
    "\u{2122}",
    "\u{0161}",
    "\u{203A}",
    "\u{0153}",
    "\u{009D}",
    "\u{017E}",
    "\u{0178}",
  ].freeze

  # Byte value of "&", the character that introduces a reference.
  AMPERSAND_BYTE = '&'.ord

  # Escapes given string according to {ESCAPE_REPLACEMENT_TABLE}.
  # The string is modified in place.
  # @param [String] s
  # @return [String] the same string object, also when nothing was replaced
  def escape(s)
    # gsub! yields nil when there is no match, so return the receiver explicitly.
    s.gsub!(/[&'<>"\r]/) { |ch| ESCAPE_REPLACEMENT_TABLE[ch] }
    s
  end

  # Unescapes given data.
  # The argument itself is never modified: decoding happens on a copy, and
  # the argument is returned untouched when it contains no "&".
  # @param [String] data
  # @param [Hash] options forwarded to the entity decoder (+in_attribute:+)
  # @return [String, nil] nil when data is nil
  def unescape(data, **options)
    return unless data
    first = data.each_byte.find_index(AMPERSAND_BYTE)
    return data unless first

    buf = data.dup
    # Only setbyte is used on buf, so its byte size is fixed for the whole pass.
    limit = buf.bytesize
    # dst is the write offset, src the read offset (both in bytes); dst <= src.
    dst, src = unescape_entity(buf, first, first, **options)
    while src < limit
      byte = buf.getbyte(src)
      if byte == AMPERSAND_BYTE
        dst, src = unescape_entity(buf, dst, src, **options)
      else
        buf.setbyte(dst, byte)
        dst += 1
        src += 1
      end
    end
    buf.byteslice(0, dst)
  end

  private

  # Decodes the reference starting at byte offset src ("&") and writes the
  # result at byte offset dst.
  # @return [Array(Integer, Integer)] the advanced [dst, src] offsets
  def unescape_entity(data, dst, src, in_attribute: false)
    # No need to count "&".
    i = 1
    s = data.byteslice(src..-1)
    return swap(data, dst, src) if s.length <= 1
    return unescape_sharp_entity(data, s, dst, src, i) if s[i] == '#'

    i = consume_entity_chars(s, i)
    name = s.byteslice(1, i - 1)
    unless name.empty? || (in_attribute && like_query_params?(name, s, i))
      entities = Entity::CODEPOINT[name.to_sym]
      entities = entities ? [entities] : Entity::TWO_CODEPOINTS[name.to_sym]
      return replace_entity(entities, data, dst, src, i) if entities

      unless in_attribute
        # Outside attributes, fall back to the longest known prefix of the name.
        max = [name.length - 1, LONGEST_ENTITY_WITHOUT_SEMICOLON].min
        max.downto(1) do |n|
          entity = Entity::CODEPOINT[name.byteslice(0, n).to_sym]
          return replace_entity([entity], data, dst, src, n + 1) if entity
        end
      end
    end
    # Not a known entity: keep the i scanned bytes verbatim.
    copy_bytes(data, dst, src, i)
  end

  # Decodes a numeric reference ("&#...", decimal or hexadecimal).
  # @return [Array(Integer, Integer)] the advanced [dst, src] offsets
  def unescape_sharp_entity(data, s, dst, src, i)
    return swap(data, dst, src) if s.length <= 3
    i += 1
    hex = s[i] == 'x' || s[i] == 'X'
    i += 1 if hex
    base = hex ? 16 : 10
    x = 0
    while i < s.length
      ch = s[i]
      i += 1
      digit = digit_value(ch, hex)
      if digit
        x = base * x + digit
        next
      end
      # A terminating ";" belongs to the reference; anything else does not.
      i -= 1 if ch != ';'
      break
    end
    return swap(data, dst, src) if i <= 3

    if x.between?(0x80, 0x9F)
      x = REPLACEMENT_TABLE[x - 0x80].ord
    elsif x.zero? || x.between?(0xD800, 0xDFFF) || x > 0x10FFFF
      x = "\u{FFFD}".ord
    end
    encoded = x.chr(Encoding::UTF_8)
    encoded.each_byte.with_index { |byte, j| data.setbyte(dst + j, byte) }
    [dst + encoded.bytesize, src + i]
  end

  # @return [Integer, nil] numeric value of ch as a digit, or nil when ch is
  #   not a digit in the requested base
  def digit_value(ch, hex)
    if ch.between?('0', '9')
      ch.ord - '0'.ord
    elsif hex && ch.between?('a', 'f')
      ch.ord - 'a'.ord + 10
    elsif hex && ch.between?('A', 'F')
      ch.ord - 'A'.ord + 10
    end
  end

  # Copies the single byte at src to dst.
  # @return [Array(Integer, Integer)] both offsets advanced by one
  def swap(data, dst, src)
    data.setbyte(dst, data.getbyte(src))
    [dst + 1, src + 1]
  end

  # Copies count bytes from src to dst. A forward copy is safe because
  # callers keep dst <= src.
  # @return [Array(Integer, Integer)] both offsets advanced by count
  def copy_bytes(data, dst, src, count)
    count.times { |k| data.setbyte(dst + k, data.getbyte(src + k)) } unless dst == src
    [dst + count, src + count]
  end

  # Advances i over ASCII letters and digits, plus one terminating ";".
  # @return [Integer] index just past the scanned name
  def consume_entity_chars(s, i)
    while i < s.length
      ch = s[i]
      i += 1
      next if ch.between?('a', 'z') || ch.between?('A', 'Z') || ch.between?('0', '9')
      i -= 1 if ch != ';'
      break
    end
    i
  end

  # True when the name has no terminating ";" and is directly followed by "=".
  def like_query_params?(name, s, i)
    name[name.length - 1] != ';' && s.length > i && s[i] == '='
  end

  # Writes the bytes of every string in entities at dst.
  # @return [Array(Integer, Integer)] [offset after the written bytes, src + i]
  def replace_entity(entities, t, dst, src, i)
    written = entities.inject(dst) do |offset, ch|
      ch.each_byte.with_index { |byte, j| t.setbyte(offset + j, byte) }
      offset + ch.bytesize
    end
    [written, src + i]
  end
end
|
173
|
+
end
|
174
|
+
end
|
@@ -0,0 +1,229 @@
|
|
1
|
+
require 'gammo/tokenizer/debug'
|
2
|
+
|
3
|
+
module Gammo
|
4
|
+
class Tokenizer
|
5
|
+
# Scans the raw text that follows a script start tag, collecting every byte
# up to (but not including) the matching end tag.
#
# The scanner is a state machine. Each private +scan_*+ method handles one
# state: it consumes input and returns the name of the next state as a
# Symbol, or nil to stop. {#scan} runs the states in a loop, so stack depth
# does not grow with the size of the script.
class ScriptScanner
  include Debug

  attr_reader :scanner, :buffer, :raw_tag

  alias_method :debug?, :debug

  # @param scanner a StringScanner-like object; the methods used here are
  #   get_byte, unscan, eos?, pos, pos= and string
  # @param [String] raw_tag name of the raw-text element whose end tag
  #   terminates the scan (compared against down-cased input)
  # @param [Boolean] debug stored in @debug
  def initialize(scanner, raw_tag:, debug: false)
    @scanner = scanner
    @buffer = ''
    @raw_tag = raw_tag
    @debug = debug
  end

  # Runs the state machine until a state returns nil.
  # @return [String] the bytes collected so far
  def scan
    state = :scan_script_data
    state = send(state) while state
    buffer
  end

  private

  def scan_script_data
    consume do |byte|
      byte == '<' ? :scan_script_data_less_than_sign : :scan_script_data
    end
  end

  def scan_script_data_less_than_sign
    consume do |byte|
      case byte
      when '/' then :scan_script_data_end_tag_open
      when '!' then :scan_script_data_escape_start
      else
        revert_byte
        :scan_script_data
      end
    end
  end

  def scan_script_data_end_tag_open
    return if scan_raw_end_tag? || scanner.eos?
    :scan_script_data
  end

  def scan_script_data_escape_start
    consume do |byte|
      next :scan_script_data_escape_start_dash if byte == '-'
      revert_byte
      :scan_script_data
    end
  end

  def scan_script_data_escape_start_dash
    consume do |byte|
      next :scan_script_data_escaped_dash_dash if byte == '-'
      revert_byte
      :scan_script_data
    end
  end

  def scan_script_data_escaped
    consume do |byte|
      case byte
      when '-' then :scan_script_data_escaped_dash
      when '<' then :scan_script_data_escaped_less_than_sign
      else :scan_script_data_escaped
      end
    end
  end

  def scan_script_data_escaped_dash
    consume do |byte|
      case byte
      when '-' then :scan_script_data_escaped_dash_dash
      when '<' then :scan_script_data_escaped_less_than_sign
      else :scan_script_data_escaped
      end
    end
  end

  def scan_script_data_escaped_dash_dash
    consume do |byte|
      case byte
      when '-' then :scan_script_data_escaped_dash_dash
      when '<' then :scan_script_data_escaped_less_than_sign
      when '>' then :scan_script_data
      else :scan_script_data_escaped
      end
    end
  end

  def scan_script_data_escaped_less_than_sign
    consume do |byte|
      next :scan_script_data_escaped_end_tag_open if byte == '/'
      next :scan_script_data_double_escape_start if byte =~ /[a-zA-Z]/
      revert_byte
      :scan_script_data
    end
  end

  def scan_script_data_escaped_end_tag_open
    return if scan_raw_end_tag? || scanner.eos?
    :scan_script_data_escaped
  end

  # Checks whether the input continues with "script" (any letter case)
  # followed by a space, "/" or ">".
  def scan_script_data_double_escape_start
    # The letter consumed by the previous state is re-read by the loop below.
    revert_byte
    'script'.each_char do |expected|
      byte = scanner.get_byte
      buffer << byte
      return if scanner.eos?
      next if byte.downcase == expected
      revert_byte
      return :scan_script_data_escaped
    end
    byte = scanner.get_byte
    buffer << byte
    return if scanner.eos?
    case byte
    when ' ', '/', '>' then :scan_script_data_double_escaped
    else
      revert_byte
      :scan_script_data_escaped
    end
  end

  def scan_script_data_double_escaped
    consume do |byte|
      case byte
      when '-' then :scan_script_data_double_escaped_dash
      when '<' then :scan_script_data_double_escaped_less_than_sign
      else :scan_script_data_double_escaped
      end
    end
  end

  def scan_script_data_double_escaped_dash
    consume do |byte|
      case byte
      when '-' then :scan_script_data_double_escaped_dash_dash
      when '<' then :scan_script_data_double_escaped_less_than_sign
      else :scan_script_data_double_escaped
      end
    end
  end

  def scan_script_data_double_escaped_dash_dash
    consume do |byte|
      case byte
      when '-' then :scan_script_data_double_escaped_dash_dash
      when '<' then :scan_script_data_double_escaped_less_than_sign
      when '>' then :scan_script_data
      else :scan_script_data_double_escaped
      end
    end
  end

  def scan_script_data_double_escaped_less_than_sign
    consume do |byte|
      next :scan_script_data_double_escape_end if byte == '/'
      revert_byte
      :scan_script_data_double_escaped
    end
  end

  def scan_script_data_double_escape_end
    if scan_raw_end_tag?
      end_tag = "</#{raw_tag}>"
      # scan_raw_end_tag? rewound the scanner to the "<" of the end tag and
      # dropped the tag from the buffer; here the tag is content, so put it
      # back. scanner.pos is a byte offset, hence byteslice/bytesize.
      buffer << scanner.string.byteslice(scanner.pos, end_tag.bytesize)
      scanner.pos += end_tag.bytesize
      return :scan_script_data_escaped
    end
    return if scanner.eos?
    :scan_script_data_double_escaped
  end

  # Reads one byte, appends it to the buffer and yields it.
  # @return the block's value, or nil at end of input
  def consume
    byte = scanner.get_byte
    return unless byte
    buffer << byte
    yield byte
  end

  # Undoes the most recent read: drops the last buffered byte and moves the
  # scanner back to where it was before its last match.
  def revert_byte
    @buffer = buffer.byteslice(0, buffer.bytesize - 1)
    scanner.unscan
  end

  # Tries to read raw_tag followed by ">", a space or "/".
  # On success the scanner is rewound to the "<" of the end tag and the
  # "</" + raw_tag bytes are removed from the buffer. On failure the bytes
  # that did match stay in the buffer and the mismatching byte is left
  # unread.
  # @return [Boolean]
  def scan_raw_end_tag?
    raw_tag.each_char do |expected|
      byte = scanner.get_byte
      return false unless byte
      if byte.downcase != expected
        scanner.unscan
        return false
      end
      buffer << byte
    end
    case scanner.get_byte
    when '>', ' ', '/'
      # "<" + "/" + raw_tag + terminator were consumed; the terminator was
      # never buffered, hence the "+ 1".
      consumed = 3 + raw_tag.length
      scanner.pos -= consumed
      @buffer = buffer.slice(0, buffer.length - consumed + 1)
      true
    when nil
      false
    else
      scanner.unscan
      false
    end
  end
end
|
228
|
+
end
|
229
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'gammo/tokenizer/escape'
|
2
|
+
|
3
|
+
module Gammo
|
4
|
+
class Tokenizer
|
5
|
+
# Plain token: a data string plus optional attributes and tag.
class BaseToken
  attr_accessor :attributes, :data, :tag

  # @param [String] data
  # @param [Array] attributes
  # @param tag
  def initialize(data = '', attributes: [], tag: nil)
    @data, @attributes, @tag = data, attributes, tag
  end

  # Appends s to the token data in place.
  # @return [String] the token data
  def concat(s)
    data.concat(s)
  end
end
|
18
|
+
|
19
|
+
# Token whose data is normalized when loaded: newlines are converted, NUL
# characters are optionally replaced, and the text is unescaped unless the
# token was created with raw: true.
class EscapedToken < BaseToken
  include Escape

  NULL = ?\x00.freeze
  REPLACEMENT = "\ufffd".freeze

  attr_accessor :convert_null, :raw

  # @param [String, nil] data
  # @param [Boolean] raw keep the data escaped
  # @param [Boolean] convert_null replace NUL characters with REPLACEMENT
  # @param [Hash] options forwarded to BaseToken#initialize
  def initialize(data = nil, raw: false, convert_null: false, **options)
    super(data, **options)
    @raw, @convert_null = raw, convert_null
    load_data(data)
  end

  # Normalizes raw_data and stores the result in @data (nil when raw_data
  # is nil or false).
  def load_data(raw_data)
    return @data = nil unless raw_data

    text = convert_newlines(raw_data).force_encoding(Encoding::UTF_8)
    text = text.gsub(%r{#{NULL}}, REPLACEMENT) if should_convert_null?(text)
    @data = if require_raw_data?
              text
            else
              unescape(text, in_attribute: false)
            end
  end

  private

  alias_method :convert_null?, :convert_null
  alias_method :require_raw_data?, :raw

  # NUL replacement applies when requested explicitly and always for
  # comment tokens, provided the data actually contains a NUL.
  def should_convert_null?(data)
    wanted = convert_null? || self.class == CommentToken
    data && wanted && data.include?(NULL)
  end

  # Rewrites "\r\n" and lone "\r" as "\n".
  def convert_newlines(s)
    s.gsub(/(\r\n|\r)/, ?\n)
  end
end
|
57
|
+
|
58
|
+
# Concrete token classes. Text, comment and doctype tokens inherit the data
# normalization of EscapedToken; the others are plain BaseTokens.
class ErrorToken < BaseToken; end
class TextToken < EscapedToken; end
class StartTagToken < BaseToken; end
class EndTagToken < BaseToken; end
class SelfClosingTagToken < BaseToken; end
class CommentToken < EscapedToken; end
class DoctypeToken < EscapedToken; end
|
65
|
+
end
|
66
|
+
end
|