gammo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.travis.yml +6 -0
  4. data/Gemfile +9 -0
  5. data/Gemfile.lock +27 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +177 -0
  8. data/Rakefile +25 -0
  9. data/gammo.gemspec +23 -0
  10. data/lib/gammo.rb +15 -0
  11. data/lib/gammo/attribute.rb +17 -0
  12. data/lib/gammo/fragment_parser.rb +65 -0
  13. data/lib/gammo/node.rb +157 -0
  14. data/lib/gammo/parser.rb +524 -0
  15. data/lib/gammo/parser/constants.rb +94 -0
  16. data/lib/gammo/parser/foreign.rb +307 -0
  17. data/lib/gammo/parser/insertion_mode.rb +74 -0
  18. data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
  19. data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
  20. data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
  21. data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
  22. data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
  23. data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
  24. data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
  25. data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
  26. data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
  27. data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
  28. data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
  29. data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
  30. data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
  31. data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
  32. data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
  33. data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
  34. data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
  35. data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
  36. data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
  37. data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
  38. data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
  39. data/lib/gammo/parser/insertion_mode/text.rb +32 -0
  40. data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
  41. data/lib/gammo/parser/node_stack.rb +24 -0
  42. data/lib/gammo/tags.rb +9 -0
  43. data/lib/gammo/tags/table.rb +744 -0
  44. data/lib/gammo/tokenizer.rb +373 -0
  45. data/lib/gammo/tokenizer/debug.rb +34 -0
  46. data/lib/gammo/tokenizer/entity.rb +2240 -0
  47. data/lib/gammo/tokenizer/escape.rb +174 -0
  48. data/lib/gammo/tokenizer/script_scanner.rb +229 -0
  49. data/lib/gammo/tokenizer/tokens.rb +66 -0
  50. data/lib/gammo/version.rb +3 -0
  51. data/misc/html.yaml +384 -0
  52. data/misc/table.erubi +14 -0
  53. metadata +97 -0
@@ -0,0 +1,174 @@
1
+ require 'gammo/tokenizer/entity'
2
+
3
+ module Gammo
4
+ class Tokenizer
5
# Helpers for escaping text for HTML output and for decoding HTML character
# references (named such as "&amp;", decimal such as "&#38;", hexadecimal
# such as "&#x26;").
#
# All offsets handled by the private helpers (+dst+, +src+) are BYTE offsets
# into the string being decoded; the string is only ever modified through
# +setbyte+, so its byte length never changes while it is being processed.
module Escape
  # Longest entity-name prefix that is tried when a name is not terminated
  # by a semicolon (see #unescape_entity).
  LONGEST_ENTITY_WITHOUT_SEMICOLON = 6

  # Characters rewritten by #escape, mapped to their replacements.
  ESCAPE_REPLACEMENT_TABLE = {
    '&' => '&amp;',
    "'" => '&#39;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&#34;',
    "\r" => '&#13;',
  }.freeze

  # Substitutes for numeric references in the range 0x80..0x9F, indexed by
  # (code point - 0x80). The first entry replaces 0x80, the last 0x9F.
  REPLACEMENT_TABLE = [
    "\u{20AC}", "\u{0081}", "\u{201A}", "\u{0192}",
    "\u{201E}", "\u{2026}", "\u{2020}", "\u{2021}",
    "\u{02C6}", "\u{2030}", "\u{0160}", "\u{2039}",
    "\u{0152}", "\u{008D}", "\u{017D}", "\u{008F}",
    "\u{0090}", "\u{2018}", "\u{2019}", "\u{201C}",
    "\u{201D}", "\u{2022}", "\u{2013}", "\u{2014}",
    "\u{02DC}", "\u{2122}", "\u{0161}", "\u{203A}",
    "\u{0153}", "\u{009D}", "\u{017E}", "\u{0178}",
  ].freeze

  # Escapes given string according to {ESCAPE_REPLACEMENT_TABLE}.
  # The string is modified in place.
  # @param [String] s
  # @return [String] +s+ itself, whether or not anything was replaced
  def escape(s)
    # gsub! answers nil when nothing matched; always hand the string back.
    s.gsub!(/[&'<>"\r]/, ESCAPE_REPLACEMENT_TABLE) || s
  end

  # Unescapes given data.
  #
  # Decoding happens in place: decoded bytes are written over the start of
  # +data+ (so it must not be frozen) and the decoded prefix is returned as a
  # new string. When +data+ holds no "&" it is returned as is.
  # @param [String] data
  # @param [Hash] options forwarded to the entity decoder; +in_attribute:+
  #   (default false) enables attribute-value rules
  # @return [String, nil] nil when +data+ is nil
  def unescape(data, **options)
    return unless data

    ampersand = '&'.ord
    first = data.each_byte.find_index(ampersand)
    return data unless first

    dst, src = unescape_entity(data, first, first, **options)
    while src < data.bytesize
      byte = data.getbyte(src)
      if byte == ampersand
        dst, src = unescape_entity(data, dst, src, **options)
      else
        data.setbyte(dst, byte)
        dst += 1
        src += 1
      end
    end
    data.byteslice(0, dst)
  end

  private

  # Decodes the reference starting at byte +src+ (which holds "&") and writes
  # the result at byte +dst+.
  # @return [Array(Integer, Integer)] the advanced [dst, src] pair
  def unescape_entity(data, dst, src, in_attribute: false)
    # No need to count "&".
    i = 1
    s = data.byteslice(src..-1)
    # A lone trailing "&" is copied through unchanged.
    return swap(data, dst, src) if s.length <= 1
    return unescape_sharp_entity(data, s, dst, src, i) if s[i] == '#'

    i = consume_entity_chars(s, i)
    name = s.byteslice(1, i - 1)
    unless name.empty? || (in_attribute && like_query_params?(name, s, i))
      entities = Entity::CODEPOINT[name.to_sym]
      entities = entities ? [entities] : Entity::TWO_CODEPOINTS[name.to_sym]
      return replace_entity(entities, data, dst, src, i) if entities

      unless in_attribute
        # Fall back to the longest known prefix of the name.
        max = [name.length - 1, LONGEST_ENTITY_WITHOUT_SEMICOLON].min
        max.downto(1) do |n|
          entity = Entity::CODEPOINT[name.byteslice(0, n).to_sym]
          return replace_entity([entity], data, dst, src, n + 1) if entity
        end
      end
    end

    # Not a known reference: copy the +i+ consumed bytes through verbatim.
    # They are all ASCII, so +i+ is a byte count as well as a character count;
    # dst <= src, so a forward byte copy never clobbers unread input.
    i.times { |offset| data.setbyte(dst + offset, data.getbyte(src + offset)) }
    [dst + i, src + i]
  end

  # Decodes a numeric reference ("&#...;" or "&#x...;"). +s+ is the input
  # from the "&" onwards and +i+ the index of the "#" within it.
  # @return [Array(Integer, Integer)] the advanced [dst, src] pair
  def unescape_sharp_entity(data, s, dst, src, i)
    return swap(data, dst, src) if s.length <= 3

    i += 1
    hex = s[i] == 'x' || s[i] == 'X'
    i += 1 if hex

    x = 0
    while i < s.length
      ch = s[i]
      i += 1
      if hex
        if ch.between?('0', '9')
          x = 16 * x + ch.ord - '0'.ord
          next
        elsif ch.between?('a', 'f')
          x = 16 * x + ch.ord - 'a'.ord + 10
          next
        elsif ch.between?('A', 'F')
          x = 16 * x + ch.ord - 'A'.ord + 10
          next
        end
      elsif ch.between?('0', '9')
        x = 10 * x + ch.ord - '0'.ord
        next
      end
      # A terminating ";" is consumed; any other character is left unread.
      i -= 1 if ch != ';'
      break
    end
    return swap(data, dst, src) if i <= 3

    if x.between?(0x80, 0x9F)
      x = REPLACEMENT_TABLE[x - 0x80].ord
    elsif x.zero? || x.between?(0xD800, 0xDFFF) || x > 0x10FFFF
      # Zero, surrogates and out-of-range values become U+FFFD.
      x = 0xFFFD
    end
    encoded = x.chr(Encoding::UTF_8)
    encoded.each_byte.with_index { |byte, j| data.setbyte(dst + j, byte) }
    [dst + encoded.bytesize, src + i]
  end

  # Copies the single byte at +src+ to +dst+.
  # @return [Array(Integer, Integer)] both offsets advanced by one
  def swap(data, dst, src)
    data.setbyte(dst, data.getbyte(src))
    [dst + 1, src + 1]
  end

  # Advances +i+ over ASCII letters and digits in +s+, also consuming one
  # terminating ";" if present.
  # @return [Integer] the index just past the consumed characters
  def consume_entity_chars(s, i)
    while i < s.length
      ch = s[i]
      i += 1
      next if ch.between?('a', 'z') || ch.between?('A', 'Z') || ch.between?('0', '9')
      i -= 1 if ch != ';'
      break
    end
    i
  end

  # True when +name+ has no trailing ";" and the character of +s+ at index
  # +i+ is "=", i.e. the text reads like "&name=" in a query string.
  def like_query_params?(name, s, i)
    !name.end_with?(';') && s.length > i && s[i] == '='
  end

  # Writes the bytes of every string in +entities+ into +t+ starting at byte
  # +dst+.
  # @return [Array(Integer, Integer)] [dst past the written bytes, src + i]
  def replace_entity(entities, t, dst, src, i)
    written = entities.inject(dst) do |pos, ch|
      ch.each_byte.with_index { |byte, j| t.setbyte(pos + j, byte) }
      pos + ch.bytesize
    end
    [written, src + i]
  end
end
173
+ end
174
+ end
@@ -0,0 +1,229 @@
1
+ require 'gammo/tokenizer/debug'
2
+
3
+ module Gammo
4
+ class Tokenizer
5
# Reads script text from a scanner one byte at a time, collecting everything
# it consumes into #buffer.
#
# Each private +scan_*+ method handles one state and continues by calling the
# method for the next state directly. The method names follow the "script
# data" states of the HTML tokenizer (NOTE(review): presumably a port of that
# state machine - confirm against the tokenizer driving this class).
class ScriptScanner
  include Debug

  attr_reader :scanner, :buffer, :raw_tag

  alias_method :debug?, :debug

  # @param scanner byte source; must respond to +get_byte+, +unscan+, +eos?+,
  #   +pos+, +pos=+ and +string+ (NOTE(review): looks like a StringScanner -
  #   confirm against the caller)
  # @param [String] raw_tag tag name matched by #scan_raw_end_tag?; input
  #   bytes are downcased before comparison, so a lowercase name is assumed
  # @param [Boolean] debug stored in +@debug+
  def initialize(scanner, raw_tag:, debug: false)
    @scanner, @buffer, @raw_tag, @debug = scanner, '', raw_tag, debug
  end

  # Runs the state machine from the script data state.
  # @return [String] everything collected in the buffer
  def scan
    scan_script_data
    buffer
  end

  private

  # Yields; if that raises SystemStackError, yields once more inside a new
  # Fiber.
  def with_extendable_stack
    yield
  rescue SystemStackError
    Fiber.new { yield }.resume
  end

  def scan_script_data
    with_extendable_stack do
      consume do |byte|
        if byte == '<'
          scan_script_data_less_than_sign
        else
          scan_script_data
        end
      end
    end
  end

  def scan_script_data_less_than_sign
    consume do |byte|
      case byte
      when '/' then scan_script_data_end_tag_open
      when '!' then scan_script_data_escape_start
      else
        revert_byte
        scan_script_data
      end
    end
  end

  # Stops when the end tag is matched or the input is exhausted.
  def scan_script_data_end_tag_open
    scan_script_data unless scan_raw_end_tag? || scanner.eos?
  end

  def scan_script_data_escape_start
    consume do |byte|
      if byte == '-'
        scan_script_data_escape_start_dash
      else
        revert_byte
        scan_script_data
      end
    end
  end

  def scan_script_data_escape_start_dash
    consume do |byte|
      if byte == '-'
        scan_script_data_escaped_dash_dash
      else
        revert_byte
        scan_script_data
      end
    end
  end

  def scan_script_data_escaped
    consume do |byte|
      case byte
      when '-' then scan_script_data_escaped_dash
      when '<' then scan_script_data_escaped_less_than_sign
      else scan_script_data_escaped
      end
    end
  end

  def scan_script_data_escaped_dash
    consume do |byte|
      case byte
      when '-' then scan_script_data_escaped_dash_dash
      when '<' then scan_script_data_escaped_less_than_sign
      else scan_script_data_escaped
      end
    end
  end

  def scan_script_data_escaped_dash_dash
    consume do |byte|
      case byte
      when '-' then scan_script_data_escaped_dash_dash
      when '<' then scan_script_data_escaped_less_than_sign
      when '>' then scan_script_data
      else scan_script_data_escaped
      end
    end
  end

  def scan_script_data_escaped_less_than_sign
    consume do |byte|
      if byte == '/'
        scan_script_data_escaped_end_tag_open
      elsif byte =~ /[a-zA-Z]/
        scan_script_data_double_escape_start
      else
        revert_byte
        scan_script_data
      end
    end
  end

  # Stops when the end tag is matched or the input is exhausted.
  def scan_script_data_escaped_end_tag_open
    scan_script_data_escaped unless scan_raw_end_tag? || scanner.eos?
  end

  # Re-reads the letter that led here and checks, case-insensitively, whether
  # the input continues with "script" followed by a space, "/" or ">".
  def scan_script_data_double_escape_start
    revert_byte
    'script'.each_char do |expected|
      actual = scanner.get_byte
      buffer << actual
      return if scanner.eos?
      next if actual.downcase == expected

      revert_byte
      return scan_script_data_escaped
    end

    terminator = scanner.get_byte
    buffer << terminator
    return if scanner.eos?

    if [' ', '/', '>'].include?(terminator)
      scan_script_data_double_escaped
    else
      revert_byte
      scan_script_data_escaped
    end
  end

  def scan_script_data_double_escaped
    consume do |byte|
      case byte
      when '-' then scan_script_data_double_escaped_dash
      when '<' then scan_script_data_double_escaped_less_than_sign
      else scan_script_data_double_escaped
      end
    end
  end

  def scan_script_data_double_escaped_dash
    consume do |byte|
      case byte
      when '-' then scan_script_data_double_escaped_dash_dash
      when '<' then scan_script_data_double_escaped_less_than_sign
      else scan_script_data_double_escaped
      end
    end
  end

  def scan_script_data_double_escaped_dash_dash
    consume do |byte|
      case byte
      when '-' then scan_script_data_double_escaped_dash_dash
      when '<' then scan_script_data_double_escaped_less_than_sign
      when '>' then scan_script_data
      else scan_script_data_double_escaped
      end
    end
  end

  def scan_script_data_double_escaped_less_than_sign
    consume do |byte|
      if byte == '/'
        scan_script_data_double_escape_end
      else
        revert_byte
        scan_script_data_double_escaped
      end
    end
  end

  def scan_script_data_double_escape_end
    unless scan_raw_end_tag?
      return if scanner.eos?

      return scan_script_data_double_escaped
    end

    closing_length = "</#{raw_tag}>".length
    # Last matched char needs to be concatenated.
    buffer << scanner.string.slice(scanner.pos, closing_length)
    scanner.pos += closing_length
    scan_script_data_escaped
  end

  # Reads one byte, appends it to the buffer and yields it. Returns nil
  # without yielding when the scanner has nothing left.
  def consume
    byte = scanner.get_byte
    return unless byte

    buffer << byte
    yield byte
  end

  # Drops the last character from the buffer and un-reads the scanner's last
  # match.
  def revert_byte
    @buffer = buffer.slice(0, buffer.length - 1)
    scanner.unscan
  end

  # Compares the upcoming bytes (downcased) with +raw_tag+.
  #
  # Returns true only when the whole name matches and is followed by ">", a
  # space or "/"; in that case the scanner is moved back by
  # 3 + raw_tag.length and the buffer is shortened to match. In every other
  # case it returns false.
  def scan_raw_end_tag?
    raw_tag.each_char do |expected|
      actual = scanner.get_byte
      return false unless actual

      unless actual.downcase == expected
        scanner.unscan
        return false
      end
      buffer << actual
    end

    following = scanner.get_byte
    return false if following.nil?

    if following == '>' || following == ' ' || following == '/'
      rewind = 3 + raw_tag.length
      scanner.pos -= rewind
      @buffer = buffer.slice(0, buffer.length - rewind + 1)
      return true
    end

    buffer << following
    scanner.unscan
    @buffer = buffer.slice(0, buffer.length - 1)
    false
  end
end
228
+ end
229
+ end
@@ -0,0 +1,66 @@
1
+ require 'gammo/tokenizer/escape'
2
+
3
+ module Gammo
4
+ class Tokenizer
5
# Plain token carrying its text, an attribute list and a tag.
class BaseToken
  attr_accessor :attributes, :data, :tag

  # @param data token text, kept exactly as given
  # @param attributes attribute list for the token
  # @param tag tag associated with the token, if any
  def initialize(data = '', attributes: [], tag: nil)
    @data, @attributes, @tag = data, attributes, tag
  end

  # Appends +s+ to the token's data.
  # @return the result of the append (+data << s+)
  def concat(s)
    data << s
  end
end
18
+
19
# Token whose data is normalised when it is loaded: newlines are converted,
# NUL characters are optionally replaced and, unless +raw+ is set, character
# references are unescaped.
class EscapedToken < BaseToken
  include Escape

  NULL = ?\x00.freeze
  REPLACEMENT = "\ufffd".freeze

  attr_accessor :convert_null, :raw

  # @param data text to load; nil leaves the token's data nil
  # @param raw when truthy, the unescape step is skipped
  # @param convert_null when truthy, NUL characters become REPLACEMENT
  # @param options remaining keywords, passed on to BaseToken#initialize
  def initialize(data = nil, raw: false, convert_null: false, **options)
    super(data, **options)
    @raw, @convert_null = raw, convert_null
    load_data(data)
  end

  # Normalises +raw_data+ and stores the result in +@data+.
  # @param raw_data text to load, or nil to reset the data to nil
  def load_data(raw_data)
    return @data = nil unless raw_data

    text = convert_newlines(raw_data).force_encoding(Encoding::UTF_8)
    text = text.gsub(%r{#{NULL}}, REPLACEMENT) if should_convert_null?(text)
    @data = if require_raw_data?
              text
            else
              unescape(text, in_attribute: false)
            end
  end

  private

  alias_method :convert_null?, :convert_null
  alias_method :require_raw_data?, :raw

  # NUL replacement applies when requested through +convert_null+ or when the
  # token is exactly a CommentToken, and only if +data+ contains a NUL.
  def should_convert_null?(data)
    return data unless data
    return false unless convert_null? || self.class == CommentToken

    data.include?(NULL)
  end

  # Returns a copy of +s+ with every "\r\n" pair and lone "\r" turned into
  # "\n".
  def convert_newlines(s)
    s.gsub(/(\r\n|\r)/, ?\n)
  end
end
57
+
58
# Concrete token types. Error, start-tag, end-tag and self-closing-tag tokens
# derive from BaseToken; text, comment and doctype tokens derive from
# EscapedToken.
class ErrorToken < BaseToken; end
class TextToken < EscapedToken; end
class StartTagToken < BaseToken; end
class EndTagToken < BaseToken; end
class SelfClosingTagToken < BaseToken; end
class CommentToken < EscapedToken; end
class DoctypeToken < EscapedToken; end
65
+ end
66
+ end