gammo 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.travis.yml +6 -0
  4. data/Gemfile +9 -0
  5. data/Gemfile.lock +27 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +177 -0
  8. data/Rakefile +25 -0
  9. data/gammo.gemspec +23 -0
  10. data/lib/gammo.rb +15 -0
  11. data/lib/gammo/attribute.rb +17 -0
  12. data/lib/gammo/fragment_parser.rb +65 -0
  13. data/lib/gammo/node.rb +157 -0
  14. data/lib/gammo/parser.rb +524 -0
  15. data/lib/gammo/parser/constants.rb +94 -0
  16. data/lib/gammo/parser/foreign.rb +307 -0
  17. data/lib/gammo/parser/insertion_mode.rb +74 -0
  18. data/lib/gammo/parser/insertion_mode/after_after_body.rb +36 -0
  19. data/lib/gammo/parser/insertion_mode/after_after_frameset.rb +32 -0
  20. data/lib/gammo/parser/insertion_mode/after_body.rb +46 -0
  21. data/lib/gammo/parser/insertion_mode/after_frameset.rb +39 -0
  22. data/lib/gammo/parser/insertion_mode/after_head.rb +70 -0
  23. data/lib/gammo/parser/insertion_mode/before_head.rb +49 -0
  24. data/lib/gammo/parser/insertion_mode/before_html.rb +45 -0
  25. data/lib/gammo/parser/insertion_mode/in_body.rb +463 -0
  26. data/lib/gammo/parser/insertion_mode/in_caption.rb +47 -0
  27. data/lib/gammo/parser/insertion_mode/in_cell.rb +46 -0
  28. data/lib/gammo/parser/insertion_mode/in_column_group.rb +66 -0
  29. data/lib/gammo/parser/insertion_mode/in_frameset.rb +48 -0
  30. data/lib/gammo/parser/insertion_mode/in_head.rb +98 -0
  31. data/lib/gammo/parser/insertion_mode/in_head_noscript.rb +52 -0
  32. data/lib/gammo/parser/insertion_mode/in_row.rb +53 -0
  33. data/lib/gammo/parser/insertion_mode/in_select.rb +77 -0
  34. data/lib/gammo/parser/insertion_mode/in_select_in_table.rb +46 -0
  35. data/lib/gammo/parser/insertion_mode/in_table.rb +114 -0
  36. data/lib/gammo/parser/insertion_mode/in_table_body.rb +55 -0
  37. data/lib/gammo/parser/insertion_mode/in_template.rb +80 -0
  38. data/lib/gammo/parser/insertion_mode/initial.rb +152 -0
  39. data/lib/gammo/parser/insertion_mode/text.rb +32 -0
  40. data/lib/gammo/parser/insertion_mode_stack.rb +8 -0
  41. data/lib/gammo/parser/node_stack.rb +24 -0
  42. data/lib/gammo/tags.rb +9 -0
  43. data/lib/gammo/tags/table.rb +744 -0
  44. data/lib/gammo/tokenizer.rb +373 -0
  45. data/lib/gammo/tokenizer/debug.rb +34 -0
  46. data/lib/gammo/tokenizer/entity.rb +2240 -0
  47. data/lib/gammo/tokenizer/escape.rb +174 -0
  48. data/lib/gammo/tokenizer/script_scanner.rb +229 -0
  49. data/lib/gammo/tokenizer/tokens.rb +66 -0
  50. data/lib/gammo/version.rb +3 -0
  51. data/misc/html.yaml +384 -0
  52. data/misc/table.erubi +14 -0
  53. metadata +97 -0
@@ -0,0 +1,174 @@
1
+ require 'gammo/tokenizer/entity'
2
+
3
module Gammo
  class Tokenizer
    # HTML text escaping and character-reference unescaping helpers.
    #
    # Unescaping rewrites the given string in place, byte by byte: +src+ is the
    # read offset and +dst+ the write offset, with +dst <= src+ at all times.
    # All offsets handed between the helpers below are byte offsets.
    module Escape
      # Longest prefix (in characters) tried when an entity name is not
      # terminated by a semicolon.
      LONGEST_ENTITY_WITHOUT_SEMICOLON = 6

      # Byte value of "&", the character that starts a character reference.
      AMPERSAND_BYTE = 0x26

      # Replacement text for each character handled by {#escape}.
      ESCAPE_REPLACEMENT_TABLE = {
        '&'  => '&amp;',
        "'"  => '&#39;',
        '<'  => '&lt;',
        '>'  => '&gt;',
        '"'  => '&#34;',
        "\r" => '&#13;',
      }.freeze

      # Characters substituted for numeric references in the range 0x80..0x9F,
      # indexed by (code point - 0x80).
      REPLACEMENT_TABLE = [
        "\u{20AC}", "\u{0081}", "\u{201A}", "\u{0192}",
        "\u{201E}", "\u{2026}", "\u{2020}", "\u{2021}",
        "\u{02C6}", "\u{2030}", "\u{0160}", "\u{2039}",
        "\u{0152}", "\u{008D}", "\u{017D}", "\u{008F}",
        "\u{0090}", "\u{2018}", "\u{2019}", "\u{201C}",
        "\u{201D}", "\u{2022}", "\u{2013}", "\u{2014}",
        "\u{02DC}", "\u{2122}", "\u{0161}", "\u{203A}",
        "\u{0153}", "\u{009D}", "\u{017E}", "\u{0178}",
      ].freeze

      # Escapes given string according to {ESCAPE_REPLACEMENT_TABLE}.
      #
      # A mutable string is modified in place and returned; a frozen string is
      # left untouched and an escaped copy is returned. The string is returned
      # even when it contains nothing to escape.
      # @param [String] s
      # @return [String]
      def escape(s)
        s = s.dup if s.frozen?
        # gsub! answers nil when nothing matched, so return the string itself.
        s.gsub!(/[&'<>"\r]/, ESCAPE_REPLACEMENT_TABLE)
        s
      end

      # Unescapes given data.
      #
      # When the data contains no "&" it is returned as is. Otherwise a mutable
      # string is rewritten in place (a frozen one is copied first) and the
      # unescaped leading part is returned as a new string.
      # @param [String] data
      # @return [String, nil]
      def unescape(data, **options)
        return unless data
        first = data.each_byte.find_index(AMPERSAND_BYTE)
        return data unless first
        data = data.dup if data.frozen?
        # The byte size never changes: every write goes through setbyte.
        total = data.bytesize
        dst, src = unescape_entity(data, first, first, **options)
        while src < total
          byte = data.getbyte(src)
          if byte == AMPERSAND_BYTE
            dst, src = unescape_entity(data, dst, src, **options)
          else
            data.setbyte(dst, byte)
            dst += 1
            src += 1
          end
        end
        data.byteslice(0, dst)
      end

      private

      # Decodes the character reference starting at byte offset +src+ and
      # writes the result at byte offset +dst+.
      # @return [Array<Integer>] the new [dst, src] byte offsets
      def unescape_entity(data, dst, src, in_attribute: false)
        s = data.byteslice(src..-1)
        # A lone trailing "&" is copied through unchanged.
        return swap(data, dst, src) if s.length <= 1
        # Index 1 skips the leading "&".
        return unescape_sharp_entity(data, s, dst, src, 1) if s[1] == ?#
        i = consume_entity_chars(s, 1)
        name = s.byteslice(1, i - 1)
        unless name.empty? || (in_attribute && like_query_params?(name, s, i))
          key = name.to_sym
          entities = Entity::CODEPOINT[key]
          entities = entities ? [entities] : Entity::TWO_CODEPOINTS[key]
          return replace_entity(entities, data, dst, src, i) if entities
          unless in_attribute
            # Fall back to the longest known prefix of the name.
            max = [name.length - 1, LONGEST_ENTITY_WITHOUT_SEMICOLON].min
            max.downto(1) do |n|
              entity = Entity::CODEPOINT[name.byteslice(0, n).to_sym]
              return replace_entity([entity], data, dst, src, n + 1) if entity
            end
          end
        end
        # Not a known entity: keep the consumed characters verbatim.
        copy_bytes(data, dst, src, i)
      end

      # Decodes a numeric reference ("&#NNN;" or "&#xHHH;"). +s+ is the data
      # from the "&" onwards and +i+ the index of the "#".
      # @return [Array<Integer>] the new [dst, src] byte offsets
      def unescape_sharp_entity(data, s, dst, src, i)
        # Anything shorter than "&#" plus two more characters is left as text.
        return swap(data, dst, src) if s.length <= 3
        i += 1
        hex = s[i] == ?x || s[i] == ?X
        i += 1 if hex
        radix = hex ? 16 : 10
        x = 0
        digits = 0
        while i < s.length
          ch = s[i]
          i += 1
          value = digit_value(ch, hex)
          if value
            # Stop accumulating once out of Unicode range; the result is
            # U+FFFD either way and this keeps x from growing without bound.
            x = radix * x + value if x <= 0x10FFFF
            digits += 1
            next
          end
          i -= 1 if ch != ?;
          break
        end
        # No digits at all (e.g. "&#x;"), or too little consumed: plain text.
        return swap(data, dst, src) if digits.zero? || i <= 3
        if x.between?(0x80, 0x9F)
          x = REPLACEMENT_TABLE[x - 0x80].ord
        elsif x.zero? || x.between?(0xD800, 0xDFFF) || x > 0x10FFFF
          x = 0xFFFD
        end
        replace_entity([x.chr(Encoding::UTF_8)], data, dst, src, i)
      end

      # Copies the single byte at +src+ to +dst+.
      # @return [Array<Integer>] the new [dst, src] byte offsets
      def swap(data, dst, src)
        copy_bytes(data, dst, src, 1)
      end

      # Copies +count+ bytes from +src+ to +dst+ front to back, which is safe
      # for overlapping ranges because +dst+ never exceeds +src+.
      # @return [Array<Integer>] the new [dst, src] byte offsets
      def copy_bytes(data, dst, src, count)
        count.times { |k| data.setbyte(dst + k, data.getbyte(src + k)) }
        [dst + count, src + count]
      end

      # Advances +i+ past ASCII letters and digits in +s+, plus one trailing
      # ";" if present.
      # @return [Integer] index just past the consumed characters
      def consume_entity_chars(s, i)
        i += 1 while i < s.length && alphanumeric?(s[i])
        i += 1 if i < s.length && s[i] == ?;
        i
      end

      # True when an unterminated name is directly followed by "=", as in the
      # query string "?a=1&lt=2".
      def like_query_params?(name, s, i)
        !name.end_with?(';') && s.length > i && s[i] == ?=
      end

      # Writes the bytes of each string in +entities+ at +dst+ and reports
      # +i+ source bytes as consumed.
      # @return [Array<Integer>] the new [dst, src] byte offsets
      def replace_entity(entities, t, dst, src, i)
        cursor = entities.inject(dst) do |pos, ch|
          ch.each_byte.with_index { |byte, j| t.setbyte(pos + j, byte) }
          pos + ch.bytesize
        end
        [cursor, src + i]
      end

      # Numeric value of a digit character, or nil when +ch+ is not a digit in
      # the requested base.
      def digit_value(ch, hex)
        if ch.between?('0', '9')
          ch.ord - 48
        elsif hex && ch.between?('a', 'f')
          ch.ord - 87
        elsif hex && ch.between?('A', 'F')
          ch.ord - 55
        end
      end

      # True for a single ASCII letter or digit.
      def alphanumeric?(ch)
        ch.between?('a', 'z') || ch.between?('A', 'Z') || ch.between?('0', '9')
      end
    end
  end
end
@@ -0,0 +1,229 @@
1
+ require 'gammo/tokenizer/debug'
2
+
3
module Gammo
  class Tokenizer
    # Scans the text content of a raw-text element (such as script) from a
    # StringScanner, one byte at a time, and collects it in {#buffer}.
    #
    # Each +scan_script_data*+ method is one state of the scanner. A state
    # consumes input and returns the name of the next state, or nil when
    # scanning is finished (end of input, or the matching end tag was found
    # and the scanner was rewound to its "<"). {#scan} drives the states in a
    # loop, so the call stack stays flat regardless of input size.
    class ScriptScanner
      include Debug

      # Bytes that may directly follow a tag name in "</name" or "<script".
      TAG_NAME_TERMINATORS = [' ', "\n", "\r", "\t", "\f", '/', '>'].freeze

      attr_reader :scanner, :buffer, :raw_tag

      alias_method :debug?, :debug

      # @param [StringScanner] scanner positioned at the start of the content
      # @param [String] raw_tag name of the element whose end tag stops the scan
      # @param [Boolean] debug
      def initialize(scanner, raw_tag:, debug: false)
        @scanner = scanner
        @buffer = ''
        @raw_tag = raw_tag
        @debug = debug
      end

      # Runs the state machine until a state reports completion.
      # @return [String] the content scanned so far
      def scan
        state = :scan_script_data
        state = __send__(state) while state
        buffer
      end

      private

      def scan_script_data
        consume do |byte|
          byte == '<' ? :scan_script_data_less_than_sign : :scan_script_data
        end
      end

      def scan_script_data_less_than_sign
        consume do |byte|
          case byte
          when '/' then :scan_script_data_end_tag_open
          when '!' then :scan_script_data_escape_start
          else
            revert_byte
            :scan_script_data
          end
        end
      end

      def scan_script_data_end_tag_open
        return if scan_raw_end_tag? || scanner.eos?
        :scan_script_data
      end

      def scan_script_data_escape_start
        consume do |byte|
          next :scan_script_data_escape_start_dash if byte == '-'
          revert_byte
          :scan_script_data
        end
      end

      def scan_script_data_escape_start_dash
        consume do |byte|
          next :scan_script_data_escaped_dash_dash if byte == '-'
          revert_byte
          :scan_script_data
        end
      end

      def scan_script_data_escaped
        consume do |byte|
          case byte
          when '-' then :scan_script_data_escaped_dash
          when '<' then :scan_script_data_escaped_less_than_sign
          else :scan_script_data_escaped
          end
        end
      end

      def scan_script_data_escaped_dash
        consume do |byte|
          case byte
          when '-' then :scan_script_data_escaped_dash_dash
          when '<' then :scan_script_data_escaped_less_than_sign
          else :scan_script_data_escaped
          end
        end
      end

      def scan_script_data_escaped_dash_dash
        consume do |byte|
          case byte
          when '-' then :scan_script_data_escaped_dash_dash
          when '<' then :scan_script_data_escaped_less_than_sign
          when '>' then :scan_script_data
          else :scan_script_data_escaped
          end
        end
      end

      def scan_script_data_escaped_less_than_sign
        consume do |byte|
          if byte == '/'
            :scan_script_data_escaped_end_tag_open
          elsif ascii_letter?(byte)
            :scan_script_data_double_escape_start
          else
            revert_byte
            :scan_script_data
          end
        end
      end

      def scan_script_data_escaped_end_tag_open
        return if scan_raw_end_tag? || scanner.eos?
        :scan_script_data_escaped
      end

      # Checks for "script" (any case) followed by a tag-name terminator right
      # after a "<" seen in the escaped state.
      def scan_script_data_double_escape_start
        # Re-read the letter that led here as the first character of "script".
        revert_byte
        'script'.each_char do |expected|
          byte = scanner.get_byte
          buffer << byte
          return if scanner.eos?
          next if byte.casecmp(expected) == 0
          revert_byte
          return :scan_script_data_escaped
        end
        byte = scanner.get_byte
        buffer << byte
        return if scanner.eos?
        return :scan_script_data_double_escaped if TAG_NAME_TERMINATORS.include?(byte)
        revert_byte
        :scan_script_data_escaped
      end

      def scan_script_data_double_escaped
        consume do |byte|
          case byte
          when '-' then :scan_script_data_double_escaped_dash
          when '<' then :scan_script_data_double_escaped_less_than_sign
          else :scan_script_data_double_escaped
          end
        end
      end

      def scan_script_data_double_escaped_dash
        consume do |byte|
          case byte
          when '-' then :scan_script_data_double_escaped_dash_dash
          when '<' then :scan_script_data_double_escaped_less_than_sign
          else :scan_script_data_double_escaped
          end
        end
      end

      def scan_script_data_double_escaped_dash_dash
        consume do |byte|
          case byte
          when '-' then :scan_script_data_double_escaped_dash_dash
          when '<' then :scan_script_data_double_escaped_less_than_sign
          when '>' then :scan_script_data
          else :scan_script_data_double_escaped
          end
        end
      end

      def scan_script_data_double_escaped_less_than_sign
        consume do |byte|
          next :scan_script_data_double_escape_end if byte == '/'
          revert_byte
          :scan_script_data_double_escaped
        end
      end

      def scan_script_data_double_escape_end
        if scan_raw_end_tag?
          # scan_raw_end_tag? rewound the scanner to the "<" and dropped the
          # tag text from the buffer; inside a double-escaped section the end
          # tag is content, so take it back: "</" + name + terminator.
          # scanner.pos is a byte offset, hence byteslice.
          tag_size = raw_tag.bytesize + 3
          buffer << scanner.string.byteslice(scanner.pos, tag_size)
          scanner.pos += tag_size
          return :scan_script_data_escaped
        end
        return if scanner.eos?
        :scan_script_data_double_escaped
      end

      # Reads one byte, appends it to the buffer and yields it.
      # @return the block's value, or nil at end of input
      def consume
        byte = scanner.get_byte
        return unless byte
        buffer << byte
        yield byte
      end

      # Undoes the most recent single-byte read: drops the last buffered byte
      # and moves the scanner back to where that read started.
      def revert_byte
        @buffer = buffer.byteslice(0, buffer.bytesize - 1)
        scanner.unscan
      end

      # Tries to read the raw tag name (any case) plus a terminator; the
      # caller has already consumed "</".
      #
      # On success the scanner is rewound to the "<" and "</name" is removed
      # from the buffer. On failure the matching name bytes read so far stay
      # consumed and buffered, and the mismatching byte is left unread.
      # @return [Boolean]
      def scan_raw_end_tag?
        raw_tag.each_char do |expected|
          byte = scanner.get_byte
          return false unless byte
          # casecmp compares bytewise, so a stray non-ASCII byte is a mismatch.
          unless byte.casecmp(expected) == 0
            scanner.unscan
            return false
          end
          buffer << byte
        end
        terminator = scanner.get_byte
        return false unless terminator
        unless TAG_NAME_TERMINATORS.include?(terminator)
          scanner.unscan
          return false
        end
        # "</" + name + terminator were read; only "</" + name was buffered.
        consumed = raw_tag.bytesize + 3
        scanner.pos -= consumed
        @buffer = buffer.byteslice(0, buffer.bytesize - (consumed - 1))
        true
      end

      # True for a single ASCII letter; any other byte is simply not a letter.
      def ascii_letter?(byte)
        byte.between?('a', 'z') || byte.between?('A', 'Z')
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ require 'gammo/tokenizer/escape'
2
+
3
module Gammo
  class Tokenizer
    # Plain token: holds its data, attributes and tag exactly as given.
    class BaseToken
      attr_accessor :attributes, :data, :tag

      # @param [String] data
      # @param [Array] attributes
      # @param tag
      def initialize(data = '', attributes: [], tag: nil)
        @data, @attributes, @tag = data, attributes, tag
      end

      # Appends +s+ to the token data in place.
      def concat(s)
        data << s
      end
    end

    # Token whose data is normalized when loaded: line endings become "\n",
    # NUL characters are optionally replaced, and character references are
    # unescaped unless the token is marked raw.
    class EscapedToken < BaseToken
      include Escape

      NULL = ?\x00.freeze
      REPLACEMENT = "\ufffd".freeze

      attr_accessor :convert_null, :raw

      # @param [String, nil] data
      # @param [Boolean] raw keep character references as written
      # @param [Boolean] convert_null replace NUL characters with {REPLACEMENT}
      def initialize(data = nil, raw: false, convert_null: false, **options)
        super(data, **options)
        @raw = raw
        @convert_null = convert_null
        load_data(data)
      end

      # Normalizes +raw_data+ and stores the result as the token data.
      # A nil argument clears the data.
      def load_data(raw_data)
        if raw_data.nil? || raw_data == false
          @data = nil
          return
        end
        text = convert_newlines(raw_data).force_encoding(Encoding::UTF_8)
        text = text.gsub(%r{#{NULL}}, REPLACEMENT) if should_convert_null?(text)
        @data =
          if require_raw_data?
            text
          else
            unescape(text, in_attribute: false)
          end
      end

      private

      alias_method :convert_null?, :convert_null
      alias_method :require_raw_data?, :raw

      # NUL replacement applies when requested, and always for comment
      # tokens, provided the data actually contains a NUL.
      def should_convert_null?(data)
        wanted = convert_null? || self.class == CommentToken
        data && wanted && data.include?(NULL)
      end

      # Returns a copy of +s+ with "\r\n" and lone "\r" replaced by "\n".
      def convert_newlines(s)
        s.gsub(/(\r\n|\r)/, ?\n)
      end
    end

    ErrorToken = Class.new(BaseToken)
    TextToken = Class.new(EscapedToken)
    StartTagToken = Class.new(BaseToken)
    EndTagToken = Class.new(BaseToken)
    SelfClosingTagToken = Class.new(BaseToken)
    CommentToken = Class.new(EscapedToken)
    DoctypeToken = Class.new(EscapedToken)
  end
end