html_tokenizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,337 @@
1
+ require "minitest/autorun"
2
+ require "html_tokenizer"
3
+
4
+ class HtmlTokenizer::TokenizerTest < Minitest::Test
5
# A ">" that does not close an open tag is reported as plain text, both on
# its own and directly after a completed tag.
def test_closing_tag_without_start_is_text
  lone_gt = [[:text, ">"]]
  assert_equal lone_gt, tokenize(">")

  after_tag = [
    [:tag_start, "<"],
    [:tag_name, "foo"],
    [:tag_end, ">"],
    [:text, ">"],
  ]
  assert_equal after_tag, tokenize("<foo>>")
end
13
+
14
# Text without markup, including its surrounding whitespace, comes back as a
# single :text token.
def test_tokenize_text
  expected = [[:text, "\n hello world\n "]]
  assert_equal expected, tokenize("\n hello world\n ")
end
18
+
19
# A namespaced tag name split across two chunks yields one :tag_name token
# per chunk.
def test_namespace_tag_name_multipart
  expected = [
    [:tag_start, "<"],
    [:tag_name, "foo:"],
    [:tag_name, "bar"],
  ]
  assert_equal expected, tokenize("<foo:", "bar")
end
24
+
25
# A simple doctype is tokenized like a tag: "!DOCTYPE" is the tag name and
# "html" is reported as an attribute name.
def test_tokenize_doctype
  expected = [
    [:tag_start, "<"],
    [:tag_name, "!DOCTYPE"],
    [:whitespace, " "],
    [:attribute_name, "html"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<!DOCTYPE html>")
end
31
+
32
# Two tags separated by text produce tag tokens, a :text token, then tag
# tokens again.
def test_tokenize_multiple_elements
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:tag_end, ">"],
    [:text, " bla "],
    [:tag_start, "<"],
    [:tag_name, "strong"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<div> bla <strong>")
end
39
+
40
# A doctype with PUBLIC and two quoted identifiers: the bare words are
# reported as attribute names and each quoted identifier as a
# start / value / end triple.
def test_tokenize_complex_doctype
  doctype = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
  expected = [
    [:tag_start, "<"],
    [:tag_name, "!DOCTYPE"],
    [:whitespace, " "],
    [:attribute_name, "html"],
    [:whitespace, " "],
    [:attribute_name, "PUBLIC"],
    [:whitespace, " "],
    [:attribute_quoted_value_start, "\""],
    [:attribute_quoted_value, "-//W3C//DTD XHTML 1.0 Transitional//EN"],
    [:attribute_quoted_value_end, "\""],
    [:whitespace, " "],
    [:attribute_quoted_value_start, "\""],
    [:attribute_quoted_value, "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"],
    [:attribute_quoted_value_end, "\""],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize(doctype)
end
52
+
53
# An HTML comment is split into :comment_start, a :text body and :comment_end.
def test_tokenize_html_comment
  expected = [
    [:comment_start, "<!--"],
    [:text, " COMMENT "],
    [:comment_end, "-->"],
  ]
  assert_equal expected, tokenize("<!-- COMMENT -->")
end
57
+
58
# Tokenizes a heredoc holding an HTML comment whose body contains ERB-like
# markup, and expects that body to come back as one :text token, with the text
# before and after the comment reported as separate :text tokens.
# NOTE(review): this listing appears to have collapsed the heredoc's leading
# whitespace; confirm the indentation against the real file before editing.
+ def test_tokenize_comment_with_newlines
59
+ result = tokenize <<-EOF
60
+ <!-- debug: <%== @unsafe %> -->
61
+ EOF
62
+
63
+ assert_equal [
64
+ [:text, " "], [:comment_start, "<!--"],
65
+ [:text, " debug: <%== @unsafe %> "],
66
+ [:comment_end, "-->"], [:text, "\n"]
67
+ ], result
68
+ end
69
+
70
# Everything between the CDATA markers, markup-looking characters included,
# is reported as a single :text token.
def test_tokenize_cdata_section
  expected = [
    [:cdata_start, "<![CDATA["],
    [:text, " bla bla <!&@#> foo "],
    [:cdata_end, "]]>"],
  ]
  assert_equal expected, tokenize("<![CDATA[ bla bla <!&@#> foo ]]>")
end
74
+
75
# Regression: a CDATA section split across two chunks yields one :text token
# per chunk, followed by :cdata_end.
def test_tokenizer_cdata_regression
  expected = [
    [:cdata_start, "<![CDATA["],
    [:text, " foo "],
    [:text, " baz "],
    [:cdata_end, "]]>"],
  ]
  assert_equal expected, tokenize("<![CDATA[ foo ", " baz ]]>")
end
80
+
81
# Regression: a comment split across two chunks yields one :text token per
# chunk, followed by :comment_end.
def test_tokenizer_comment_regression
  expected = [
    [:comment_start, "<!--"],
    [:text, " foo "],
    [:text, " baz "],
    [:comment_end, "-->"],
  ]
  assert_equal expected, tokenize("<!-- foo ", " baz -->")
end
86
+
87
# Regression: a tag that follows a closed comment is tokenized as a tag again.
def test_tokenizer_parse_tag_after_comment_regression
  expected = [
    [:comment_start, "<!--"],
    [:text, " foo "],
    [:comment_end, "-->"],
    [:text, " "],
    [:tag_start, "<"],
    [:tag_name, "li"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<!-- foo --> <li>")
end
92
+
93
# The simplest opening tag: :tag_start, :tag_name, :tag_end.
def test_tokenize_basic_tag
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<div>")
end
97
+
98
# A namespaced name given in one chunk stays a single :tag_name token.
def test_tokenize_namespaced_tag
  expected = [
    [:tag_start, "<"],
    [:tag_name, "ns:foo"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<ns:foo>")
end
102
+
103
# A "<" inside a tag name is kept as part of the name.
def test_tokenize_tag_with_lt
  expected = [
    [:tag_start, "<"],
    [:tag_name, "a<b"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<a<b>")
end
107
+
108
# A tag name split across chunks yields one :tag_name token per chunk.
def test_tokenize_tag_multipart_name
  expected = [
    [:tag_start, "<"],
    [:tag_name, "d"],
    [:tag_name, "iv"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<d", "iv", ">")
end
112
+
113
# A "/" ends the tag name; what follows it is reported as an attribute name.
def test_tokenize_tag_name_ending_with_slash
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:solidus, "/"],
    [:attribute_name, "1"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<div/1>")
end
117
+
118
# "<>" produces a tag start and a tag end with no name in between.
def test_tokenize_empty_tag
  expected = [
    [:tag_start, "<"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<>")
end
122
+
123
# "</>" produces a tag start, a solidus and a tag end, with no name.
def test_tokenize_tag_with_solidus
  expected = [
    [:tag_start, "<"],
    [:solidus, "/"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("</>")
end
127
+
128
# A closing tag reports the solidus between :tag_start and :tag_name.
def test_tokenize_end_tag
  expected = [
    [:tag_start, "<"],
    [:solidus, "/"],
    [:tag_name, "div"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("</div>")
end
132
+
133
# A double-quoted attribute value is reported as start quote, value and end
# quote tokens.
def test_tokenize_tag_attribute_with_double_quote
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_quoted_value_start, "\""],
    [:attribute_quoted_value, "bar"],
    [:attribute_quoted_value_end, "\""],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<div foo="bar">')
end
141
+
142
# In an unquoted value, "/" and "=" do not end the value: everything up to
# the closing ">" is one :attribute_unquoted_value token.
def test_tokenize_unquoted_attributes_separated_with_solidus
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_unquoted_value, "1/bar=2"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<div foo=1/bar=2>')
end
150
+
151
# After a quoted value, a "/" is reported as :solidus and the next attribute
# is tokenized normally.
def test_tokenize_quoted_attributes_separated_with_solidus
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_quoted_value_start, "\""],
    [:attribute_quoted_value, "1"],
    [:attribute_quoted_value_end, "\""],
    [:solidus, "/"],
    [:attribute_name, "bar"],
    [:equal, "="],
    [:attribute_quoted_value_start, "\""],
    [:attribute_quoted_value, "2"],
    [:attribute_quoted_value_end, "\""],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<div foo="1"/bar="2">')
end
161
+
162
# An attribute name that directly follows a closing quote, with no whitespace
# in between, is still reported as a new :attribute_name.
def test_tokenize_tag_attribute_without_space
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_quoted_value_start, "\""],
    [:attribute_quoted_value, "bar"],
    [:attribute_quoted_value_end, "\""],
    [:attribute_name, "baz"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<div foo="bar"baz>')
end
171
+
172
# An unquoted value split across chunks yields one :attribute_unquoted_value
# token per chunk.
def test_tokenize_multipart_unquoted_attribute
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_unquoted_value, "bar"],
    [:attribute_unquoted_value, "baz"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<div foo=', 'bar', 'baz>')
end
180
+
181
# A single-quoted value delivered in its own chunk is tokenized as start
# quote, value and end quote.
def test_tokenize_quoted_attribute_separately
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_quoted_value_start, "'"],
    [:attribute_quoted_value, "bar"],
    [:attribute_quoted_value_end, "'"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<div foo=', "'bar'", '>')
end
189
+
190
# A quoted value split across chunks yields one :attribute_quoted_value token
# per chunk between the start and end quote tokens.
def test_tokenize_quoted_attribute_in_multiple_parts
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_quoted_value_start, "'"],
    [:attribute_quoted_value, "bar"],
    [:attribute_quoted_value, "baz"],
    [:attribute_quoted_value_end, "'"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<div foo=', "'bar", "baz'", '>')
end
198
+
199
# A single-quoted attribute value reports "'" as its start and end quote
# tokens.
def test_tokenize_tag_attribute_with_single_quote
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_quoted_value_start, "'"],
    [:attribute_quoted_value, "bar"],
    [:attribute_quoted_value_end, "'"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<div foo='bar'>")
end
207
+
208
# Whitespace ends an unquoted value, so two unquoted attributes are reported
# separately.
def test_tokenize_tag_attribute_with_no_quotes
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_unquoted_value, "bla"],
    [:whitespace, " "],
    [:attribute_name, "bar"],
    [:equal, "="],
    [:attribute_unquoted_value, "blo"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<div foo=bla bar=blo>")
end
217
+
218
# A second "=" inside an unquoted value stays part of that value.
def test_tokenize_double_equals
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_unquoted_value, "blabar=blo"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<div foo=blabar=blo>")
end
226
+
227
# A self-closing tag reports the whitespace and the solidus before :tag_end.
def test_tokenize_closing_tag
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "foo"],
    [:equal, "="],
    [:attribute_quoted_value_start, "\""],
    [:attribute_quoted_value, "bar"],
    [:attribute_quoted_value_end, "\""],
    [:whitespace, " "],
    [:solidus, "/"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<div foo="bar" />')
end
235
+
236
# Markup inside <script> is not tokenized as tags: it comes back as :text
# tokens until the closing </script>.
def test_tokenize_script_tag
  expected = [
    [:tag_start, "<"],
    [:tag_name, "script"],
    [:tag_end, ">"],
    [:text, "foo "],
    [:text, "<b"],
    [:text, "> bar"],
    [:tag_start, "<"],
    [:solidus, "/"],
    [:tag_name, "script"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<script>foo <b> bar</script>')
end
244
+
245
# The content of <textarea> is reported as :text, followed by the closing tag.
def test_tokenize_textarea_tag
  expected = [
    [:tag_start, "<"],
    [:tag_name, "textarea"],
    [:tag_end, ">"],
    [:text, "hello"],
    [:tag_start, "<"],
    [:solidus, "/"],
    [:tag_name, "textarea"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<textarea>hello</textarea>')
end
253
+
254
# A closing tag for another element inside <style> is reported as :text; only
# </style> ends the element.
def test_tokenize_style_tag
  expected = [
    [:tag_start, "<"],
    [:tag_name, "style"],
    [:tag_end, ">"],
    [:text, "</div"],
    [:text, ">"],
    [:tag_start, "<"],
    [:solidus, "/"],
    [:tag_name, "style"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<style></div></style>')
end
262
+
263
# A <script> with a type attribute still has its content reported as :text
# tokens.
def test_tokenize_script_containing_html
  expected = [
    [:tag_start, "<"],
    [:tag_name, "script"],
    [:whitespace, " "],
    [:attribute_name, "type"],
    [:equal, "="],
    [:attribute_quoted_value_start, "\""],
    [:attribute_quoted_value, "text/html"],
    [:attribute_quoted_value_end, "\""],
    [:tag_end, ">"],
    [:text, "foo "],
    [:text, "<b"],
    [:text, "> bar"],
    [:tag_start, "<"],
    [:solidus, "/"],
    [:tag_name, "script"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<script type="text/html">foo <b> bar</script>')
end
273
+
274
# Feeds three chunks (a tag left open at the end of a line, an empty string,
# and the closing ">") and expects the trailing newline to be reported as
# :whitespace inside the tag, followed by :tag_end.
# NOTE(review): the multi-line string literal below spans listing lines whose
# original whitespace appears to be collapsed; confirm against the real file.
+ def test_end_of_tag_on_newline
275
+ data = ["\
276
+ <div define=\"{credential_96_credential1: new Shopify.ProviderCredentials()}\"
277
+ ", "", ">"]
278
+ result = tokenize(*data)
279
+ assert_equal [
280
+ [:text, " "],
281
+ [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "], [:attribute_name, "define"], [:equal, "="], [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "{credential_96_credential1: new Shopify.ProviderCredentials()}"], [:attribute_quoted_value_end, "\""],
282
+ [:whitespace, "\n "], [:tag_end, ">"]
283
+ ], result
284
+ end
285
+
286
# An attribute name split across chunks yields one :attribute_name token per
# chunk.
def test_tokenize_multi_part_attribute_name
  expected = [
    [:tag_start, "<"],
    [:tag_name, "div"],
    [:whitespace, " "],
    [:attribute_name, "data-"],
    [:attribute_name, "shipping"],
    [:attribute_name, "-type"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<div data-', 'shipping', '-type>')
end
294
+
295
# Whitespace between an attribute name and "=" is reported as its own token,
# and the quoted value that follows is still attached to the attribute.
def test_tokenize_attribute_name_with_space_before_equal
  expected = [
    [:tag_start, "<"],
    [:tag_name, "a"],
    [:whitespace, " "],
    [:attribute_name, "href"],
    [:whitespace, " "],
    [:equal, "="],
    [:attribute_quoted_value_start, "\""],
    [:attribute_quoted_value, "http://www.cra-arc.gc.ca/tx/bsnss/tpcs/gst-tps/menu-eng.html"],
    [:attribute_quoted_value_end, "\""],
    [:tag_end, ">"],
    [:text, "GST/HST"],
    [:tag_start, "<"],
    [:solidus, "/"],
    [:tag_name, "a"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize('<a href ="http://www.cra-arc.gc.ca/tx/bsnss/tpcs/gst-tps/menu-eng.html">GST/HST</a>')
end
305
+
306
# An exception raised inside the token block propagates to the caller with
# its message intact, and the same tokenizer instance keeps doing so on
# repeated calls (ten in a row here).
def test_raise_in_block
  @tokenizer = HtmlTokenizer::Tokenizer.new
  10.times do
    error = assert_raises(RuntimeError) do
      @tokenizer.tokenize("<>") { |_token| raise RuntimeError, "something went wrong" }
    end
    assert_equal "something went wrong", error.message
  end
end
317
+
318
# Regression: a lone "<" right before </script> is reported as :text and the
# closing tag is still recognised.
def test_tokenize_end_of_script_regression
  expected = [
    [:tag_start, "<"],
    [:tag_name, "script"],
    [:tag_end, ">"],
    [:text, "<"],
    [:tag_start, "<"],
    [:solidus, "/"],
    [:tag_name, "script"],
    [:tag_end, ">"],
  ]
  assert_equal expected, tokenize("<script><</script>")
end
326
+
327
+ private
328
+
329
# Test helper: feeds each chunk in +parts+, in order, to one fresh tokenizer
# and collects the tokens it yields.
#
# The tokenizer yields (name, start, stop), where the offsets index into the
# chunk currently being tokenized. Each token is recorded as
# [name, substring of that chunk].
#
# @param parts [Array<String>] chunks handed to the same tokenizer instance
# @return [Array<Array>] [name, text] pairs in the order they were yielded
def tokenize(*parts)
  @tokenizer = HtmlTokenizer::Tokenizer.new
  parts.each_with_object([]) do |part, tokens|
    @tokenizer.tokenize(part) do |name, start, stop|
      # stop is exclusive, so slice with an exclusive range. The previous
      # start..(stop - 1) form became 0..-1 for a zero-length token at
      # offset 0, which selects the whole chunk instead of "".
      tokens << [name, part[start...stop]]
    end
  end
end
337
+ end
metadata ADDED
@@ -0,0 +1,109 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Francois Chagnon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-10-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake-compiler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description:
56
+ email:
57
+ executables:
58
+ - html_tokenizer
59
+ extensions:
60
+ - ext/html_tokenizer_ext/extconf.rb
61
+ extra_rdoc_files: []
62
+ files:
63
+ - .autotest
64
+ - .gitignore
65
+ - Gemfile
66
+ - Gemfile.lock
67
+ - LICENSE
68
+ - Manifest.txt
69
+ - README.md
70
+ - Rakefile
71
+ - bin/html_tokenizer
72
+ - ext/html_tokenizer_ext/extconf.rb
73
+ - ext/html_tokenizer_ext/html_tokenizer.c
74
+ - ext/html_tokenizer_ext/html_tokenizer.h
75
+ - ext/html_tokenizer_ext/parser.c
76
+ - ext/html_tokenizer_ext/parser.h
77
+ - ext/html_tokenizer_ext/tokenizer.c
78
+ - ext/html_tokenizer_ext/tokenizer.h
79
+ - html_tokenizer.gemspec
80
+ - lib/html_tokenizer.rb
81
+ - test/unit/parser_test.rb
82
+ - test/unit/tokenizer_test.rb
83
+ homepage:
84
+ licenses: []
85
+ metadata: {}
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ - ext
91
+ required_ruby_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - '>='
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - '>='
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ requirements: []
102
+ rubyforge_project:
103
+ rubygems_version: 2.0.14.1
104
+ signing_key:
105
+ specification_version: 4
106
+ summary: HTML Tokenizer
107
+ test_files:
108
+ - test/unit/parser_test.rb
109
+ - test/unit/tokenizer_test.rb