html_tokenizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,337 @@
1
+ require "minitest/autorun"
2
+ require "html_tokenizer"
3
+
4
# Unit tests for HtmlTokenizer::Tokenizer.
#
# Each test feeds one or more string chunks to a fresh tokenizer (see the
# private #tokenize helper) and compares the emitted [token_name, token_text]
# pairs with the expected sequence. Tests that pass several chunks exercise
# input that is split at arbitrary points, e.g. in the middle of a tag name.
class HtmlTokenizer::TokenizerTest < Minitest::Test
  def test_closing_tag_without_start_is_text
    assert_equal [
      [:text, ">"]
    ], tokenize(">")
    assert_equal [
      [:tag_start, "<"], [:tag_name, "foo"], [:tag_end, ">"], [:text, ">"]
    ], tokenize("<foo>>")
  end

  def test_tokenize_text
    result = tokenize("\n hello world\n ")
    assert_equal [[:text, "\n hello world\n "]], result
  end

  # A tag name split across two chunks is reported as two :tag_name tokens.
  def test_namespace_tag_name_multipart
    assert_equal [
      [:tag_start, "<"], [:tag_name, "foo:"], [:tag_name, "bar"]
    ], tokenize("<foo:", "bar")
  end

  def test_tokenize_doctype
    assert_equal [
      [:tag_start, "<"], [:tag_name, "!DOCTYPE"], [:whitespace, " "],
      [:attribute_name, "html"], [:tag_end, ">"]
    ], tokenize("<!DOCTYPE html>")
  end

  def test_tokenize_multiple_elements
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:tag_end, ">"],
      [:text, " bla "],
      [:tag_start, "<"], [:tag_name, "strong"], [:tag_end, ">"]
    ], tokenize("<div> bla <strong>")
  end

  def test_tokenize_complex_doctype
    text = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
    assert_equal [
      [:tag_start, "<"], [:tag_name, "!DOCTYPE"], [:whitespace, " "],
      [:attribute_name, "html"], [:whitespace, " "],
      [:attribute_name, "PUBLIC"], [:whitespace, " "],
      [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "-//W3C//DTD XHTML 1.0 Transitional//EN"], [:attribute_quoted_value_end, "\""],
      [:whitespace, " "],
      [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"], [:attribute_quoted_value_end, "\""],
      [:tag_end, ">"]
    ], tokenize(text)
  end

  def test_tokenize_html_comment
    result = tokenize("<!-- COMMENT -->")
    assert_equal [[:comment_start, "<!--"], [:text, " COMMENT "], [:comment_end, "-->"]], result
  end

  def test_tokenize_comment_with_newlines
    # The input is an explicit literal (not a heredoc) so that the leading
    # whitespace asserted below does not depend on source-file indentation.
    result = tokenize(" <!-- debug: <%== @unsafe %> -->\n")

    assert_equal [
      [:text, " "], [:comment_start, "<!--"],
      [:text, " debug: <%== @unsafe %> "],
      [:comment_end, "-->"], [:text, "\n"]
    ], result
  end

  def test_tokenize_cdata_section
    result = tokenize("<![CDATA[ bla bla <!&@#> foo ]]>")
    assert_equal [[:cdata_start, "<![CDATA["], [:text, " bla bla <!&@#> foo "], [:cdata_end, "]]>"]], result
  end

  # CDATA content split across chunks yields one :text token per chunk.
  def test_tokenizer_cdata_regression
    result = tokenize("<![CDATA[ foo ", " baz ]]>")
    assert_equal [[:cdata_start, "<![CDATA["],
      [:text, " foo "], [:text, " baz "], [:cdata_end, "]]>"]], result
  end

  # Comment content split across chunks yields one :text token per chunk.
  def test_tokenizer_comment_regression
    result = tokenize("<!-- foo ", " baz -->")
    assert_equal [[:comment_start, "<!--"],
      [:text, " foo "], [:text, " baz "], [:comment_end, "-->"]], result
  end

  def test_tokenizer_parse_tag_after_comment_regression
    result = tokenize("<!-- foo --> <li>")
    assert_equal [[:comment_start, "<!--"], [:text, " foo "], [:comment_end, "-->"],
      [:text, " "], [:tag_start, "<"], [:tag_name, "li"], [:tag_end, ">"]], result
  end

  def test_tokenize_basic_tag
    result = tokenize("<div>")
    assert_equal [[:tag_start, "<"], [:tag_name, "div"], [:tag_end, ">"]], result
  end

  def test_tokenize_namespaced_tag
    result = tokenize("<ns:foo>")
    assert_equal [[:tag_start, "<"], [:tag_name, "ns:foo"], [:tag_end, ">"]], result
  end

  # A "<" inside a tag name is kept as part of the name.
  def test_tokenize_tag_with_lt
    result = tokenize("<a<b>")
    assert_equal [[:tag_start, "<"], [:tag_name, "a<b"], [:tag_end, ">"]], result
  end

  def test_tokenize_tag_multipart_name
    result = tokenize("<d", "iv", ">")
    assert_equal [[:tag_start, "<"], [:tag_name, "d"], [:tag_name, "iv"], [:tag_end, ">"]], result
  end

  def test_tokenize_tag_name_ending_with_slash
    result = tokenize("<div/1>")
    assert_equal [[:tag_start, "<"], [:tag_name, "div"], [:solidus, "/"], [:attribute_name, "1"], [:tag_end, ">"]], result
  end

  def test_tokenize_empty_tag
    result = tokenize("<>")
    assert_equal [[:tag_start, "<"], [:tag_end, ">"]], result
  end

  def test_tokenize_tag_with_solidus
    result = tokenize("</>")
    assert_equal [[:tag_start, "<"], [:solidus, "/"], [:tag_end, ">"]], result
  end

  def test_tokenize_end_tag
    result = tokenize("</div>")
    assert_equal [[:tag_start, "<"], [:solidus, "/"], [:tag_name, "div"], [:tag_end, ">"]], result
  end

  def test_tokenize_tag_attribute_with_double_quote
    result = tokenize('<div foo="bar">')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "bar"], [:attribute_quoted_value_end, "\""],
      [:tag_end, ">"]
    ], result
  end

  # In an unquoted value, "/" and "=" do not end the value.
  def test_tokenize_unquoted_attributes_separated_with_solidus
    result = tokenize('<div foo=1/bar=2>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_unquoted_value, "1/bar=2"],
      [:tag_end, ">"]
    ], result
  end

  def test_tokenize_quoted_attributes_separated_with_solidus
    result = tokenize('<div foo="1"/bar="2">')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "1"], [:attribute_quoted_value_end, "\""],
      [:solidus, "/"],
      [:attribute_name, "bar"], [:equal, "="], [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "2"], [:attribute_quoted_value_end, "\""],
      [:tag_end, ">"]
    ], result
  end

  def test_tokenize_tag_attribute_without_space
    result = tokenize('<div foo="bar"baz>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "bar"], [:attribute_quoted_value_end, "\""],
      [:attribute_name, "baz"],
      [:tag_end, ">"]
    ], result
  end

  def test_tokenize_multipart_unquoted_attribute
    result = tokenize('<div foo=', 'bar', 'baz>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_unquoted_value, "bar"],
      [:attribute_unquoted_value, "baz"], [:tag_end, ">"]
    ], result
  end

  def test_tokenize_quoted_attribute_separately
    result = tokenize('<div foo=', "'bar'", '>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_quoted_value_start, "'"], [:attribute_quoted_value, "bar"], [:attribute_quoted_value_end, "'"],
      [:tag_end, ">"]
    ], result
  end

  def test_tokenize_quoted_attribute_in_multiple_parts
    result = tokenize('<div foo=', "'bar", "baz'", '>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_quoted_value_start, "'"], [:attribute_quoted_value, "bar"], [:attribute_quoted_value, "baz"], [:attribute_quoted_value_end, "'"],
      [:tag_end, ">"]
    ], result
  end

  def test_tokenize_tag_attribute_with_single_quote
    result = tokenize("<div foo='bar'>")
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_quoted_value_start, "'"], [:attribute_quoted_value, "bar"], [:attribute_quoted_value_end, "'"],
      [:tag_end, ">"]
    ], result
  end

  def test_tokenize_tag_attribute_with_no_quotes
    result = tokenize("<div foo=bla bar=blo>")
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_unquoted_value, "bla"], [:whitespace, " "],
      [:attribute_name, "bar"], [:equal, "="], [:attribute_unquoted_value, "blo"],
      [:tag_end, ">"]
    ], result
  end

  def test_tokenize_double_equals
    result = tokenize("<div foo=blabar=blo>")
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_unquoted_value, "blabar=blo"],
      [:tag_end, ">"]
    ], result
  end

  def test_tokenize_closing_tag
    result = tokenize('<div foo="bar" />')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "foo"], [:equal, "="], [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "bar"], [:attribute_quoted_value_end, "\""], [:whitespace, " "],
      [:solidus, "/"], [:tag_end, ">"]
    ], result
  end

  # Markup inside <script> is reported as :text until the closing </script>.
  def test_tokenize_script_tag
    result = tokenize('<script>foo <b> bar</script>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "script"], [:tag_end, ">"],
      [:text, "foo "], [:text, "<b"], [:text, "> bar"],
      [:tag_start, "<"], [:solidus, "/"], [:tag_name, "script"], [:tag_end, ">"]
    ], result
  end

  def test_tokenize_textarea_tag
    result = tokenize('<textarea>hello</textarea>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "textarea"], [:tag_end, ">"],
      [:text, "hello"],
      [:tag_start, "<"], [:solidus, "/"], [:tag_name, "textarea"], [:tag_end, ">"]
    ], result
  end

  # Markup inside <style> is reported as :text until the closing </style>.
  def test_tokenize_style_tag
    result = tokenize('<style></div></style>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "style"], [:tag_end, ">"],
      [:text, "</div"], [:text, ">"],
      [:tag_start, "<"], [:solidus, "/"], [:tag_name, "style"], [:tag_end, ">"]
    ], result
  end

  def test_tokenize_script_containing_html
    result = tokenize('<script type="text/html">foo <b> bar</script>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "script"], [:whitespace, " "],
      [:attribute_name, "type"], [:equal, "="], [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "text/html"], [:attribute_quoted_value_end, "\""],
      [:tag_end, ">"],
      [:text, "foo "], [:text, "<b"], [:text, "> bar"],
      [:tag_start, "<"], [:solidus, "/"], [:tag_name, "script"], [:tag_end, ">"]
    ], result
  end

  def test_end_of_tag_on_newline
    # The chunks are explicit literals so that the whitespace asserted below
    # (" " before the tag and "\n " before ">") does not depend on how this
    # file is indented. The empty middle chunk is part of the scenario.
    data = [
      " <div define=\"{credential_96_credential1: new Shopify.ProviderCredentials()}\"\n ",
      "",
      ">"
    ]
    result = tokenize(*data)
    assert_equal [
      [:text, " "],
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "], [:attribute_name, "define"], [:equal, "="], [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "{credential_96_credential1: new Shopify.ProviderCredentials()}"], [:attribute_quoted_value_end, "\""],
      [:whitespace, "\n "], [:tag_end, ">"]
    ], result
  end

  def test_tokenize_multi_part_attribute_name
    result = tokenize('<div data-', 'shipping', '-type>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "div"], [:whitespace, " "],
      [:attribute_name, "data-"], [:attribute_name, "shipping"], [:attribute_name, "-type"],
      [:tag_end, ">"]
    ], result
  end

  def test_tokenize_attribute_name_with_space_before_equal
    result = tokenize('<a href ="http://www.cra-arc.gc.ca/tx/bsnss/tpcs/gst-tps/menu-eng.html">GST/HST</a>')
    assert_equal [
      [:tag_start, "<"], [:tag_name, "a"], [:whitespace, " "],
      [:attribute_name, "href"], [:whitespace, " "], [:equal, "="],
      [:attribute_quoted_value_start, "\""], [:attribute_quoted_value, "http://www.cra-arc.gc.ca/tx/bsnss/tpcs/gst-tps/menu-eng.html"], [:attribute_quoted_value_end, "\""],
      [:tag_end, ">"], [:text, "GST/HST"],
      [:tag_start, "<"], [:solidus, "/"], [:tag_name, "a"], [:tag_end, ">"]
    ], result
  end

  # An exception raised inside the token block must reach the caller. The same
  # tokenizer instance is reused on every iteration, so the error has to
  # propagate on each successive call, not only the first one.
  def test_raise_in_block
    tokenizer = HtmlTokenizer::Tokenizer.new
    10.times do
      e = assert_raises(RuntimeError) do
        tokenizer.tokenize("<>") do |_name, _start, _stop|
          raise "something went wrong"
        end
      end
      assert_equal "something went wrong", e.message
    end
  end

  def test_tokenize_end_of_script_regression
    result = tokenize("<script><</script>")
    assert_equal [
      [:tag_start, "<"], [:tag_name, "script"], [:tag_end, ">"],
      [:text, "<"],
      [:tag_start, "<"], [:solidus, "/"], [:tag_name, "script"], [:tag_end, ">"]
    ], result
  end

  private

  # Runs every chunk in +parts+, in order, through one new tokenizer and
  # collects what it yields.
  #
  # @param parts [Array<String>] input chunks, fed to the tokenizer one by one
  # @return [Array<Array(Symbol, String)>] [token_name, token_text] pairs, where
  #   token_text is the slice of the current chunk between the yielded offsets
  def tokenize(*parts)
    tokens = []
    tokenizer = HtmlTokenizer::Tokenizer.new
    parts.each do |part|
      tokenizer.tokenize(part) do |name, start, stop|
        # +stop+ is treated as exclusive. The three-dot range keeps an empty
        # token at offset 0 empty; `0..(0 - 1)` would be `0..-1`, i.e. the
        # whole chunk.
        tokens << [name, part[start...stop]]
      end
    end
    tokens
  end
end
metadata ADDED
@@ -0,0 +1,109 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: html_tokenizer
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Francois Chagnon
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2017-10-26 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake-compiler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: minitest
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description:
56
+ email:
57
+ executables:
58
+ - html_tokenizer
59
+ extensions:
60
+ - ext/html_tokenizer_ext/extconf.rb
61
+ extra_rdoc_files: []
62
+ files:
63
+ - .autotest
64
+ - .gitignore
65
+ - Gemfile
66
+ - Gemfile.lock
67
+ - LICENSE
68
+ - Manifest.txt
69
+ - README.md
70
+ - Rakefile
71
+ - bin/html_tokenizer
72
+ - ext/html_tokenizer_ext/extconf.rb
73
+ - ext/html_tokenizer_ext/html_tokenizer.c
74
+ - ext/html_tokenizer_ext/html_tokenizer.h
75
+ - ext/html_tokenizer_ext/parser.c
76
+ - ext/html_tokenizer_ext/parser.h
77
+ - ext/html_tokenizer_ext/tokenizer.c
78
+ - ext/html_tokenizer_ext/tokenizer.h
79
+ - html_tokenizer.gemspec
80
+ - lib/html_tokenizer.rb
81
+ - test/unit/parser_test.rb
82
+ - test/unit/tokenizer_test.rb
83
+ homepage:
84
+ licenses: []
85
+ metadata: {}
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ - ext
91
+ required_ruby_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - '>='
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - '>='
99
+ - !ruby/object:Gem::Version
100
+ version: '0'
101
+ requirements: []
102
+ rubyforge_project:
103
+ rubygems_version: 2.0.14.1
104
+ signing_key:
105
+ specification_version: 4
106
+ summary: HTML Tokenizer
107
+ test_files:
108
+ - test/unit/parser_test.rb
109
+ - test/unit/tokenizer_test.rb