html_tokenizer 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,74 @@
1
+ #pragma once
2
+
3
+ enum tokenizer_context { // context values; struct tokenizer_t stores an array of these in its context member
4
+ TOKENIZER_NONE = 0, // explicitly zero; every following member takes the next consecutive value
5
+ TOKENIZER_HTML,
6
+ TOKENIZER_OPEN_TAG,
7
+ TOKENIZER_SOLIDUS_OR_TAG_NAME,
8
+ TOKENIZER_TAG_NAME,
9
+ TOKENIZER_CDATA,
10
+ TOKENIZER_RCDATA, // title, textarea
11
+ TOKENIZER_RAWTEXT, // style, xmp, iframe, noembed, noframes
12
+ TOKENIZER_SCRIPT_DATA, // script
13
+ TOKENIZER_PLAINTEXT, // plaintext
14
+ TOKENIZER_COMMENT,
15
+ TOKENIZER_ATTRIBUTE_NAME,
16
+ TOKENIZER_ATTRIBUTE_VALUE,
17
+ TOKENIZER_ATTRIBUTE_UNQUOTED,
18
+ TOKENIZER_ATTRIBUTE_QUOTED,
19
+ };
20
+
21
+ enum token_type { // token kinds; passed as the type argument of tokenizer_t.f_callback and accepted by token_type_to_symbol
22
+ TOKEN_NONE = 0, // explicitly zero; every following member takes the next consecutive value
23
+ TOKEN_TEXT,
24
+ TOKEN_WHITESPACE,
25
+ TOKEN_COMMENT_START,
26
+ TOKEN_COMMENT_END,
27
+ TOKEN_TAG_START,
28
+ TOKEN_TAG_NAME,
29
+ TOKEN_TAG_END,
30
+ TOKEN_ATTRIBUTE_NAME,
31
+ TOKEN_ATTRIBUTE_QUOTED_VALUE_START,
32
+ TOKEN_ATTRIBUTE_QUOTED_VALUE,
33
+ TOKEN_ATTRIBUTE_QUOTED_VALUE_END,
34
+ TOKEN_ATTRIBUTE_UNQUOTED_VALUE,
35
+ TOKEN_CDATA_START,
36
+ TOKEN_CDATA_END,
37
+ TOKEN_SOLIDUS,
38
+ TOKEN_EQUAL,
39
+ TOKEN_MALFORMED,
40
+ };
41
+
42
+ struct scan_t { // embedded by value in struct tokenizer_t as its scan member
43
+ char *string; // NOTE(review): presumably the text being scanned; ownership is not visible in this header - confirm
44
+ long unsigned int cursor; // NOTE(review): presumably the current offset into string - confirm
45
+ long unsigned int length; // NOTE(review): presumably the size of string - confirm
46
+ };
47
+
48
+ struct tokenizer_t // tokenizer state; a pointer to it is taken by tokenizer_init and tokenizer_scan_all
49
+ {
50
+ enum tokenizer_context context[1000]; // fixed capacity of 1000 entries; NOTE(review): presumably a stack indexed by current_context - confirm the implementation bounds-checks it
51
+ uint32_t current_context; // uint32_t needs <stdint.h>, which this header does not include itself; includers must provide it
52
+
53
+ void *callback_data; // NOTE(review): presumably forwarded as the data argument of f_callback - confirm
54
+ void (*f_callback)(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data); // callback taking the tokenizer, a token type, a length and an opaque data pointer
55
+
56
+ char attribute_value_start; // NOTE(review): presumably the quote character that opened the current attribute value - confirm
57
+ int found_attribute;
58
+
59
+ char *current_tag; // NOTE(review): ownership and lifetime of this pointer are not visible in this header - confirm
60
+
61
+ int is_closing_tag;
62
+ enum token_type last_token;
63
+
64
+ struct scan_t scan; // scan state stored by value, not by pointer
65
+ };
66
+
67
+
68
+ void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
69
+ void tokenizer_init(struct tokenizer_t *tk);
70
+ void tokenizer_scan_all(struct tokenizer_t *tk);
71
+ VALUE token_type_to_symbol(enum token_type type);
72
+
73
+ extern const rb_data_type_t ht_tokenizer_data_type;
74
+ #define Tokenizer_Get_Struct(obj, sval) TypedData_Get_Struct(obj, struct tokenizer_t, &ht_tokenizer_data_type, sval)
@@ -0,0 +1,19 @@
1
+ Gem::Specification.new do |spec|
2
+ spec.name = "html_tokenizer"
3
+ spec.version = "0.0.1"
4
+ spec.summary = "HTML Tokenizer"
5
+ spec.author = "Francois Chagnon"
6
+
7
+ spec.files = Dir.glob("ext/**/*.{c,h,rb}") +
8
+ Dir.glob("lib/**/*.rb")
9
+
10
+ spec.extensions = ['ext/html_tokenizer_ext/extconf.rb']
11
+ spec.files = `git ls-files -z`.split("\x0")
12
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
13
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
14
+ spec.require_paths = ["lib", "ext"]
15
+
16
+ spec.add_development_dependency 'rake', '~> 0'
17
+ spec.add_development_dependency 'rake-compiler', '~> 0'
18
+ spec.add_development_dependency 'minitest', '~> 0'
19
+ end
@@ -0,0 +1,12 @@
1
+ require 'html_tokenizer_ext'
2
+
3
+ module HtmlTokenizer
4
+ class ParserError < RuntimeError
5
+ attr_reader :line, :column
6
+ def initialize(message, line, column)
7
+ super(message)
8
+ @line = line
9
+ @column = column
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,575 @@
1
+ require "minitest/autorun"
2
+ require "html_tokenizer"
3
+
4
+ class HtmlTokenizer::ParserTest < Minitest::Test
5
+ def test_empty_context
6
+ parse
7
+ assert_equal :none, @parser.context
8
+ end
9
+
10
+ def test_open_tag
11
+ parse("<div")
12
+ assert_equal :tag_name, @parser.context
13
+ assert_equal "div", @parser.tag_name
14
+ assert_equal false, @parser.closing_tag?
15
+ end
16
+
17
+ def test_open_attribute_value
18
+ parse('<div "foo')
19
+ assert_equal :quoted_value, @parser.context
20
+ assert_equal 'foo', @parser.attribute_value
21
+ assert_equal '"', @parser.quote_character
22
+ parse('bar"')
23
+ assert_equal :space_after_attribute, @parser.context
24
+ assert_equal 'foobar', @parser.attribute_value
25
+ assert_equal true, @parser.attribute_quoted?
26
+ assert_equal '"', @parser.quote_character
27
+ end
28
+
29
+ def test_multi_part_namespace_tag
30
+ parse("<foo:")
31
+ assert_equal "foo:", @parser.tag_name
32
+ parse("bar")
33
+ assert_equal "foo:bar", @parser.tag_name
34
+ end
35
+
36
+ def test_solidus_after_tag_name
37
+ parse("<foo/")
38
+ assert_equal "foo", @parser.tag_name
39
+ assert_equal :tag_end, @parser.context
40
+ end
41
+
42
+ def test_whitespace_after_tag_name
43
+ parse("<foo ")
44
+ assert_equal "foo", @parser.tag_name
45
+ assert_equal :tag, @parser.context
46
+ end
47
+
48
+ def test_context_is_tag_name_just_after_solidus
49
+ parse("</")
50
+ assert_equal :tag_name, @parser.context
51
+ assert_equal true, @parser.closing_tag?
52
+ end
53
+
54
+ def test_close_tag
55
+ parse("<div", ">")
56
+ assert_equal :none, @parser.context
57
+ end
58
+
59
+ def test_attribute_name
60
+ parse("<div foo")
61
+ assert_equal "div", @parser.tag_name
62
+ assert_equal :attribute_name, @parser.context
63
+ assert_equal "foo", @parser.attribute_name
64
+ parse("bla")
65
+ assert_equal "foobla", @parser.attribute_name
66
+ end
67
+
68
+ def test_attribute_name_and_close
69
+ parse("<div foo>")
70
+ assert_equal "div", @parser.tag_name
71
+ assert_equal "foo", @parser.attribute_name
72
+ assert_nil @parser.attribute_value
73
+ assert_equal :none, @parser.context
74
+ end
75
+
76
+ def test_attribute_solidus_close
77
+ parse("<div foo/>")
78
+ assert_equal "div", @parser.tag_name
79
+ assert_equal "foo", @parser.attribute_name
80
+ assert_nil @parser.attribute_value
81
+ assert_equal :none, @parser.context
82
+ assert_equal false, @parser.closing_tag?
83
+ assert_equal true, @parser.self_closing_tag?
84
+ end
85
+
86
+ def test_attribute_value_solidus_close
87
+ parse("<div 'foo'/>")
88
+ assert_equal "div", @parser.tag_name
89
+ assert_nil @parser.attribute_name
90
+ assert_equal "foo", @parser.attribute_value
91
+ assert_equal true, @parser.attribute_quoted?
92
+ assert_equal :none, @parser.context
93
+ assert_equal false, @parser.closing_tag?
94
+ assert_equal true, @parser.self_closing_tag?
95
+ end
96
+
97
+ def test_attribute_value_and_tag_close
98
+ parse('<div "foo">')
99
+ assert_equal "div", @parser.tag_name
100
+ assert_nil @parser.attribute_name
101
+ assert_equal 'foo', @parser.attribute_value
102
+ assert_equal true, @parser.attribute_quoted?
103
+ assert_equal '"', @parser.quote_character
104
+ assert_equal :none, @parser.context
105
+ assert_equal false, @parser.closing_tag?
106
+ assert_equal false, @parser.self_closing_tag?
107
+ end
108
+
109
+ def test_attribute_value_equal_and_tag_close
110
+ parse("<div foo=>")
111
+ assert_equal "div", @parser.tag_name
112
+ assert_equal "foo", @parser.attribute_name
113
+ assert_nil @parser.attribute_value
114
+ assert_equal :none, @parser.context
115
+ assert_equal false, @parser.closing_tag?
116
+ assert_equal false, @parser.self_closing_tag?
117
+ end
118
+
119
+ def test_attribute_value_open_quote
120
+ parse("<div '")
121
+ assert_nil @parser.attribute_name
122
+ assert_nil @parser.attribute_value
123
+ assert_equal true, @parser.attribute_quoted?
124
+ assert_equal "'", @parser.quote_character
125
+ assert_equal :quoted_value, @parser.context
126
+ end
127
+
128
+ def test_attribute_name_and_value_open_quote
129
+ parse("<div foo='")
130
+ assert_nil @parser.attribute_value
131
+ assert_equal true, @parser.attribute_quoted?
132
+ assert_equal "'", @parser.quote_character
133
+ assert_equal :quoted_value, @parser.context
134
+ end
135
+
136
+ def test_attribute_value_open
137
+ parse("<div foo=")
138
+ assert_equal "div", @parser.tag_name
139
+ assert_equal "foo", @parser.attribute_name
140
+ assert_nil @parser.attribute_value
141
+ assert_equal :after_equal, @parser.context
142
+ end
143
+
144
+ def test_attribute_name_with_solidus
145
+ parse("<div foo=/")
146
+ assert_equal "foo", @parser.attribute_name
147
+ assert_equal "/", @parser.attribute_value
148
+ assert_equal false, @parser.attribute_quoted?
149
+ assert_nil @parser.quote_character
150
+ assert_equal :unquoted_value, @parser.context
151
+ end
152
+
153
+ def test_solidus_anywhere_doesnt_affect_closing_flags
154
+ parse("<div / >")
155
+ assert_equal "div", @parser.tag_name
156
+ assert_equal false, @parser.closing_tag?
157
+ assert_equal false, @parser.self_closing_tag?
158
+ end
159
+
160
+ def test_solidus_at_beginning_and_end_affect_closing_flags
161
+ parse("</div/>")
162
+ assert_equal "div", @parser.tag_name
163
+ assert_equal true, @parser.closing_tag?
164
+ assert_equal true, @parser.self_closing_tag?
165
+ end
166
+
167
+ def test_attribute_name_with_solidus_and_name
168
+ parse("<div foo=/bar")
169
+ assert_equal "foo", @parser.attribute_name
170
+ assert_equal "/bar", @parser.attribute_value
171
+ assert_equal false, @parser.attribute_quoted?
172
+ assert_nil @parser.quote_character
173
+ assert_equal :unquoted_value, @parser.context
174
+ end
175
+
176
+ def test_attribute_with_value_with_solidus
177
+ parse("<div foo='bar'")
178
+ assert_equal "foo", @parser.attribute_name
179
+ assert_equal "bar", @parser.attribute_value
180
+ assert_equal :space_after_attribute, @parser.context
181
+ parse("/baz")
182
+ assert_equal "baz", @parser.attribute_name
183
+ assert_nil @parser.attribute_value
184
+ assert_equal false, @parser.attribute_quoted?
185
+ assert_nil @parser.quote_character
186
+ assert_equal :attribute_name, @parser.context
187
+ end
188
+
189
+ def test_attribute_with_unquoted_value
190
+ parse("<div foo=bar")
191
+ assert_equal "foo", @parser.attribute_name
192
+ assert_equal "bar", @parser.attribute_value
193
+ assert_equal false, @parser.attribute_quoted?
194
+ assert_nil @parser.quote_character
195
+ assert_equal :unquoted_value, @parser.context
196
+ end
197
+
198
+ def test_attribute_with_unquoted_value_tag_end
199
+ parse("<div foo=bar>")
200
+ assert_equal "foo", @parser.attribute_name
201
+ assert_equal "bar", @parser.attribute_value
202
+ assert_equal false, @parser.attribute_quoted?
203
+ assert_nil @parser.quote_character
204
+ assert_equal :none, @parser.context
205
+ end
206
+
207
+ def test_attribute_with_unquoted_value_with_solidus
208
+ parse("<div foo=ba", "r", "/baz")
209
+ assert_equal "foo", @parser.attribute_name
210
+ assert_equal "bar/baz", @parser.attribute_value
211
+ assert_equal false, @parser.attribute_quoted?
212
+ assert_nil @parser.quote_character
213
+ assert_equal :unquoted_value, @parser.context
214
+ end
215
+
216
+ def test_attribute_with_unquoted_value_with_space
217
+ parse("<div foo=ba", "r", " baz")
218
+ assert_equal "baz", @parser.attribute_name
219
+ assert_nil @parser.attribute_value
220
+ assert_equal false, @parser.attribute_quoted?
221
+ assert_nil @parser.quote_character
222
+ assert_equal :attribute_name, @parser.context
223
+ end
224
+
225
+ def test_attribute_with_multipart_unquoted_value
226
+ parse("<div foo=ba", "r", "&baz")
227
+ assert_equal "foo", @parser.attribute_name
228
+ assert_equal "bar&baz", @parser.attribute_value
229
+ assert_equal false, @parser.attribute_quoted?
230
+ assert_nil @parser.quote_character
231
+ assert_equal :unquoted_value, @parser.context
232
+ end
233
+
234
+ def test_attribute_name_incomplete
235
+ parse("<div foo")
236
+ assert_equal "foo", @parser.attribute_name
237
+ assert_equal :attribute_name, @parser.context
238
+ end
239
+
240
+ def test_space_after_attribute_name_switches_context
241
+ parse("<div foo ")
242
+ assert_equal "foo", @parser.attribute_name
243
+ assert_equal :after_attribute_name, @parser.context
244
+ end
245
+
246
+ def test_solidus_after_attribute_name_switches_context
247
+ parse("<div foo/")
248
+ assert_equal "foo", @parser.attribute_name
249
+ assert_equal :tag_end, @parser.context
250
+ end
251
+
252
+ def test_attribute_name_is_complete_after_equal
253
+ parse("<div foo=")
254
+ assert_equal "foo", @parser.attribute_name
255
+ assert_equal :after_equal, @parser.context
256
+ end
257
+
258
+ def test_attribute_name_without_value
259
+ parse("<div foo ")
260
+ assert_equal "foo", @parser.attribute_name
261
+ assert_nil @parser.attribute_value
262
+ assert_equal :after_attribute_name, @parser.context
263
+ end
264
+
265
+ def test_attribute_name_are_separated_by_space
266
+ parse("<div foo bar")
267
+ assert_equal "bar", @parser.attribute_name
268
+ assert_nil @parser.attribute_value
269
+ assert_equal :attribute_name, @parser.context
270
+ end
271
+
272
+ def test_comment_context
273
+ parse("<!--")
274
+ assert_equal :comment, @parser.context
275
+ assert_nil @parser.comment_text
276
+ end
277
+
278
+ def test_cdata_context
279
+ parse("<![CDATA[")
280
+ assert_equal :cdata, @parser.context
281
+ assert_nil @parser.cdata_text
282
+ end
283
+
284
+ def test_comment_text
285
+ parse("<!-- foo")
286
+ assert_equal :comment, @parser.context
287
+ assert_equal " foo", @parser.comment_text
288
+ end
289
+
290
+ def test_cdata_text
291
+ parse("<![CDATA[ foo")
292
+ assert_equal :cdata, @parser.context
293
+ assert_equal " foo", @parser.cdata_text
294
+ end
295
+
296
+ def test_multipart_comment
297
+ parse("<!-- f", "oo", "bar")
298
+ assert_equal :comment, @parser.context
299
+ assert_equal " foobar", @parser.comment_text
300
+ end
301
+
302
+ def test_multipart_cdata
303
+ parse("<![CDATA[ f", "oo", "bar")
304
+ assert_equal :cdata, @parser.context
305
+ assert_equal " foobar", @parser.cdata_text
306
+ end
307
+
308
+ def test_comment_end
309
+ parse("<!-- foo -->")
310
+ assert_equal :none, @parser.context
311
+ assert_equal " foo ", @parser.comment_text
312
+ end
313
+
314
+ def test_cdata_end
315
+ parse("<![CDATA[ foo ]]>")
316
+ assert_equal :none, @parser.context
317
+ assert_equal " foo ", @parser.cdata_text
318
+ end
319
+
320
+ def test_plaintext_never_stops_parsing
321
+ parse("<plaintext>")
322
+ assert_equal :rawtext, @parser.context
323
+ assert_equal "plaintext", @parser.tag_name
324
+ assert_nil @parser.rawtext_text
325
+
326
+ parse("some", "<text")
327
+ assert_equal :rawtext, @parser.context
328
+ assert_equal "some<text", @parser.rawtext_text
329
+
330
+ parse("<plaintext")
331
+ assert_equal :rawtext, @parser.context
332
+ assert_equal "some<text<plaintext", @parser.rawtext_text
333
+
334
+ parse("</plaintext>")
335
+ assert_equal :rawtext, @parser.context
336
+ assert_equal "some<text<plaintext</plaintext>", @parser.rawtext_text
337
+ end
338
+
339
+ %w(title textarea style xmp iframe noembed noframes).each do |name|
340
+ define_method "test_#{name}_rawtext" do
341
+ parse("<#{name}>")
342
+ assert_equal :rawtext, @parser.context
343
+ assert_equal name, @parser.tag_name
344
+ assert_nil @parser.rawtext_text
345
+
346
+ parse("some", "<text")
347
+ assert_equal :rawtext, @parser.context
348
+ assert_equal "some<text", @parser.rawtext_text
349
+
350
+ parse("<#{name}")
351
+ assert_equal :rawtext, @parser.context
352
+ assert_equal "some<text<#{name}", @parser.rawtext_text
353
+
354
+ parse("</#{name}")
355
+ assert_equal :tag_name, @parser.context
356
+ assert_equal "some<text<#{name}", @parser.rawtext_text
357
+
358
+ parse(">")
359
+ assert_equal :none, @parser.context
360
+ assert_equal "some<text<#{name}", @parser.rawtext_text
361
+ end
362
+ end
363
+
364
+ def test_script_rawtext
365
+ parse("<script>data data data")
366
+ assert_equal :rawtext, @parser.context
367
+ assert_equal "script", @parser.tag_name
368
+ assert_equal "data data data", @parser.rawtext_text
369
+ parse("</script")
370
+ assert_equal :tag_name, @parser.context
371
+ assert_equal "script", @parser.tag_name
372
+ parse(">")
373
+ assert_equal :none, @parser.context
374
+ end
375
+
376
+ def test_consecutive_scripts
377
+ parse("<script>foo\n</script>\n<script>bar</script>\n bla")
378
+ assert_equal :none, @parser.context
379
+ end
380
+
381
+ def test_end_of_script_regression
382
+ html = "<script><!</script>"
383
+ parse(html)
384
+ assert_equal :none, @parser.context
385
+ end
386
+
387
+ def test_document_length
388
+ @parser = HtmlTokenizer::Parser.new
389
+ assert_equal 0, @parser.document_length
390
+ parse("abcdef")
391
+ assert_equal 6, @parser.document_length
392
+ parse("abcdef")
393
+ assert_equal 12, @parser.document_length
394
+ end
395
+
396
+ def test_document_method
397
+ @parser = HtmlTokenizer::Parser.new
398
+ assert_nil @parser.document
399
+ parse("abcdef")
400
+ assert_equal "abcdef", @parser.document
401
+ parse("abcdef")
402
+ assert_equal "abcdefabcdef", @parser.document
403
+ end
404
+
405
+ def test_yields_raw_tokens_when_block_given
406
+ tokens = []
407
+ parse("<foo>") do |*token|
408
+ tokens << token
409
+ end
410
+ assert_equal [[:tag_start, 0, 1, 1, 0], [:tag_name, 1, 4, 1, 1], [:tag_end, 4, 5, 1, 4]], tokens
411
+ end
412
+
413
+ def test_yields_line_and_column_numbers
414
+ tokens = []
415
+ parse("<\n>") do |*token|
416
+ tokens << token
417
+ end
418
+ assert_equal [[:tag_start, 0, 1, 1, 0], [:whitespace, 1, 2, 1, 1], [:tag_end, 2, 3, 2, 0]], tokens
419
+ end
420
+
421
+ def test_append_placeholder_adjusts_line_and_column_numbers_but_does_not_parse
422
+ @parser = HtmlTokenizer::Parser.new
423
+ tokens = []
424
+ @parser.parse("foo\n") do |*token|
425
+ tokens << token
426
+ end
427
+ @parser.append_placeholder("<%= some ruby do\n foo\nend %>\n") do |*token|
428
+ tokens << token
429
+ end
430
+ @parser.parse("bar\n") do |*token|
431
+ tokens << token
432
+ end
433
+ assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
434
+ assert_equal "bar\n", @parser.extract(34, 38)
435
+ end
436
+
437
+ def test_extract_method
438
+ parse("abcdefg")
439
+ assert_equal "a", @parser.extract(0, 1)
440
+ assert_equal "cd", @parser.extract(2, 4)
441
+ end
442
+
443
+ def test_extract_method_raises_argument_error_end_past_length
444
+ parse("abcdefg")
445
+ e = assert_raises(ArgumentError) do
446
+ @parser.extract(0, 32)
447
+ end
448
+ assert_equal "'end' argument not in range of document", e.message
449
+ end
450
+
451
+ def test_extract_method_raises_argument_error_end_less_than_start
452
+ parse("abcdefg")
453
+ e = assert_raises(ArgumentError) do
454
+ @parser.extract(1, 0)
455
+ end
456
+ assert_equal "'end' must be greater or equal than 'start'", e.message
457
+ end
458
+
459
+ def test_solidus_or_tag_name_error
460
+ parse('<>')
461
+ assert_equal 1, @parser.errors_count
462
+ assert_equal "expected '/' or tag name", @parser.errors.first.to_s
463
+ assert_equal 1, @parser.errors.first.line
464
+ assert_equal 1, @parser.errors.first.column
465
+ end
466
+
467
+ def test_solidus_or_tag_name_error_2
468
+ parse('< ')
469
+ assert_equal 1, @parser.errors_count
470
+ assert_equal "expected '/' or tag name", @parser.errors.first.to_s
471
+ assert_equal 1, @parser.errors.first.line
472
+ assert_equal 1, @parser.errors.first.column
473
+ end
474
+
475
+ def test_tag_error
476
+ parse('<foo =')
477
+ assert_equal 1, @parser.errors_count
478
+ assert_equal "expected whitespace, '>', attribute name or value", @parser.errors.first.to_s
479
+ assert_equal 1, @parser.errors.first.line
480
+ assert_equal 5, @parser.errors.first.column
481
+ end
482
+
483
+ def test_tag_end_error
484
+ parse('<foo /x')
485
+ assert_equal 1, @parser.errors_count
486
+ assert_equal "expected '>' after '/'", @parser.errors.first.to_s
487
+ assert_equal 1, @parser.errors.first.line
488
+ assert_equal 6, @parser.errors.first.column
489
+ end
490
+
491
+ def test_tag_end_error_2
492
+ parse('<foo / ')
493
+ assert_equal 1, @parser.errors_count
494
+ assert_equal "expected '>' after '/'", @parser.errors.first.to_s
495
+ assert_equal 1, @parser.errors.first.line
496
+ assert_equal 6, @parser.errors.first.column
497
+ end
498
+
499
+ def test_attribute_name_error
500
+ parse('<foo bar~')
501
+ assert_equal 2, @parser.errors_count
502
+ assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors.first.to_s
503
+ assert_equal 1, @parser.errors.first.line
504
+ assert_equal 8, @parser.errors.first.column
505
+ assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors[0].to_s
506
+ assert_equal 1, @parser.errors[0].line
507
+ assert_equal 8, @parser.errors[0].column
508
+ end
509
+
510
+ def test_attribute_whitespace_or_equal_error
511
+ parse('<foo bar ~')
512
+ assert_equal 2, @parser.errors_count
513
+ assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors.first.to_s
514
+ assert_equal 1, @parser.errors.first.line
515
+ assert_equal 9, @parser.errors.first.column
516
+ assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors[0].to_s
517
+ assert_equal 1, @parser.errors[0].line
518
+ assert_equal 9, @parser.errors[0].column
519
+ end
520
+
521
+ def test_attribute_whitespace_or_equal_error_2
522
+ parse('<foo bar = >')
523
+ assert_equal 1, @parser.errors_count
524
+ assert_equal "expected attribute value after '='", @parser.errors.first.to_s
525
+ assert_equal 1, @parser.errors.first.line
526
+ assert_equal 11, @parser.errors.first.column
527
+ end
528
+
529
+ def test_attribute_after_quoted_value
530
+ parse('<foo bar=""x')
531
+ assert_equal 1, @parser.errors_count
532
+ assert_equal "expected space after attribute value", @parser.errors.first.to_s
533
+ assert_equal 1, @parser.errors.first.line
534
+ assert_equal 11, @parser.errors.first.column
535
+ end
536
+
537
+ def test_valid_syntaxes
538
+ parse(
539
+ '<div>',
540
+ '<div />',
541
+ '<div/>',
542
+ '<div data-thing>',
543
+ '<div data-thing />',
544
+ '<div data-thing/>',
545
+ '<div "value">',
546
+ '<div "value" />',
547
+ '<div "value"/>',
548
+ '<div data-thing = "value">',
549
+ '<div data-thing="value">',
550
+ '<div data-thing="value"/>',
551
+ '<div data-thing data-other-thing="value">',
552
+ '<div data-thing data-other-thing="value"/>',
553
+ "<div \n\t\r data-thing \n\t\r data-other-thing='value'>",
554
+ '<div data-thing "value">',
555
+ '<div data-thing "value"/>',
556
+ '<div data-thing "value" />',
557
+ '<div "value" data-thing>',
558
+ '<div "value" data-thing/>',
559
+ '<div foo=unquoted=bla/>',
560
+ '<div foo=unquoted=bla />',
561
+ '<div foo=unquoted=bla>',
562
+ '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
563
+ )
564
+ assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
565
+ end
566
+
567
+ private
568
+
569
+ def parse(*parts, &block)
570
+ @parser ||= HtmlTokenizer::Parser.new
571
+ parts.each do |part|
572
+ @parser.parse(part, &block)
573
+ end
574
+ end
575
+ end