html_tokenizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,74 @@
1
+ #pragma once
2
+
3
/*
 * Parsing contexts the tokenizer can be in. Values of this type are stored
 * in tokenizer_t.context. TOKENIZER_NONE is explicitly 0; the remaining
 * enumerators follow in declaration order.
 */
enum tokenizer_context {
  TOKENIZER_NONE = 0,
  TOKENIZER_HTML,
  TOKENIZER_OPEN_TAG,
  TOKENIZER_SOLIDUS_OR_TAG_NAME,
  TOKENIZER_TAG_NAME,
  TOKENIZER_CDATA,
  TOKENIZER_RCDATA,      /* title, textarea */
  TOKENIZER_RAWTEXT,     /* style, xmp, iframe, noembed, noframes */
  TOKENIZER_SCRIPT_DATA, /* script */
  TOKENIZER_PLAINTEXT,   /* plaintext */
  TOKENIZER_COMMENT,
  TOKENIZER_ATTRIBUTE_NAME,
  TOKENIZER_ATTRIBUTE_VALUE,
  TOKENIZER_ATTRIBUTE_UNQUOTED,
  TOKENIZER_ATTRIBUTE_QUOTED,
};
20
+
21
/*
 * Kinds of tokens reported by the tokenizer. A value of this type is passed
 * to tokenizer_t.f_callback and kept in tokenizer_t.last_token.
 * TOKEN_NONE is explicitly 0; the remaining enumerators follow in
 * declaration order.
 */
enum token_type {
  TOKEN_NONE = 0,
  TOKEN_TEXT,
  TOKEN_WHITESPACE,
  TOKEN_COMMENT_START,
  TOKEN_COMMENT_END,
  TOKEN_TAG_START,
  TOKEN_TAG_NAME,
  TOKEN_TAG_END,
  TOKEN_ATTRIBUTE_NAME,
  TOKEN_ATTRIBUTE_QUOTED_VALUE_START,
  TOKEN_ATTRIBUTE_QUOTED_VALUE,
  TOKEN_ATTRIBUTE_QUOTED_VALUE_END,
  TOKEN_ATTRIBUTE_UNQUOTED_VALUE,
  TOKEN_CDATA_START,
  TOKEN_CDATA_END,
  TOKEN_SOLIDUS,
  TOKEN_EQUAL,
  TOKEN_MALFORMED,
};
41
+
42
/*
 * Input buffer state for a scan.
 * NOTE(review): cursor/length look like a read offset into `string` and the
 * buffer's size in bytes -- confirm against the tokenizer implementation.
 */
struct scan_t {
  char *string;         /* buffer being scanned */
  unsigned long cursor; /* position within the buffer */
  unsigned long length; /* extent of the buffer */
};
47
+
48
+ struct tokenizer_t
49
+ {
50
+ enum tokenizer_context context[1000];
51
+ uint32_t current_context;
52
+
53
+ void *callback_data;
54
+ void (*f_callback)(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data);
55
+
56
+ char attribute_value_start;
57
+ int found_attribute;
58
+
59
+ char *current_tag;
60
+
61
+ int is_closing_tag;
62
+ enum token_type last_token;
63
+
64
+ struct scan_t scan;
65
+ };
66
+
67
+
68
+ void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
69
+ void tokenizer_init(struct tokenizer_t *tk);
70
+ void tokenizer_scan_all(struct tokenizer_t *tk);
71
+ VALUE token_type_to_symbol(enum token_type type);
72
+
73
+ extern const rb_data_type_t ht_tokenizer_data_type;
74
+ #define Tokenizer_Get_Struct(obj, sval) TypedData_Get_Struct(obj, struct tokenizer_t, &ht_tokenizer_data_type, sval)
@@ -0,0 +1,19 @@
1
# Gem specification for html_tokenizer, which ships a native extension
# built from ext/html_tokenizer_ext.
Gem::Specification.new do |spec|
  spec.name = "html_tokenizer"
  spec.version = "0.0.1"
  spec.summary = "HTML Tokenizer"
  spec.author = "Francois Chagnon"

  spec.extensions = ['ext/html_tokenizer_ext/extconf.rb']

  # The packaged file list is whatever git tracks. A previous Dir.glob-based
  # assignment to spec.files was always overwritten by this one, so it has
  # been dropped; the effective value is unchanged.
  spec.files = `git ls-files -z`.split("\x0")
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib", "ext"]

  # NOTE(review): '~> 0' appears to resolve to '>= 0, < 1', which would
  # exclude current releases of these gems -- verify the intended bounds.
  spec.add_development_dependency 'rake', '~> 0'
  spec.add_development_dependency 'rake-compiler', '~> 0'
  spec.add_development_dependency 'minitest', '~> 0'
end
@@ -0,0 +1,12 @@
1
+ require 'html_tokenizer_ext'
2
+
3
module HtmlTokenizer
  # Error that carries a message together with a line and a column.
  class ParserError < RuntimeError
    # @return [Integer, nil] line associated with the error
    # @return [Integer, nil] column associated with the error
    attr_reader :line, :column

    # @param message [String] human-readable description
    # @param line [Integer, nil] line associated with the error
    # @param column [Integer, nil] column associated with the error
    #
    # line and column default to nil so the idiomatic
    # `raise ParserError, "message"` form works instead of failing with
    # ArgumentError; three-argument callers are unaffected.
    def initialize(message, line = nil, column = nil)
      super(message)
      @line = line
      @column = column
    end
  end
end
@@ -0,0 +1,575 @@
1
+ require "minitest/autorun"
2
+ require "html_tokenizer"
3
+
4
# Tests for HtmlTokenizer::Parser. Each test feeds one or more chunks of
# markup through the private #parse helper and then inspects the state the
# parser exposes (context, tag/attribute accessors, errors, raw tokens).
class HtmlTokenizer::ParserTest < Minitest::Test
  def test_empty_context
    parse
    assert_equal :none, @parser.context
  end

  def test_open_tag
    parse("<div")
    assert_equal :tag_name, @parser.context
    assert_equal "div", @parser.tag_name
    assert_equal false, @parser.closing_tag?
  end

  def test_open_attribute_value
    parse('<div "foo')
    assert_equal :quoted_value, @parser.context
    assert_equal 'foo', @parser.attribute_value
    assert_equal '"', @parser.quote_character
    parse('bar"')
    assert_equal :space_after_attribute, @parser.context
    assert_equal 'foobar', @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal '"', @parser.quote_character
  end

  def test_multi_part_namespace_tag
    parse("<foo:")
    assert_equal "foo:", @parser.tag_name
    parse("bar")
    assert_equal "foo:bar", @parser.tag_name
  end

  def test_solidus_after_tag_name
    parse("<foo/")
    assert_equal "foo", @parser.tag_name
    assert_equal :tag_end, @parser.context
  end

  def test_whitespace_after_tag_name
    parse("<foo ")
    assert_equal "foo", @parser.tag_name
    assert_equal :tag, @parser.context
  end

  def test_context_is_tag_name_just_after_solidus
    parse("</")
    assert_equal :tag_name, @parser.context
    assert_equal true, @parser.closing_tag?
  end

  def test_close_tag
    parse("<div", ">")
    assert_equal :none, @parser.context
  end

  def test_attribute_name
    parse("<div foo")
    assert_equal "div", @parser.tag_name
    assert_equal :attribute_name, @parser.context
    assert_equal "foo", @parser.attribute_name
    parse("bla")
    assert_equal "foobla", @parser.attribute_name
  end

  def test_attribute_name_and_close
    parse("<div foo>")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :none, @parser.context
  end

  def test_attribute_solidus_close
    parse("<div foo/>")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal true, @parser.self_closing_tag?
  end

  def test_attribute_value_solidus_close
    parse("<div 'foo'/>")
    assert_equal "div", @parser.tag_name
    assert_nil @parser.attribute_name
    assert_equal "foo", @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal true, @parser.self_closing_tag?
  end

  def test_attribute_value_and_tag_close
    parse('<div "foo">')
    assert_equal "div", @parser.tag_name
    assert_nil @parser.attribute_name
    assert_equal 'foo', @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal '"', @parser.quote_character
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal false, @parser.self_closing_tag?
  end

  def test_attribute_value_equal_and_tag_close
    parse("<div foo=>")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal false, @parser.self_closing_tag?
  end

  def test_attribute_value_open_quote
    parse("<div '")
    assert_nil @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal "'", @parser.quote_character
    assert_equal :quoted_value, @parser.context
  end

  def test_attribute_name_and_value_open_quote
    parse("<div foo='")
    assert_nil @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal "'", @parser.quote_character
    assert_equal :quoted_value, @parser.context
  end

  def test_attribute_value_open
    parse("<div foo=")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :after_equal, @parser.context
  end

  def test_attribute_name_with_solidus
    parse("<div foo=/")
    assert_equal "foo", @parser.attribute_name
    assert_equal "/", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_solidus_anywhere_doesnt_affect_closing_flags
    parse("<div / >")
    assert_equal "div", @parser.tag_name
    assert_equal false, @parser.closing_tag?
    assert_equal false, @parser.self_closing_tag?
  end

  def test_solidus_at_beginning_and_end_affect_closing_flags
    parse("</div/>")
    assert_equal "div", @parser.tag_name
    assert_equal true, @parser.closing_tag?
    assert_equal true, @parser.self_closing_tag?
  end

  def test_attribute_name_with_solidus_and_name
    parse("<div foo=/bar")
    assert_equal "foo", @parser.attribute_name
    assert_equal "/bar", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_with_value_with_solidus
    parse("<div foo='bar'")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar", @parser.attribute_value
    assert_equal :space_after_attribute, @parser.context
    parse("/baz")
    assert_equal "baz", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :attribute_name, @parser.context
  end

  def test_attribute_with_unquoted_value
    parse("<div foo=bar")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_with_unquoted_value_tag_end
    parse("<div foo=bar>")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :none, @parser.context
  end

  def test_attribute_with_unquoted_value_with_solidus
    parse("<div foo=ba", "r", "/baz")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar/baz", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_with_unquoted_value_with_space
    parse("<div foo=ba", "r", " baz")
    assert_equal "baz", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :attribute_name, @parser.context
  end

  def test_attribute_with_multipart_unquoted_value
    parse("<div foo=ba", "r", "&baz")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar&baz", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_name_incomplete
    parse("<div foo")
    assert_equal "foo", @parser.attribute_name
    assert_equal :attribute_name, @parser.context
  end

  def test_space_after_attribute_name_switches_context
    parse("<div foo ")
    assert_equal "foo", @parser.attribute_name
    assert_equal :after_attribute_name, @parser.context
  end

  def test_solidus_after_attribute_name_switches_context
    parse("<div foo/")
    assert_equal "foo", @parser.attribute_name
    assert_equal :tag_end, @parser.context
  end

  def test_attribute_name_is_complete_after_equal
    parse("<div foo=")
    assert_equal "foo", @parser.attribute_name
    assert_equal :after_equal, @parser.context
  end

  def test_attribute_name_without_value
    parse("<div foo ")
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :after_attribute_name, @parser.context
  end

  def test_attribute_name_are_separated_by_space
    parse("<div foo bar")
    assert_equal "bar", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :attribute_name, @parser.context
  end

  def test_comment_context
    parse("<!--")
    assert_equal :comment, @parser.context
    assert_nil @parser.comment_text
  end

  def test_cdata_context
    parse("<![CDATA[")
    assert_equal :cdata, @parser.context
    assert_nil @parser.cdata_text
  end

  def test_comment_text
    parse("<!-- foo")
    assert_equal :comment, @parser.context
    assert_equal " foo", @parser.comment_text
  end

  def test_cdata_text
    parse("<![CDATA[ foo")
    assert_equal :cdata, @parser.context
    assert_equal " foo", @parser.cdata_text
  end

  def test_multipart_comment
    parse("<!-- f", "oo", "bar")
    assert_equal :comment, @parser.context
    assert_equal " foobar", @parser.comment_text
  end

  def test_multipart_cdata
    parse("<![CDATA[ f", "oo", "bar")
    assert_equal :cdata, @parser.context
    assert_equal " foobar", @parser.cdata_text
  end

  def test_comment_end
    parse("<!-- foo -->")
    assert_equal :none, @parser.context
    assert_equal " foo ", @parser.comment_text
  end

  def test_cdata_end
    parse("<![CDATA[ foo ]]>")
    assert_equal :none, @parser.context
    assert_equal " foo ", @parser.cdata_text
  end

  def test_plaintext_never_stops_parsing
    parse("<plaintext>")
    assert_equal :rawtext, @parser.context
    assert_equal "plaintext", @parser.tag_name
    assert_nil @parser.rawtext_text

    parse("some", "<text")
    assert_equal :rawtext, @parser.context
    assert_equal "some<text", @parser.rawtext_text

    parse("<plaintext")
    assert_equal :rawtext, @parser.context
    assert_equal "some<text<plaintext", @parser.rawtext_text

    parse("</plaintext>")
    assert_equal :rawtext, @parser.context
    assert_equal "some<text<plaintext</plaintext>", @parser.rawtext_text
  end

  # One generated test per element name in the list below.
  %w[title textarea style xmp iframe noembed noframes].each do |name|
    define_method "test_#{name}_rawtext" do
      parse("<#{name}>")
      assert_equal :rawtext, @parser.context
      assert_equal name, @parser.tag_name
      assert_nil @parser.rawtext_text

      parse("some", "<text")
      assert_equal :rawtext, @parser.context
      assert_equal "some<text", @parser.rawtext_text

      parse("<#{name}")
      assert_equal :rawtext, @parser.context
      assert_equal "some<text<#{name}", @parser.rawtext_text

      parse("</#{name}")
      assert_equal :tag_name, @parser.context
      assert_equal "some<text<#{name}", @parser.rawtext_text

      parse(">")
      assert_equal :none, @parser.context
      assert_equal "some<text<#{name}", @parser.rawtext_text
    end
  end

  def test_script_rawtext
    parse("<script>data data data")
    assert_equal :rawtext, @parser.context
    assert_equal "script", @parser.tag_name
    assert_equal "data data data", @parser.rawtext_text
    parse("</script")
    assert_equal :tag_name, @parser.context
    assert_equal "script", @parser.tag_name
    parse(">")
    assert_equal :none, @parser.context
  end

  def test_consecutive_scripts
    parse("<script>foo\n</script>\n<script>bar</script>\n bla")
    assert_equal :none, @parser.context
  end

  def test_end_of_script_regression
    html = "<script><!</script>"
    parse(html)
    assert_equal :none, @parser.context
  end

  def test_document_length
    @parser = HtmlTokenizer::Parser.new
    assert_equal 0, @parser.document_length
    parse("abcdef")
    assert_equal 6, @parser.document_length
    parse("abcdef")
    assert_equal 12, @parser.document_length
  end

  def test_document_method
    @parser = HtmlTokenizer::Parser.new
    assert_nil @parser.document
    parse("abcdef")
    assert_equal "abcdef", @parser.document
    parse("abcdef")
    assert_equal "abcdefabcdef", @parser.document
  end

  def test_yields_raw_tokens_when_block_given
    tokens = []
    parse("<foo>") do |*token|
      tokens << token
    end
    assert_equal [[:tag_start, 0, 1, 1, 0], [:tag_name, 1, 4, 1, 1], [:tag_end, 4, 5, 1, 4]], tokens
  end

  def test_yields_line_and_column_numbers
    tokens = []
    parse("<\n>") do |*token|
      tokens << token
    end
    assert_equal [[:tag_start, 0, 1, 1, 0], [:whitespace, 1, 2, 1, 1], [:tag_end, 2, 3, 2, 0]], tokens
  end

  def test_append_placeholder_adjusts_line_and_column_numbers_but_does_not_parse
    @parser = HtmlTokenizer::Parser.new
    tokens = []
    @parser.parse("foo\n") do |*token|
      tokens << token
    end
    # The placeholder must be exactly 30 characters long (two spaces before
    # "foo") for the following text token to start at offset 34 as asserted.
    @parser.append_placeholder("<%= some ruby do\n  foo\nend %>\n") do |*token|
      tokens << token
    end
    @parser.parse("bar\n") do |*token|
      tokens << token
    end
    assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
    assert_equal "bar\n", @parser.extract(34, 38)
  end

  def test_extract_method
    parse("abcdefg")
    assert_equal "a", @parser.extract(0, 1)
    assert_equal "cd", @parser.extract(2, 4)
  end

  def test_extract_method_raises_argument_error_end_past_length
    parse("abcdefg")
    e = assert_raises(ArgumentError) do
      @parser.extract(0, 32)
    end
    assert_equal "'end' argument not in range of document", e.message
  end

  def test_extract_method_raises_argument_error_end_less_than_start
    parse("abcdefg")
    e = assert_raises(ArgumentError) do
      @parser.extract(1, 0)
    end
    assert_equal "'end' must be greater or equal than 'start'", e.message
  end

  def test_solidus_or_tag_name_error
    parse('<>')
    assert_equal 1, @parser.errors_count
    assert_equal "expected '/' or tag name", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 1, @parser.errors.first.column
  end

  def test_solidus_or_tag_name_error_2
    parse('< ')
    assert_equal 1, @parser.errors_count
    assert_equal "expected '/' or tag name", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 1, @parser.errors.first.column
  end

  def test_tag_error
    parse('<foo =')
    assert_equal 1, @parser.errors_count
    assert_equal "expected whitespace, '>', attribute name or value", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 5, @parser.errors.first.column
  end

  def test_tag_end_error
    parse('<foo /x')
    assert_equal 1, @parser.errors_count
    assert_equal "expected '>' after '/'", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 6, @parser.errors.first.column
  end

  def test_tag_end_error_2
    parse('<foo / ')
    assert_equal 1, @parser.errors_count
    assert_equal "expected '>' after '/'", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 6, @parser.errors.first.column
  end

  def test_attribute_name_error
    parse('<foo bar~')
    assert_equal 2, @parser.errors_count
    assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 8, @parser.errors.first.column
    # NOTE(review): errors[0] is the same element as errors.first, so the
    # second of the two reported errors is never inspected -- errors[1] was
    # probably intended; confirm its expected message/position before changing.
    assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors[0].to_s
    assert_equal 1, @parser.errors[0].line
    assert_equal 8, @parser.errors[0].column
  end

  def test_attribute_whitespace_or_equal_error
    parse('<foo bar ~')
    assert_equal 2, @parser.errors_count
    assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 9, @parser.errors.first.column
    # NOTE(review): same as above -- errors[0] duplicates errors.first while
    # errors_count is 2; errors[1] was probably intended.
    assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors[0].to_s
    assert_equal 1, @parser.errors[0].line
    assert_equal 9, @parser.errors[0].column
  end

  def test_attribute_whitespace_or_equal_error_2
    parse('<foo bar = >')
    assert_equal 1, @parser.errors_count
    assert_equal "expected attribute value after '='", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 11, @parser.errors.first.column
  end

  def test_attribute_after_quoted_value
    parse('<foo bar=""x')
    assert_equal 1, @parser.errors_count
    assert_equal "expected space after attribute value", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 11, @parser.errors.first.column
  end

  def test_valid_syntaxes
    parse(
      '<div>',
      '<div />',
      '<div/>',
      '<div data-thing>',
      '<div data-thing />',
      '<div data-thing/>',
      '<div "value">',
      '<div "value" />',
      '<div "value"/>',
      '<div data-thing = "value">',
      '<div data-thing="value">',
      '<div data-thing="value"/>',
      '<div data-thing data-other-thing="value">',
      '<div data-thing data-other-thing="value"/>',
      "<div \n\t\r data-thing \n\t\r data-other-thing='value'>",
      '<div data-thing "value">',
      '<div data-thing "value"/>',
      '<div data-thing "value" />',
      '<div "value" data-thing>',
      '<div "value" data-thing/>',
      '<div foo=unquoted=bla/>',
      '<div foo=unquoted=bla />',
      '<div foo=unquoted=bla>',
      '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    )
    assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
  end

  private

  # Feeds each part, in order, to a parser that is created on first use and
  # then reused, so successive calls within one test continue the same
  # document. An optional block is forwarded to every Parser#parse call.
  def parse(*parts, &block)
    @parser ||= HtmlTokenizer::Parser.new
    parts.each do |part|
      @parser.parse(part, &block)
    end
  end
end