html_tokenizer 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.autotest +3 -0
- data/.gitignore +35 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +24 -0
- data/LICENSE +21 -0
- data/Manifest.txt +8 -0
- data/README.md +2 -0
- data/Rakefile +20 -0
- data/bin/html_tokenizer +3 -0
- data/ext/html_tokenizer_ext/extconf.rb +6 -0
- data/ext/html_tokenizer_ext/html_tokenizer.c +12 -0
- data/ext/html_tokenizer_ext/html_tokenizer.h +7 -0
- data/ext/html_tokenizer_ext/parser.c +767 -0
- data/ext/html_tokenizer_ext/parser.h +87 -0
- data/ext/html_tokenizer_ext/tokenizer.c +682 -0
- data/ext/html_tokenizer_ext/tokenizer.h +74 -0
- data/html_tokenizer.gemspec +19 -0
- data/lib/html_tokenizer.rb +12 -0
- data/test/unit/parser_test.rb +575 -0
- data/test/unit/tokenizer_test.rb +337 -0
- metadata +109 -0
@@ -0,0 +1,74 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
/*
 * Contexts the tokenizer can be in; values of this type are stored in
 * tokenizer_t.context. TOKENIZER_NONE is pinned to 0 and each following
 * enumerator takes the next consecutive value, so the declaration order is
 * significant for anything that relies on the numeric values.
 */
enum tokenizer_context {
  TOKENIZER_NONE = 0,
  TOKENIZER_HTML,
  TOKENIZER_OPEN_TAG,
  TOKENIZER_SOLIDUS_OR_TAG_NAME,
  TOKENIZER_TAG_NAME,
  TOKENIZER_CDATA,
  TOKENIZER_RCDATA, // title, textarea
  TOKENIZER_RAWTEXT, // style, xmp, iframe, noembed, noframes
  TOKENIZER_SCRIPT_DATA, // script
  TOKENIZER_PLAINTEXT, // plaintext
  TOKENIZER_COMMENT,
  TOKENIZER_ATTRIBUTE_NAME,
  TOKENIZER_ATTRIBUTE_VALUE,
  TOKENIZER_ATTRIBUTE_UNQUOTED,
  TOKENIZER_ATTRIBUTE_QUOTED,
};
|
20
|
+
|
21
|
+
/*
 * Kinds of token reported through tokenizer_t.f_callback and remembered in
 * tokenizer_t.last_token. TOKEN_NONE is pinned to 0; the remaining
 * enumerators are numbered consecutively in declaration order.
 */
enum token_type {
  TOKEN_NONE = 0,
  TOKEN_TEXT,
  TOKEN_WHITESPACE,
  TOKEN_COMMENT_START,
  TOKEN_COMMENT_END,
  TOKEN_TAG_START,
  TOKEN_TAG_NAME,
  TOKEN_TAG_END,
  TOKEN_ATTRIBUTE_NAME,
  TOKEN_ATTRIBUTE_QUOTED_VALUE_START,
  TOKEN_ATTRIBUTE_QUOTED_VALUE,
  TOKEN_ATTRIBUTE_QUOTED_VALUE_END,
  TOKEN_ATTRIBUTE_UNQUOTED_VALUE,
  TOKEN_CDATA_START,
  TOKEN_CDATA_END,
  TOKEN_SOLIDUS,
  TOKEN_EQUAL,
  TOKEN_MALFORMED,
};
|
41
|
+
|
42
|
+
/*
 * Scanning state embedded in tokenizer_t as its `scan` member.
 *
 * NOTE(review): presumably `string` is the buffer being scanned, `cursor` the
 * current offset into it and `length` its size -- confirm against tokenizer.c,
 * including who owns the memory behind `string`.
 */
struct scan_t {
  char *string;
  long unsigned int cursor;
  long unsigned int length;
};
|
47
|
+
|
48
|
+
struct tokenizer_t
|
49
|
+
{
|
50
|
+
enum tokenizer_context context[1000];
|
51
|
+
uint32_t current_context;
|
52
|
+
|
53
|
+
void *callback_data;
|
54
|
+
void (*f_callback)(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data);
|
55
|
+
|
56
|
+
char attribute_value_start;
|
57
|
+
int found_attribute;
|
58
|
+
|
59
|
+
char *current_tag;
|
60
|
+
|
61
|
+
int is_closing_tag;
|
62
|
+
enum token_type last_token;
|
63
|
+
|
64
|
+
struct scan_t scan;
|
65
|
+
};
|
66
|
+
|
67
|
+
|
68
|
+
void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
|
69
|
+
void tokenizer_init(struct tokenizer_t *tk);
|
70
|
+
void tokenizer_scan_all(struct tokenizer_t *tk);
|
71
|
+
VALUE token_type_to_symbol(enum token_type type);
|
72
|
+
|
73
|
+
extern const rb_data_type_t ht_tokenizer_data_type;
|
74
|
+
#define Tokenizer_Get_Struct(obj, sval) TypedData_Get_Struct(obj, struct tokenizer_t, &ht_tokenizer_data_type, sval)
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Gem specification for html_tokenizer, which ships a native extension built
# from ext/html_tokenizer_ext.
Gem::Specification.new do |spec|
  spec.name = "html_tokenizer"
  spec.version = "0.0.1"
  spec.summary = "HTML Tokenizer"
  spec.author = "Francois Chagnon"

  spec.extensions = ['ext/html_tokenizer_ext/extconf.rb']

  # The packaged file list is whatever git tracks. An earlier Dir.glob-based
  # assignment to spec.files was overwritten by this line before anything read
  # it, so it has been dropped; the effective value is unchanged.
  spec.files = `git ls-files -z`.split("\x0")
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib", "ext"]

  spec.add_development_dependency 'rake', '~> 0'
  spec.add_development_dependency 'rake-compiler', '~> 0'
  spec.add_development_dependency 'minitest', '~> 0'
end
|
@@ -0,0 +1,575 @@
|
|
1
|
+
require "minitest/autorun"
|
2
|
+
require "html_tokenizer"
|
3
|
+
|
4
|
+
# Unit tests for HtmlTokenizer::Parser.
#
# Every test feeds one or more chunks of HTML through the private #parse
# helper, which lazily creates a single parser per test and reuses it, so
# consecutive #parse calls within a test continue the same document.
class HtmlTokenizer::ParserTest < Minitest::Test
  def test_empty_context
    parse
    assert_equal :none, @parser.context
  end

  def test_open_tag
    parse("<div")
    assert_equal :tag_name, @parser.context
    assert_equal "div", @parser.tag_name
    assert_equal false, @parser.closing_tag?
  end

  def test_open_attribute_value
    parse('<div "foo')
    assert_equal :quoted_value, @parser.context
    assert_equal 'foo', @parser.attribute_value
    assert_equal '"', @parser.quote_character
    parse('bar"')
    assert_equal :space_after_attribute, @parser.context
    assert_equal 'foobar', @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal '"', @parser.quote_character
  end

  def test_multi_part_namespace_tag
    parse("<foo:")
    assert_equal "foo:", @parser.tag_name
    parse("bar")
    assert_equal "foo:bar", @parser.tag_name
  end

  def test_solidus_after_tag_name
    parse("<foo/")
    assert_equal "foo", @parser.tag_name
    assert_equal :tag_end, @parser.context
  end

  def test_whitespace_after_tag_name
    parse("<foo ")
    assert_equal "foo", @parser.tag_name
    assert_equal :tag, @parser.context
  end

  def test_context_is_tag_name_just_after_solidus
    parse("</")
    assert_equal :tag_name, @parser.context
    assert_equal true, @parser.closing_tag?
  end

  def test_close_tag
    parse("<div", ">")
    assert_equal :none, @parser.context
  end

  def test_attribute_name
    parse("<div foo")
    assert_equal "div", @parser.tag_name
    assert_equal :attribute_name, @parser.context
    assert_equal "foo", @parser.attribute_name
    parse("bla")
    assert_equal "foobla", @parser.attribute_name
  end

  def test_attribute_name_and_close
    parse("<div foo>")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :none, @parser.context
  end

  def test_attribute_solidus_close
    parse("<div foo/>")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal true, @parser.self_closing_tag?
  end

  def test_attribute_value_solidus_close
    parse("<div 'foo'/>")
    assert_equal "div", @parser.tag_name
    assert_nil @parser.attribute_name
    assert_equal "foo", @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal true, @parser.self_closing_tag?
  end

  def test_attribute_value_and_tag_close
    parse('<div "foo">')
    assert_equal "div", @parser.tag_name
    assert_nil @parser.attribute_name
    assert_equal 'foo', @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal '"', @parser.quote_character
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal false, @parser.self_closing_tag?
  end

  def test_attribute_value_equal_and_tag_close
    parse("<div foo=>")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal false, @parser.self_closing_tag?
  end

  def test_attribute_value_open_quote
    parse("<div '")
    assert_nil @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal "'", @parser.quote_character
    assert_equal :quoted_value, @parser.context
  end

  def test_attribute_name_and_value_open_quote
    parse("<div foo='")
    assert_nil @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal "'", @parser.quote_character
    assert_equal :quoted_value, @parser.context
  end

  def test_attribute_value_open
    parse("<div foo=")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :after_equal, @parser.context
  end

  def test_attribute_name_with_solidus
    parse("<div foo=/")
    assert_equal "foo", @parser.attribute_name
    assert_equal "/", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_solidus_anywhere_doesnt_affect_closing_flags
    parse("<div / >")
    assert_equal "div", @parser.tag_name
    assert_equal false, @parser.closing_tag?
    assert_equal false, @parser.self_closing_tag?
  end

  def test_solidus_at_beginning_and_end_affect_closing_flags
    parse("</div/>")
    assert_equal "div", @parser.tag_name
    assert_equal true, @parser.closing_tag?
    assert_equal true, @parser.self_closing_tag?
  end

  def test_attribute_name_with_solidus_and_name
    parse("<div foo=/bar")
    assert_equal "foo", @parser.attribute_name
    assert_equal "/bar", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_with_value_with_solidus
    parse("<div foo='bar'")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar", @parser.attribute_value
    assert_equal :space_after_attribute, @parser.context
    parse("/baz")
    assert_equal "baz", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :attribute_name, @parser.context
  end

  def test_attribute_with_unquoted_value
    parse("<div foo=bar")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_with_unquoted_value_tag_end
    parse("<div foo=bar>")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :none, @parser.context
  end

  def test_attribute_with_unquoted_value_with_solidus
    parse("<div foo=ba", "r", "/baz")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar/baz", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_with_unquoted_value_with_space
    parse("<div foo=ba", "r", " baz")
    assert_equal "baz", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :attribute_name, @parser.context
  end

  def test_attribute_with_multipart_unquoted_value
    parse("<div foo=ba", "r", "&baz")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar&baz", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_name_incomplete
    parse("<div foo")
    assert_equal "foo", @parser.attribute_name
    assert_equal :attribute_name, @parser.context
  end

  def test_space_after_attribute_name_switches_context
    parse("<div foo ")
    assert_equal "foo", @parser.attribute_name
    assert_equal :after_attribute_name, @parser.context
  end

  def test_solidus_after_attribute_name_switches_context
    parse("<div foo/")
    assert_equal "foo", @parser.attribute_name
    assert_equal :tag_end, @parser.context
  end

  def test_attribute_name_is_complete_after_equal
    parse("<div foo=")
    assert_equal "foo", @parser.attribute_name
    assert_equal :after_equal, @parser.context
  end

  def test_attribute_name_without_value
    parse("<div foo ")
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :after_attribute_name, @parser.context
  end

  def test_attribute_name_are_separated_by_space
    parse("<div foo bar")
    assert_equal "bar", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :attribute_name, @parser.context
  end

  def test_comment_context
    parse("<!--")
    assert_equal :comment, @parser.context
    assert_nil @parser.comment_text
  end

  def test_cdata_context
    parse("<![CDATA[")
    assert_equal :cdata, @parser.context
    assert_nil @parser.cdata_text
  end

  def test_comment_text
    parse("<!-- foo")
    assert_equal :comment, @parser.context
    assert_equal " foo", @parser.comment_text
  end

  def test_cdata_text
    parse("<![CDATA[ foo")
    assert_equal :cdata, @parser.context
    assert_equal " foo", @parser.cdata_text
  end

  def test_multipart_comment
    parse("<!-- f", "oo", "bar")
    assert_equal :comment, @parser.context
    assert_equal " foobar", @parser.comment_text
  end

  def test_multipart_cdata
    parse("<![CDATA[ f", "oo", "bar")
    assert_equal :cdata, @parser.context
    assert_equal " foobar", @parser.cdata_text
  end

  def test_comment_end
    parse("<!-- foo -->")
    assert_equal :none, @parser.context
    assert_equal " foo ", @parser.comment_text
  end

  def test_cdata_end
    parse("<![CDATA[ foo ]]>")
    assert_equal :none, @parser.context
    assert_equal " foo ", @parser.cdata_text
  end

  # Unlike the other raw-text elements below, <plaintext> is expected to stay
  # in :rawtext even after its own closing tag.
  def test_plaintext_never_stops_parsing
    parse("<plaintext>")
    assert_equal :rawtext, @parser.context
    assert_equal "plaintext", @parser.tag_name
    assert_nil @parser.rawtext_text

    parse("some", "<text")
    assert_equal :rawtext, @parser.context
    assert_equal "some<text", @parser.rawtext_text

    parse("<plaintext")
    assert_equal :rawtext, @parser.context
    assert_equal "some<text<plaintext", @parser.rawtext_text

    parse("</plaintext>")
    assert_equal :rawtext, @parser.context
    assert_equal "some<text<plaintext</plaintext>", @parser.rawtext_text
  end

  # One generated test per raw-text element: only the matching closing tag
  # leaves :rawtext; any other markup is accumulated as raw text.
  %w[title textarea style xmp iframe noembed noframes].each do |name|
    define_method "test_#{name}_rawtext" do
      parse("<#{name}>")
      assert_equal :rawtext, @parser.context
      assert_equal name, @parser.tag_name
      assert_nil @parser.rawtext_text

      parse("some", "<text")
      assert_equal :rawtext, @parser.context
      assert_equal "some<text", @parser.rawtext_text

      parse("<#{name}")
      assert_equal :rawtext, @parser.context
      assert_equal "some<text<#{name}", @parser.rawtext_text

      parse("</#{name}")
      assert_equal :tag_name, @parser.context
      assert_equal "some<text<#{name}", @parser.rawtext_text

      parse(">")
      assert_equal :none, @parser.context
      assert_equal "some<text<#{name}", @parser.rawtext_text
    end
  end

  def test_script_rawtext
    parse("<script>data data data")
    assert_equal :rawtext, @parser.context
    assert_equal "script", @parser.tag_name
    assert_equal "data data data", @parser.rawtext_text
    parse("</script")
    assert_equal :tag_name, @parser.context
    assert_equal "script", @parser.tag_name
    parse(">")
    assert_equal :none, @parser.context
  end

  def test_consecutive_scripts
    parse("<script>foo\n</script>\n<script>bar</script>\n bla")
    assert_equal :none, @parser.context
  end

  def test_end_of_script_regression
    html = "<script><!</script>"
    parse(html)
    assert_equal :none, @parser.context
  end

  def test_document_length
    @parser = HtmlTokenizer::Parser.new
    assert_equal 0, @parser.document_length
    parse("abcdef")
    assert_equal 6, @parser.document_length
    parse("abcdef")
    assert_equal 12, @parser.document_length
  end

  def test_document_method
    @parser = HtmlTokenizer::Parser.new
    assert_nil @parser.document
    parse("abcdef")
    assert_equal "abcdef", @parser.document
    parse("abcdef")
    assert_equal "abcdefabcdef", @parser.document
  end

  # Each yielded token is [type, start, end, line, column].
  def test_yields_raw_tokens_when_block_given
    tokens = []
    parse("<foo>") do |*token|
      tokens << token
    end
    assert_equal [[:tag_start, 0, 1, 1, 0], [:tag_name, 1, 4, 1, 1], [:tag_end, 4, 5, 1, 4]], tokens
  end

  def test_yields_line_and_column_numbers
    tokens = []
    parse("<\n>") do |*token|
      tokens << token
    end
    assert_equal [[:tag_start, 0, 1, 1, 0], [:whitespace, 1, 2, 1, 1], [:tag_end, 2, 3, 2, 0]], tokens
  end

  def test_append_placeholder_adjusts_line_and_column_numbers_but_does_not_parse
    @parser = HtmlTokenizer::Parser.new
    tokens = []
    @parser.parse("foo\n") do |*token|
      tokens << token
    end
    # The placeholder must be exactly 30 characters long: the next token is
    # expected to start at offset 34, right after the 4-character "foo\n".
    # That requires the two-space indent before "foo"; with a single space the
    # literal is only 29 characters and the offsets below cannot match.
    @parser.append_placeholder("<%= some ruby do\n  foo\nend %>\n") do |*token|
      tokens << token
    end
    @parser.parse("bar\n") do |*token|
      tokens << token
    end
    assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
    assert_equal "bar\n", @parser.extract(34, 38)
  end

  def test_extract_method
    parse("abcdefg")
    assert_equal "a", @parser.extract(0, 1)
    assert_equal "cd", @parser.extract(2, 4)
  end

  def test_extract_method_raises_argument_error_end_past_length
    parse("abcdefg")
    e = assert_raises(ArgumentError) do
      @parser.extract(0, 32)
    end
    assert_equal "'end' argument not in range of document", e.message
  end

  def test_extract_method_raises_argument_error_end_less_than_start
    parse("abcdefg")
    e = assert_raises(ArgumentError) do
      @parser.extract(1, 0)
    end
    assert_equal "'end' must be greater or equal than 'start'", e.message
  end

  def test_solidus_or_tag_name_error
    parse('<>')
    assert_equal 1, @parser.errors_count
    assert_equal "expected '/' or tag name", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 1, @parser.errors.first.column
  end

  def test_solidus_or_tag_name_error_2
    parse('< ')
    assert_equal 1, @parser.errors_count
    assert_equal "expected '/' or tag name", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 1, @parser.errors.first.column
  end

  def test_tag_error
    parse('<foo =')
    assert_equal 1, @parser.errors_count
    assert_equal "expected whitespace, '>', attribute name or value", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 5, @parser.errors.first.column
  end

  def test_tag_end_error
    parse('<foo /x')
    assert_equal 1, @parser.errors_count
    assert_equal "expected '>' after '/'", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 6, @parser.errors.first.column
  end

  def test_tag_end_error_2
    parse('<foo / ')
    assert_equal 1, @parser.errors_count
    assert_equal "expected '>' after '/'", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 6, @parser.errors.first.column
  end

  def test_attribute_name_error
    parse('<foo bar~')
    assert_equal 2, @parser.errors_count
    assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 8, @parser.errors.first.column
    # NOTE(review): errors[0] is the same element as errors.first, so these
    # three assertions repeat the ones above and the second of the two errors
    # is never inspected. Presumably errors[1] was intended -- confirm its
    # expected message and column before changing.
    assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors[0].to_s
    assert_equal 1, @parser.errors[0].line
    assert_equal 8, @parser.errors[0].column
  end

  def test_attribute_whitespace_or_equal_error
    parse('<foo bar ~')
    assert_equal 2, @parser.errors_count
    assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 9, @parser.errors.first.column
    # NOTE(review): errors[0] duplicates errors.first; the second error is
    # never checked. Presumably errors[1] was intended -- confirm.
    assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors[0].to_s
    assert_equal 1, @parser.errors[0].line
    assert_equal 9, @parser.errors[0].column
  end

  def test_attribute_whitespace_or_equal_error_2
    parse('<foo bar = >')
    assert_equal 1, @parser.errors_count
    assert_equal "expected attribute value after '='", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 11, @parser.errors.first.column
  end

  def test_attribute_after_quoted_value
    parse('<foo bar=""x')
    assert_equal 1, @parser.errors_count
    assert_equal "expected space after attribute value", @parser.errors.first.to_s
    assert_equal 1, @parser.errors.first.line
    assert_equal 11, @parser.errors.first.column
  end

  # All snippets are fed to the same parser back to back and must not record
  # a single error.
  def test_valid_syntaxes
    parse(
      '<div>',
      '<div />',
      '<div/>',
      '<div data-thing>',
      '<div data-thing />',
      '<div data-thing/>',
      '<div "value">',
      '<div "value" />',
      '<div "value"/>',
      '<div data-thing = "value">',
      '<div data-thing="value">',
      '<div data-thing="value"/>',
      '<div data-thing data-other-thing="value">',
      '<div data-thing data-other-thing="value"/>',
      "<div \n\t\r data-thing \n\t\r data-other-thing='value'>",
      '<div data-thing "value">',
      '<div data-thing "value"/>',
      '<div data-thing "value" />',
      '<div "value" data-thing>',
      '<div "value" data-thing/>',
      '<div foo=unquoted=bla/>',
      '<div foo=unquoted=bla />',
      '<div foo=unquoted=bla>',
      '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    )
    assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
  end

  private

  # Feeds each part, in order, to the test's parser, creating the parser on
  # first use. With no parts it only ensures @parser exists. An optional block
  # is forwarded to every Parser#parse call.
  def parse(*parts, &block)
    @parser ||= HtmlTokenizer::Parser.new
    parts.each do |part|
      @parser.parse(part, &block)
    end
  end
end
|