html_tokenizer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.autotest +3 -0
- data/.gitignore +35 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +24 -0
- data/LICENSE +21 -0
- data/Manifest.txt +8 -0
- data/README.md +2 -0
- data/Rakefile +20 -0
- data/bin/html_tokenizer +3 -0
- data/ext/html_tokenizer_ext/extconf.rb +6 -0
- data/ext/html_tokenizer_ext/html_tokenizer.c +12 -0
- data/ext/html_tokenizer_ext/html_tokenizer.h +7 -0
- data/ext/html_tokenizer_ext/parser.c +767 -0
- data/ext/html_tokenizer_ext/parser.h +87 -0
- data/ext/html_tokenizer_ext/tokenizer.c +682 -0
- data/ext/html_tokenizer_ext/tokenizer.h +74 -0
- data/html_tokenizer.gemspec +19 -0
- data/lib/html_tokenizer.rb +12 -0
- data/test/unit/parser_test.rb +575 -0
- data/test/unit/tokenizer_test.rb +337 -0
- metadata +109 -0
@@ -0,0 +1,74 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
/*
 * Contexts the tokenizer can be in. Values of this type are stored in
 * tokenizer_t.context[]. The element names in the comments below are the
 * ones the original header associated with each content-model context.
 */
enum tokenizer_context
{
  TOKENIZER_NONE                = 0,
  TOKENIZER_HTML                = 1,
  TOKENIZER_OPEN_TAG            = 2,
  TOKENIZER_SOLIDUS_OR_TAG_NAME = 3,
  TOKENIZER_TAG_NAME            = 4,
  TOKENIZER_CDATA               = 5,
  TOKENIZER_RCDATA              = 6,  /* title, textarea */
  TOKENIZER_RAWTEXT             = 7,  /* style, xmp, iframe, noembed, noframes */
  TOKENIZER_SCRIPT_DATA         = 8,  /* script */
  TOKENIZER_PLAINTEXT           = 9,  /* plaintext */
  TOKENIZER_COMMENT             = 10,
  TOKENIZER_ATTRIBUTE_NAME      = 11,
  TOKENIZER_ATTRIBUTE_VALUE     = 12,
  TOKENIZER_ATTRIBUTE_UNQUOTED  = 13,
  TOKENIZER_ATTRIBUTE_QUOTED    = 14
};
|
20
|
+
|
21
|
+
/*
 * Kinds of token. A value of this type is passed as the `type` argument of
 * the tokenizer callback (tokenizer_t.f_callback), recorded in
 * tokenizer_t.last_token, and accepted by token_type_to_symbol().
 */
enum token_type
{
  TOKEN_NONE                         = 0,
  TOKEN_TEXT                         = 1,
  TOKEN_WHITESPACE                   = 2,
  TOKEN_COMMENT_START                = 3,
  TOKEN_COMMENT_END                  = 4,
  TOKEN_TAG_START                    = 5,
  TOKEN_TAG_NAME                     = 6,
  TOKEN_TAG_END                      = 7,
  TOKEN_ATTRIBUTE_NAME               = 8,
  TOKEN_ATTRIBUTE_QUOTED_VALUE_START = 9,
  TOKEN_ATTRIBUTE_QUOTED_VALUE       = 10,
  TOKEN_ATTRIBUTE_QUOTED_VALUE_END   = 11,
  TOKEN_ATTRIBUTE_UNQUOTED_VALUE     = 12,
  TOKEN_CDATA_START                  = 13,
  TOKEN_CDATA_END                    = 14,
  TOKEN_SOLIDUS                      = 15,
  TOKEN_EQUAL                        = 16,
  TOKEN_MALFORMED                    = 17
};
|
41
|
+
|
42
|
+
/*
 * A character buffer together with a position and a length.
 * NOTE(review): presumably `cursor` is the current offset into `string` and
 * `length` is the number of bytes available -- confirm against tokenizer.c.
 */
struct scan_t
{
  char          *string;
  unsigned long  cursor;
  unsigned long  length;
};
|
47
|
+
|
48
|
+
struct tokenizer_t
|
49
|
+
{
|
50
|
+
enum tokenizer_context context[1000];
|
51
|
+
uint32_t current_context;
|
52
|
+
|
53
|
+
void *callback_data;
|
54
|
+
void (*f_callback)(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data);
|
55
|
+
|
56
|
+
char attribute_value_start;
|
57
|
+
int found_attribute;
|
58
|
+
|
59
|
+
char *current_tag;
|
60
|
+
|
61
|
+
int is_closing_tag;
|
62
|
+
enum token_type last_token;
|
63
|
+
|
64
|
+
struct scan_t scan;
|
65
|
+
};
|
66
|
+
|
67
|
+
|
68
|
+
void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
|
69
|
+
void tokenizer_init(struct tokenizer_t *tk);
|
70
|
+
void tokenizer_scan_all(struct tokenizer_t *tk);
|
71
|
+
VALUE token_type_to_symbol(enum token_type type);
|
72
|
+
|
73
|
+
extern const rb_data_type_t ht_tokenizer_data_type;
|
74
|
+
#define Tokenizer_Get_Struct(obj, sval) TypedData_Get_Struct(obj, struct tokenizer_t, &ht_tokenizer_data_type, sval)
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Gem specification for html_tokenizer: a C extension
# (ext/html_tokenizer_ext) plus a small Ruby wrapper in lib/.
Gem::Specification.new do |spec|
  spec.name    = "html_tokenizer"
  spec.version = "0.0.1"
  spec.summary = "HTML Tokenizer"
  spec.author  = "Francois Chagnon"

  spec.extensions = ['ext/html_tokenizer_ext/extconf.rb']

  # Package every file tracked by git. An earlier Dir.glob-based assignment to
  # spec.files was unconditionally overwritten by this one, so it was dead code
  # and has been removed; the packaged file list is unchanged.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib", "ext"]

  # NOTE(review): '~> 0' means ">= 0, < 1", which pins rake and rake-compiler
  # to their 0.x series. Confirm that is intended before loosening it.
  spec.add_development_dependency 'rake', '~> 0'
  spec.add_development_dependency 'rake-compiler', '~> 0'
  # The test suite subclasses Minitest::Test, which only exists from
  # minitest 5.0 onwards, so '~> 0' could never select a usable version.
  spec.add_development_dependency 'minitest', '~> 5.0'
end
|
@@ -0,0 +1,575 @@
|
|
1
|
+
require "minitest/autorun"
|
2
|
+
require "html_tokenizer"
|
3
|
+
|
4
|
+
# Unit tests for HtmlTokenizer::Parser.
#
# Each test feeds one or more chunks to a parser through the private #parse
# helper, then inspects the parser's context, accumulated tag/attribute/text
# fields, and recorded errors. Passing several chunks exercises incremental
# parsing across chunk boundaries.
class HtmlTokenizer::ParserTest < Minitest::Test
  def test_empty_context
    parse
    assert_equal :none, @parser.context
  end

  def test_open_tag
    parse("<div")
    assert_equal :tag_name, @parser.context
    assert_equal "div", @parser.tag_name
    assert_equal false, @parser.closing_tag?
  end

  def test_open_attribute_value
    parse('<div "foo')
    assert_equal :quoted_value, @parser.context
    assert_equal 'foo', @parser.attribute_value
    assert_equal '"', @parser.quote_character
    parse('bar"')
    assert_equal :space_after_attribute, @parser.context
    assert_equal 'foobar', @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal '"', @parser.quote_character
  end

  def test_multi_part_namespace_tag
    parse("<foo:")
    assert_equal "foo:", @parser.tag_name
    parse("bar")
    assert_equal "foo:bar", @parser.tag_name
  end

  def test_solidus_after_tag_name
    parse("<foo/")
    assert_equal "foo", @parser.tag_name
    assert_equal :tag_end, @parser.context
  end

  def test_whitespace_after_tag_name
    parse("<foo ")
    assert_equal "foo", @parser.tag_name
    assert_equal :tag, @parser.context
  end

  def test_context_is_tag_name_just_after_solidus
    parse("</")
    assert_equal :tag_name, @parser.context
    assert_equal true, @parser.closing_tag?
  end

  def test_close_tag
    parse("<div", ">")
    assert_equal :none, @parser.context
  end

  def test_attribute_name
    parse("<div foo")
    assert_equal "div", @parser.tag_name
    assert_equal :attribute_name, @parser.context
    assert_equal "foo", @parser.attribute_name
    parse("bla")
    assert_equal "foobla", @parser.attribute_name
  end

  def test_attribute_name_and_close
    parse("<div foo>")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :none, @parser.context
  end

  def test_attribute_solidus_close
    parse("<div foo/>")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal true, @parser.self_closing_tag?
  end

  def test_attribute_value_solidus_close
    parse("<div 'foo'/>")
    assert_equal "div", @parser.tag_name
    assert_nil @parser.attribute_name
    assert_equal "foo", @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal true, @parser.self_closing_tag?
  end

  def test_attribute_value_and_tag_close
    parse('<div "foo">')
    assert_equal "div", @parser.tag_name
    assert_nil @parser.attribute_name
    assert_equal 'foo', @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal '"', @parser.quote_character
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal false, @parser.self_closing_tag?
  end

  def test_attribute_value_equal_and_tag_close
    parse("<div foo=>")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :none, @parser.context
    assert_equal false, @parser.closing_tag?
    assert_equal false, @parser.self_closing_tag?
  end

  def test_attribute_value_open_quote
    parse("<div '")
    assert_nil @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal "'", @parser.quote_character
    assert_equal :quoted_value, @parser.context
  end

  def test_attribute_name_and_value_open_quote
    parse("<div foo='")
    assert_nil @parser.attribute_value
    assert_equal true, @parser.attribute_quoted?
    assert_equal "'", @parser.quote_character
    assert_equal :quoted_value, @parser.context
  end

  def test_attribute_value_open
    parse("<div foo=")
    assert_equal "div", @parser.tag_name
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :after_equal, @parser.context
  end

  def test_attribute_name_with_solidus
    parse("<div foo=/")
    assert_equal "foo", @parser.attribute_name
    assert_equal "/", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_solidus_anywhere_doesnt_affect_closing_flags
    parse("<div / >")
    assert_equal "div", @parser.tag_name
    assert_equal false, @parser.closing_tag?
    assert_equal false, @parser.self_closing_tag?
  end

  def test_solidus_at_beginning_and_end_affect_closing_flags
    parse("</div/>")
    assert_equal "div", @parser.tag_name
    assert_equal true, @parser.closing_tag?
    assert_equal true, @parser.self_closing_tag?
  end

  def test_attribute_name_with_solidus_and_name
    parse("<div foo=/bar")
    assert_equal "foo", @parser.attribute_name
    assert_equal "/bar", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_with_value_with_solidus
    parse("<div foo='bar'")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar", @parser.attribute_value
    assert_equal :space_after_attribute, @parser.context
    parse("/baz")
    assert_equal "baz", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :attribute_name, @parser.context
  end

  def test_attribute_with_unquoted_value
    parse("<div foo=bar")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_with_unquoted_value_tag_end
    parse("<div foo=bar>")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :none, @parser.context
  end

  def test_attribute_with_unquoted_value_with_solidus
    parse("<div foo=ba", "r", "/baz")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar/baz", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_with_unquoted_value_with_space
    parse("<div foo=ba", "r", " baz")
    assert_equal "baz", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :attribute_name, @parser.context
  end

  def test_attribute_with_multipart_unquoted_value
    parse("<div foo=ba", "r", "&baz")
    assert_equal "foo", @parser.attribute_name
    assert_equal "bar&baz", @parser.attribute_value
    assert_equal false, @parser.attribute_quoted?
    assert_nil @parser.quote_character
    assert_equal :unquoted_value, @parser.context
  end

  def test_attribute_name_incomplete
    parse("<div foo")
    assert_equal "foo", @parser.attribute_name
    assert_equal :attribute_name, @parser.context
  end

  def test_space_after_attribute_name_switches_context
    parse("<div foo ")
    assert_equal "foo", @parser.attribute_name
    assert_equal :after_attribute_name, @parser.context
  end

  def test_solidus_after_attribute_name_switches_context
    parse("<div foo/")
    assert_equal "foo", @parser.attribute_name
    assert_equal :tag_end, @parser.context
  end

  def test_attribute_name_is_complete_after_equal
    parse("<div foo=")
    assert_equal "foo", @parser.attribute_name
    assert_equal :after_equal, @parser.context
  end

  def test_attribute_name_without_value
    parse("<div foo ")
    assert_equal "foo", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :after_attribute_name, @parser.context
  end

  def test_attribute_name_are_separated_by_space
    parse("<div foo bar")
    assert_equal "bar", @parser.attribute_name
    assert_nil @parser.attribute_value
    assert_equal :attribute_name, @parser.context
  end

  def test_comment_context
    parse("<!--")
    assert_equal :comment, @parser.context
    assert_nil @parser.comment_text
  end

  def test_cdata_context
    parse("<![CDATA[")
    assert_equal :cdata, @parser.context
    assert_nil @parser.cdata_text
  end

  def test_comment_text
    parse("<!-- foo")
    assert_equal :comment, @parser.context
    assert_equal " foo", @parser.comment_text
  end

  def test_cdata_text
    parse("<![CDATA[ foo")
    assert_equal :cdata, @parser.context
    assert_equal " foo", @parser.cdata_text
  end

  def test_multipart_comment
    parse("<!-- f", "oo", "bar")
    assert_equal :comment, @parser.context
    assert_equal " foobar", @parser.comment_text
  end

  def test_multipart_cdata
    parse("<![CDATA[ f", "oo", "bar")
    assert_equal :cdata, @parser.context
    assert_equal " foobar", @parser.cdata_text
  end

  def test_comment_end
    parse("<!-- foo -->")
    assert_equal :none, @parser.context
    assert_equal " foo ", @parser.comment_text
  end

  def test_cdata_end
    parse("<![CDATA[ foo ]]>")
    assert_equal :none, @parser.context
    assert_equal " foo ", @parser.cdata_text
  end

  def test_plaintext_never_stops_parsing
    parse("<plaintext>")
    assert_equal :rawtext, @parser.context
    assert_equal "plaintext", @parser.tag_name
    assert_nil @parser.rawtext_text

    parse("some", "<text")
    assert_equal :rawtext, @parser.context
    assert_equal "some<text", @parser.rawtext_text

    parse("<plaintext")
    assert_equal :rawtext, @parser.context
    assert_equal "some<text<plaintext", @parser.rawtext_text

    # Even the matching close tag is swallowed as raw text.
    parse("</plaintext>")
    assert_equal :rawtext, @parser.context
    assert_equal "some<text<plaintext</plaintext>", @parser.rawtext_text
  end

  # One generated test per element name: content stays raw text until the
  # matching close tag for that same element is seen.
  %w[title textarea style xmp iframe noembed noframes].each do |name|
    define_method "test_#{name}_rawtext" do
      parse("<#{name}>")
      assert_equal :rawtext, @parser.context
      assert_equal name, @parser.tag_name
      assert_nil @parser.rawtext_text

      parse("some", "<text")
      assert_equal :rawtext, @parser.context
      assert_equal "some<text", @parser.rawtext_text

      parse("<#{name}")
      assert_equal :rawtext, @parser.context
      assert_equal "some<text<#{name}", @parser.rawtext_text

      parse("</#{name}")
      assert_equal :tag_name, @parser.context
      assert_equal "some<text<#{name}", @parser.rawtext_text

      parse(">")
      assert_equal :none, @parser.context
      assert_equal "some<text<#{name}", @parser.rawtext_text
    end
  end

  def test_script_rawtext
    parse("<script>data data data")
    assert_equal :rawtext, @parser.context
    assert_equal "script", @parser.tag_name
    assert_equal "data data data", @parser.rawtext_text
    parse("</script")
    assert_equal :tag_name, @parser.context
    assert_equal "script", @parser.tag_name
    parse(">")
    assert_equal :none, @parser.context
  end

  def test_consecutive_scripts
    parse("<script>foo\n</script>\n<script>bar</script>\n bla")
    assert_equal :none, @parser.context
  end

  def test_end_of_script_regression
    html = "<script><!</script>"
    parse(html)
    assert_equal :none, @parser.context
  end

  def test_document_length
    @parser = HtmlTokenizer::Parser.new
    assert_equal 0, @parser.document_length
    parse("abcdef")
    assert_equal 6, @parser.document_length
    parse("abcdef")
    assert_equal 12, @parser.document_length
  end

  def test_document_method
    @parser = HtmlTokenizer::Parser.new
    assert_nil @parser.document
    parse("abcdef")
    assert_equal "abcdef", @parser.document
    parse("abcdef")
    assert_equal "abcdefabcdef", @parser.document
  end

  def test_yields_raw_tokens_when_block_given
    tokens = []
    parse("<foo>") do |*token|
      tokens << token
    end
    assert_equal [[:tag_start, 0, 1, 1, 0], [:tag_name, 1, 4, 1, 1], [:tag_end, 4, 5, 1, 4]], tokens
  end

  def test_yields_line_and_column_numbers
    tokens = []
    parse("<\n>") do |*token|
      tokens << token
    end
    assert_equal [[:tag_start, 0, 1, 1, 0], [:whitespace, 1, 2, 1, 1], [:tag_end, 2, 3, 2, 0]], tokens
  end

  def test_append_placeholder_adjusts_line_and_column_numbers_but_does_not_parse
    @parser = HtmlTokenizer::Parser.new
    tokens = []
    @parser.parse("foo\n") do |*token|
      tokens << token
    end
    # The placeholder must be exactly 30 characters and span three lines so
    # that the following text starts at offset 34 on line 5, as asserted below.
    @parser.append_placeholder("<%= some ruby do\n  foo\nend %>\n") do |*token|
      tokens << token
    end
    @parser.parse("bar\n") do |*token|
      tokens << token
    end
    assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
    assert_equal "bar\n", @parser.extract(34, 38)
  end

  def test_extract_method
    parse("abcdefg")
    assert_equal "a", @parser.extract(0, 1)
    assert_equal "cd", @parser.extract(2, 4)
  end

  def test_extract_method_raises_argument_error_end_past_length
    parse("abcdefg")
    e = assert_raises(ArgumentError) do
      @parser.extract(0, 32)
    end
    assert_equal "'end' argument not in range of document", e.message
  end

  def test_extract_method_raises_argument_error_end_less_than_start
    parse("abcdefg")
    e = assert_raises(ArgumentError) do
      @parser.extract(1, 0)
    end
    assert_equal "'end' must be greater or equal than 'start'", e.message
  end

  def test_solidus_or_tag_name_error
    parse('<>')
    assert_equal 1, @parser.errors_count
    assert_first_error "expected '/' or tag name", line: 1, column: 1
  end

  def test_solidus_or_tag_name_error_2
    parse('< ')
    assert_equal 1, @parser.errors_count
    assert_first_error "expected '/' or tag name", line: 1, column: 1
  end

  def test_tag_error
    parse('<foo =')
    assert_equal 1, @parser.errors_count
    assert_first_error "expected whitespace, '>', attribute name or value", line: 1, column: 5
  end

  def test_tag_end_error
    parse('<foo /x')
    assert_equal 1, @parser.errors_count
    assert_first_error "expected '>' after '/'", line: 1, column: 6
  end

  def test_tag_end_error_2
    parse('<foo / ')
    assert_equal 1, @parser.errors_count
    assert_first_error "expected '>' after '/'", line: 1, column: 6
  end

  def test_attribute_name_error
    parse('<foo bar~')
    assert_equal 2, @parser.errors_count
    assert_first_error "expected whitespace, '>' or '=' after attribute name", line: 1, column: 8
    # TODO: two errors are reported but only the first is verified. The
    # original repeated the assertions above against errors[0] (the same
    # element); it should assert on errors[1] once its expected message,
    # line and column are confirmed.
  end

  def test_attribute_whitespace_or_equal_error
    parse('<foo bar ~')
    assert_equal 2, @parser.errors_count
    assert_first_error "expected '/', '>', \", ' or '=' after attribute name", line: 1, column: 9
    # TODO: as in test_attribute_name_error, errors[1] is never checked.
  end

  def test_attribute_whitespace_or_equal_error_2
    parse('<foo bar = >')
    assert_equal 1, @parser.errors_count
    assert_first_error "expected attribute value after '='", line: 1, column: 11
  end

  def test_attribute_after_quoted_value
    parse('<foo bar=""x')
    assert_equal 1, @parser.errors_count
    assert_first_error "expected space after attribute value", line: 1, column: 11
  end

  def test_valid_syntaxes
    parse(
      '<div>',
      '<div />',
      '<div/>',
      '<div data-thing>',
      '<div data-thing />',
      '<div data-thing/>',
      '<div "value">',
      '<div "value" />',
      '<div "value"/>',
      '<div data-thing = "value">',
      '<div data-thing="value">',
      '<div data-thing="value"/>',
      '<div data-thing data-other-thing="value">',
      '<div data-thing data-other-thing="value"/>',
      "<div \n\t\r data-thing \n\t\r data-other-thing='value'>",
      '<div data-thing "value">',
      '<div data-thing "value"/>',
      '<div data-thing "value" />',
      '<div "value" data-thing>',
      '<div "value" data-thing/>',
      '<div foo=unquoted=bla/>',
      '<div foo=unquoted=bla />',
      '<div foo=unquoted=bla>',
      '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">',
    )
    assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
  end

  private

  # Feeds each part, in order, to the test's parser, creating the parser on
  # first use. Any block given is forwarded to every Parser#parse call.
  def parse(*parts, &block)
    @parser ||= HtmlTokenizer::Parser.new
    parts.each do |part|
      @parser.parse(part, &block)
    end
  end

  # Asserts the message, line and column of the first recorded parser error.
  def assert_first_error(message, line:, column:)
    error = @parser.errors.first
    assert_equal message, error.to_s
    assert_equal line, error.line
    assert_equal column, error.column
  end
end
|