html_tokenizer 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.autotest +3 -0
- data/.gitignore +35 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +24 -0
- data/LICENSE +21 -0
- data/Manifest.txt +8 -0
- data/README.md +2 -0
- data/Rakefile +20 -0
- data/bin/html_tokenizer +3 -0
- data/ext/html_tokenizer_ext/extconf.rb +6 -0
- data/ext/html_tokenizer_ext/html_tokenizer.c +12 -0
- data/ext/html_tokenizer_ext/html_tokenizer.h +7 -0
- data/ext/html_tokenizer_ext/parser.c +767 -0
- data/ext/html_tokenizer_ext/parser.h +87 -0
- data/ext/html_tokenizer_ext/tokenizer.c +682 -0
- data/ext/html_tokenizer_ext/tokenizer.h +74 -0
- data/html_tokenizer.gemspec +19 -0
- data/lib/html_tokenizer.rb +12 -0
- data/test/unit/parser_test.rb +575 -0
- data/test/unit/tokenizer_test.rb +337 -0
- metadata +109 -0
@@ -0,0 +1,87 @@
|
|
1
|
+
#pragma once
|
2
|
+
#include "tokenizer.h"
|
3
|
+
|
4
|
+
/*
 * Contexts a parser can be in; struct parser_t stores the current one in
 * its `context` member.  Values are assigned in declaration order starting
 * at 0, so reordering the enumerators changes their numeric values.
 */
enum parser_context {
  PARSER_NONE = 0,
  PARSER_SOLIDUS_OR_TAG_NAME,
  PARSER_TAG_NAME,
  PARSER_TAG,
  PARSER_ATTRIBUTE_NAME,
  PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL,
  PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE,
  PARSER_ATTRIBUTE_QUOTED_VALUE,
  PARSER_SPACE_AFTER_ATTRIBUTE,
  PARSER_ATTRIBUTE_UNQUOTED_VALUE,
  PARSER_TAG_END,
  PARSER_COMMENT,
  PARSER_CDATA
};
|
19
|
+
|
20
|
+
/*
 * One entry of the parser's error list (struct parser_t keeps an array of
 * these in `errors`): a message and a line/column pair.
 * NOTE(review): who allocates and frees `message` is not visible in this
 * header -- confirm against parser.c.
 */
struct parser_document_error_t {
  char *message;
  unsigned long line_number;
  unsigned long column_number;
};
|
25
|
+
|
26
|
+
/*
 * Document state held by the parser: a character buffer with its length
 * and a line/column pair.
 * NOTE(review): whether line/column are 0- or 1-based, and who owns `data`,
 * cannot be told from this header -- confirm against parser.c.
 */
struct parser_document_t {
  unsigned long length;
  char *data;
  unsigned long line_number;
  unsigned long column_number;
};
|
32
|
+
|
33
|
+
struct token_reference_t {
|
34
|
+
enum token_type type;
|
35
|
+
long unsigned int start;
|
36
|
+
long unsigned int length;
|
37
|
+
long unsigned int line_number;
|
38
|
+
long unsigned int column_number;
|
39
|
+
};
|
40
|
+
|
41
|
+
struct parser_tag_t {
|
42
|
+
struct token_reference_t name;
|
43
|
+
int self_closing;
|
44
|
+
};
|
45
|
+
|
46
|
+
struct parser_attribute_t {
|
47
|
+
struct token_reference_t name;
|
48
|
+
struct token_reference_t value;
|
49
|
+
int is_quoted;
|
50
|
+
};
|
51
|
+
|
52
|
+
struct parser_rawtext_t {
|
53
|
+
struct token_reference_t text;
|
54
|
+
};
|
55
|
+
|
56
|
+
struct parser_comment_t {
|
57
|
+
struct token_reference_t text;
|
58
|
+
};
|
59
|
+
|
60
|
+
struct parser_cdata_t {
|
61
|
+
struct token_reference_t text;
|
62
|
+
};
|
63
|
+
|
64
|
+
struct parser_t
|
65
|
+
{
|
66
|
+
struct tokenizer_t tk;
|
67
|
+
|
68
|
+
struct parser_document_t doc;
|
69
|
+
|
70
|
+
size_t errors_count;
|
71
|
+
struct parser_document_error_t *errors;
|
72
|
+
|
73
|
+
enum parser_context context;
|
74
|
+
struct parser_tag_t tag;
|
75
|
+
struct parser_attribute_t attribute;
|
76
|
+
struct parser_rawtext_t rawtext;
|
77
|
+
struct parser_comment_t comment;
|
78
|
+
struct parser_cdata_t cdata;
|
79
|
+
};
|
80
|
+
|
81
|
+
void Init_html_tokenizer_parser(VALUE mHtmlTokenizer);
|
82
|
+
|
83
|
+
extern const rb_data_type_t ht_parser_data_type;
|
84
|
+
#define Parser_Get_Struct(obj, sval) TypedData_Get_Struct(obj, struct parser_t, &ht_parser_data_type, sval)
|
85
|
+
|
86
|
+
#define PARSE_AGAIN return 1
|
87
|
+
#define PARSE_DONE return 0
|
@@ -0,0 +1,682 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include "html_tokenizer.h"
|
3
|
+
#include "tokenizer.h"
|
4
|
+
|
5
|
+
/* The Tokenizer class object; assigned in Init_html_tokenizer_tokenizer(). */
static VALUE cTokenizer = Qnil;
|
6
|
+
|
7
|
+
/* GC mark hook registered in ht_tokenizer_data_type; it does nothing. */
static void tokenizer_mark(void *ptr)
{
  (void)ptr;
}
|
9
|
+
|
10
|
+
static void tokenizer_free(void *ptr)
|
11
|
+
{
|
12
|
+
struct tokenizer_t *tk = ptr;
|
13
|
+
if(tk) {
|
14
|
+
if(tk->current_tag) {
|
15
|
+
DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
|
16
|
+
xfree(tk->current_tag);
|
17
|
+
tk->current_tag = NULL;
|
18
|
+
}
|
19
|
+
if(tk->scan.string) {
|
20
|
+
DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
|
21
|
+
xfree(tk->scan.string);
|
22
|
+
tk->scan.string = NULL;
|
23
|
+
}
|
24
|
+
DBG_PRINT("tk=%p xfree(tk)", tk);
|
25
|
+
xfree(tk);
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
static size_t tokenizer_memsize(const void *ptr)
|
30
|
+
{
|
31
|
+
return ptr ? sizeof(struct tokenizer_t) : 0;
|
32
|
+
}
|
33
|
+
|
34
|
+
const rb_data_type_t ht_tokenizer_data_type = {
|
35
|
+
"ht_tokenizer_data_type",
|
36
|
+
{ tokenizer_mark, tokenizer_free, tokenizer_memsize, },
|
37
|
+
#if defined(RUBY_TYPED_FREE_IMMEDIATELY)
|
38
|
+
NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
|
39
|
+
#endif
|
40
|
+
};
|
41
|
+
|
42
|
+
/*
 * Allocation function for the Tokenizer class (registered with
 * rb_define_alloc_func).  Wraps a freshly allocated struct tokenizer_t in a
 * typed-data object of class `klass` and returns that object.
 *
 * The struct is cleared from its first byte for exactly its own size.  The
 * previous code started the memset at `&tokenizer->context` while still
 * clearing sizeof(struct tokenizer_t) bytes, which writes past the end of
 * the allocation unless `context` is the very first member.  It also used
 * TOKENIZER_NONE as the fill byte, which only produces valid values if that
 * constant is 0.  Zero-filling guarantees current_tag and scan.string are
 * NULL, which tokenizer_free() relies on if #initialize never runs.
 */
static VALUE tokenizer_allocate(VALUE klass)
{
  VALUE obj;
  struct tokenizer_t *tokenizer = NULL;

  obj = TypedData_Make_Struct(klass, struct tokenizer_t, &ht_tokenizer_data_type, tokenizer);
  DBG_PRINT("tk=%p allocate", tokenizer);

  memset(tokenizer, 0, sizeof(*tokenizer));

  return obj;
}
|
54
|
+
|
55
|
+
void tokenizer_init(struct tokenizer_t *tk)
|
56
|
+
{
|
57
|
+
tk->current_context = 0;
|
58
|
+
tk->context[0] = TOKENIZER_HTML;
|
59
|
+
|
60
|
+
tk->scan.string = NULL;
|
61
|
+
tk->scan.cursor = 0;
|
62
|
+
tk->scan.length = 0;
|
63
|
+
|
64
|
+
tk->attribute_value_start = 0;
|
65
|
+
tk->found_attribute = 0;
|
66
|
+
tk->current_tag = NULL;
|
67
|
+
tk->is_closing_tag = 0;
|
68
|
+
tk->last_token = TOKEN_NONE;
|
69
|
+
tk->callback_data = NULL;
|
70
|
+
tk->f_callback = NULL;
|
71
|
+
|
72
|
+
return;
|
73
|
+
}
|
74
|
+
|
75
|
+
VALUE token_type_to_symbol(enum token_type type)
|
76
|
+
{
|
77
|
+
switch(type) {
|
78
|
+
case TOKEN_NONE:
|
79
|
+
return ID2SYM(rb_intern("none"));
|
80
|
+
case TOKEN_TEXT:
|
81
|
+
return ID2SYM(rb_intern("text"));
|
82
|
+
case TOKEN_WHITESPACE:
|
83
|
+
return ID2SYM(rb_intern("whitespace"));
|
84
|
+
case TOKEN_COMMENT_START:
|
85
|
+
return ID2SYM(rb_intern("comment_start"));
|
86
|
+
case TOKEN_COMMENT_END:
|
87
|
+
return ID2SYM(rb_intern("comment_end"));
|
88
|
+
case TOKEN_TAG_NAME:
|
89
|
+
return ID2SYM(rb_intern("tag_name"));
|
90
|
+
case TOKEN_TAG_START:
|
91
|
+
return ID2SYM(rb_intern("tag_start"));
|
92
|
+
case TOKEN_TAG_END:
|
93
|
+
return ID2SYM(rb_intern("tag_end"));
|
94
|
+
case TOKEN_ATTRIBUTE_NAME:
|
95
|
+
return ID2SYM(rb_intern("attribute_name"));
|
96
|
+
case TOKEN_ATTRIBUTE_QUOTED_VALUE_START:
|
97
|
+
return ID2SYM(rb_intern("attribute_quoted_value_start"));
|
98
|
+
case TOKEN_ATTRIBUTE_QUOTED_VALUE:
|
99
|
+
return ID2SYM(rb_intern("attribute_quoted_value"));
|
100
|
+
case TOKEN_ATTRIBUTE_QUOTED_VALUE_END:
|
101
|
+
return ID2SYM(rb_intern("attribute_quoted_value_end"));
|
102
|
+
case TOKEN_ATTRIBUTE_UNQUOTED_VALUE:
|
103
|
+
return ID2SYM(rb_intern("attribute_unquoted_value"));
|
104
|
+
case TOKEN_CDATA_START:
|
105
|
+
return ID2SYM(rb_intern("cdata_start"));
|
106
|
+
case TOKEN_CDATA_END:
|
107
|
+
return ID2SYM(rb_intern("cdata_end"));
|
108
|
+
case TOKEN_SOLIDUS:
|
109
|
+
return ID2SYM(rb_intern("solidus"));
|
110
|
+
case TOKEN_EQUAL:
|
111
|
+
return ID2SYM(rb_intern("equal"));
|
112
|
+
case TOKEN_MALFORMED:
|
113
|
+
return ID2SYM(rb_intern("malformed"));
|
114
|
+
}
|
115
|
+
return Qnil;
|
116
|
+
}
|
117
|
+
|
118
|
+
static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
|
119
|
+
{
|
120
|
+
tk->last_token = type;
|
121
|
+
rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.cursor), INT2NUM(tk->scan.cursor + length));
|
122
|
+
}
|
123
|
+
|
124
|
+
static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
|
125
|
+
{
|
126
|
+
if(tk->f_callback)
|
127
|
+
tk->f_callback(tk, type, length, tk->callback_data);
|
128
|
+
tk->scan.cursor += length;
|
129
|
+
}
|
130
|
+
|
131
|
+
/*
 * Tokenizer#initialize.  Resets the wrapped tokenizer to its starting
 * state and installs tokenizer_yield_tag as the token callback, so tokens
 * are yielded to the block passed to #tokenize.  Returns nil.
 */
static VALUE tokenizer_initialize_method(VALUE self)
{
  struct tokenizer_t *tokenizer = NULL;

  Tokenizer_Get_Struct(self, tokenizer);
  DBG_PRINT("tk=%p initialize", tokenizer);

  tokenizer_init(tokenizer);
  tokenizer->f_callback = tokenizer_yield_tag;

  return Qnil;
}
|
143
|
+
|
144
|
+
static inline int eos(struct scan_t *scan)
|
145
|
+
{
|
146
|
+
return scan->cursor >= scan->length;
|
147
|
+
}
|
148
|
+
|
149
|
+
static inline long unsigned int length_remaining(struct scan_t *scan)
|
150
|
+
{
|
151
|
+
return scan->length - scan->cursor;
|
152
|
+
}
|
153
|
+
|
154
|
+
static inline void push_context(struct tokenizer_t *tk, enum tokenizer_context ctx)
|
155
|
+
{
|
156
|
+
tk->context[++tk->current_context] = ctx;
|
157
|
+
}
|
158
|
+
|
159
|
+
static inline void pop_context(struct tokenizer_t *tk)
|
160
|
+
{
|
161
|
+
tk->context[tk->current_context--] = TOKENIZER_NONE;
|
162
|
+
}
|
163
|
+
|
164
|
+
static int is_text(struct scan_t *scan, long unsigned int *length)
|
165
|
+
{
|
166
|
+
long unsigned int i;
|
167
|
+
|
168
|
+
*length = 0;
|
169
|
+
for(i = scan->cursor;i < scan->length; i++, (*length)++) {
|
170
|
+
if(scan->string[i] == '<')
|
171
|
+
break;
|
172
|
+
}
|
173
|
+
return *length != 0;
|
174
|
+
}
|
175
|
+
|
176
|
+
static inline int is_comment_start(struct scan_t *scan)
|
177
|
+
{
|
178
|
+
return (length_remaining(scan) >= 4) &&
|
179
|
+
!strncmp((const char *)&scan->string[scan->cursor], "<!--", 4);
|
180
|
+
}
|
181
|
+
|
182
|
+
static inline int is_doctype(struct scan_t *scan)
|
183
|
+
{
|
184
|
+
return (length_remaining(scan) >= 9) &&
|
185
|
+
!strncasecmp((const char *)&scan->string[scan->cursor], "<!DOCTYPE", 9);
|
186
|
+
}
|
187
|
+
|
188
|
+
static inline int is_cdata_start(struct scan_t *scan)
|
189
|
+
{
|
190
|
+
return (length_remaining(scan) >= 9) &&
|
191
|
+
!strncasecmp((const char *)&scan->string[scan->cursor], "<![CDATA[", 9);
|
192
|
+
}
|
193
|
+
|
194
|
+
static inline int is_char(struct scan_t *scan, const char c)
|
195
|
+
{
|
196
|
+
return (length_remaining(scan) >= 1) && (scan->string[scan->cursor] == c);
|
197
|
+
}
|
198
|
+
|
199
|
+
/* Returns 1 for ASCII letters (either case) and ASCII digits, else 0. */
static inline int is_alnum(const char c)
{
  if(c >= '0' && c <= '9')
    return 1;
  if(c >= 'A' && c <= 'Z')
    return 1;
  return c >= 'a' && c <= 'z';
}
|
203
|
+
|
204
|
+
static int is_tag_start(struct scan_t *scan, long unsigned int *length,
|
205
|
+
int *closing_tag, const char **tag_name, long unsigned int *tag_name_length)
|
206
|
+
{
|
207
|
+
long unsigned int i, start;
|
208
|
+
|
209
|
+
if(scan->string[scan->cursor] != '<')
|
210
|
+
return 0;
|
211
|
+
|
212
|
+
*length = 1;
|
213
|
+
|
214
|
+
if(scan->string[scan->cursor+1] == '/') {
|
215
|
+
*closing_tag = 1;
|
216
|
+
(*length)++;
|
217
|
+
} else {
|
218
|
+
*closing_tag = 0;
|
219
|
+
}
|
220
|
+
|
221
|
+
*tag_name = &scan->string[scan->cursor + (*length)];
|
222
|
+
start = *length;
|
223
|
+
for(i = scan->cursor + (*length);i < scan->length; i++, (*length)++) {
|
224
|
+
if(!is_alnum(scan->string[i]) && scan->string[i] != ':')
|
225
|
+
break;
|
226
|
+
}
|
227
|
+
|
228
|
+
*tag_name_length = *length - start;
|
229
|
+
return 1;
|
230
|
+
}
|
231
|
+
|
232
|
+
static int is_tag_name(struct scan_t *scan, const char **tag_name, unsigned long int *tag_name_length)
|
233
|
+
{
|
234
|
+
long unsigned int i;
|
235
|
+
|
236
|
+
*tag_name_length = 0;
|
237
|
+
*tag_name = &scan->string[scan->cursor];
|
238
|
+
for(i = scan->cursor;i < scan->length; i++, (*tag_name_length)++) {
|
239
|
+
if(scan->string[i] == ' ' || scan->string[i] == '\t' ||
|
240
|
+
scan->string[i] == '\r' || scan->string[i] == '\n' ||
|
241
|
+
scan->string[i] == '>' || scan->string[i] == '/')
|
242
|
+
break;
|
243
|
+
}
|
244
|
+
|
245
|
+
return *tag_name_length != 0;
|
246
|
+
}
|
247
|
+
|
248
|
+
static int is_whitespace(struct scan_t *scan, unsigned long int *length)
|
249
|
+
{
|
250
|
+
long unsigned int i;
|
251
|
+
|
252
|
+
*length = 0;
|
253
|
+
for(i = scan->cursor;i < scan->length; i++, (*length)++) {
|
254
|
+
if(scan->string[i] != ' ' && scan->string[i] != '\t' &&
|
255
|
+
scan->string[i] != '\r' && scan->string[i] != '\n')
|
256
|
+
break;
|
257
|
+
}
|
258
|
+
return *length != 0;
|
259
|
+
}
|
260
|
+
|
261
|
+
static int is_attribute_name(struct scan_t *scan, unsigned long int *length)
|
262
|
+
{
|
263
|
+
long unsigned int i;
|
264
|
+
|
265
|
+
*length = 0;
|
266
|
+
for(i = scan->cursor;i < scan->length; i++, (*length)++) {
|
267
|
+
if(!is_alnum(scan->string[i]) && scan->string[i] != ':' &&
|
268
|
+
scan->string[i] != '-' && scan->string[i] != '_' &&
|
269
|
+
scan->string[i] != '.')
|
270
|
+
break;
|
271
|
+
}
|
272
|
+
return *length != 0;
|
273
|
+
}
|
274
|
+
|
275
|
+
static int is_unquoted_value(struct scan_t *scan, unsigned long int *length)
|
276
|
+
{
|
277
|
+
long unsigned int i;
|
278
|
+
|
279
|
+
*length = 0;
|
280
|
+
for(i = scan->cursor;i < scan->length; i++, (*length)++) {
|
281
|
+
if(scan->string[i] == ' ' || scan->string[i] == '\r' ||
|
282
|
+
scan->string[i] == '\n' || scan->string[i] == '\t' ||
|
283
|
+
scan->string[i] == '>')
|
284
|
+
break;
|
285
|
+
}
|
286
|
+
return *length != 0;
|
287
|
+
}
|
288
|
+
|
289
|
+
static int is_attribute_string(struct scan_t *scan, unsigned long int *length, const char attribute_value_start)
|
290
|
+
{
|
291
|
+
long unsigned int i;
|
292
|
+
|
293
|
+
*length = 0;
|
294
|
+
for(i = scan->cursor;i < scan->length; i++, (*length)++) {
|
295
|
+
if(scan->string[i] == attribute_value_start)
|
296
|
+
break;
|
297
|
+
}
|
298
|
+
return *length != 0;
|
299
|
+
}
|
300
|
+
|
301
|
+
static int is_comment_end(struct scan_t *scan, unsigned long int *length, const char **end)
|
302
|
+
{
|
303
|
+
long unsigned int i;
|
304
|
+
|
305
|
+
*length = 0;
|
306
|
+
for(i = scan->cursor;i < scan->length; i++, (*length)++) {
|
307
|
+
if(i < (scan->length - 2) && scan->string[i] == '-' && scan->string[i+1] == '-' &&
|
308
|
+
scan->string[i+2] == '>') {
|
309
|
+
*end = &scan->string[i];
|
310
|
+
break;
|
311
|
+
}
|
312
|
+
}
|
313
|
+
return *length != 0;
|
314
|
+
}
|
315
|
+
|
316
|
+
static int is_cdata_end(struct scan_t *scan, unsigned long int *length, const char **end)
|
317
|
+
{
|
318
|
+
long unsigned int i;
|
319
|
+
|
320
|
+
*length = 0;
|
321
|
+
for(i = scan->cursor;i < scan->length; i++, (*length)++) {
|
322
|
+
if(i < (scan->length-2) && scan->string[i] == ']' && scan->string[i+1] == ']' &&
|
323
|
+
scan->string[i+2] == '>') {
|
324
|
+
*end = &scan->string[i];
|
325
|
+
break;
|
326
|
+
}
|
327
|
+
}
|
328
|
+
return *length != 0;
|
329
|
+
}
|
330
|
+
|
331
|
+
static int scan_html(struct tokenizer_t *tk)
|
332
|
+
{
|
333
|
+
long unsigned int length = 0;
|
334
|
+
|
335
|
+
if(is_char(&tk->scan, '<')) {
|
336
|
+
push_context(tk, TOKENIZER_OPEN_TAG);
|
337
|
+
return 1;
|
338
|
+
}
|
339
|
+
else if(is_text(&tk->scan, &length)) {
|
340
|
+
tokenizer_callback(tk, TOKEN_TEXT, length);
|
341
|
+
return 1;
|
342
|
+
}
|
343
|
+
return 0;
|
344
|
+
}
|
345
|
+
|
346
|
+
static int scan_open_tag(struct tokenizer_t *tk)
|
347
|
+
{
|
348
|
+
unsigned long int length = 0;
|
349
|
+
|
350
|
+
if(is_comment_start(&tk->scan)) {
|
351
|
+
tokenizer_callback(tk, TOKEN_COMMENT_START, 4);
|
352
|
+
pop_context(tk); // back to html
|
353
|
+
push_context(tk, TOKENIZER_COMMENT);
|
354
|
+
return 1;
|
355
|
+
}
|
356
|
+
else if(is_doctype(&tk->scan)) {
|
357
|
+
tokenizer_callback(tk, TOKEN_TAG_START, 1);
|
358
|
+
tokenizer_callback(tk, TOKEN_TAG_NAME, 8);
|
359
|
+
return 1;
|
360
|
+
}
|
361
|
+
else if(is_cdata_start(&tk->scan)) {
|
362
|
+
tokenizer_callback(tk, TOKEN_CDATA_START, 9);
|
363
|
+
pop_context(tk); // back to html
|
364
|
+
push_context(tk, TOKENIZER_CDATA);
|
365
|
+
return 1;
|
366
|
+
}
|
367
|
+
else if(is_char(&tk->scan, '<')) {
|
368
|
+
tokenizer_callback(tk, TOKEN_TAG_START, 1);
|
369
|
+
push_context(tk, TOKENIZER_SOLIDUS_OR_TAG_NAME);
|
370
|
+
return 1;
|
371
|
+
}
|
372
|
+
else if(is_whitespace(&tk->scan, &length)) {
|
373
|
+
tokenizer_callback(tk, TOKEN_WHITESPACE, length);
|
374
|
+
return 1;
|
375
|
+
}
|
376
|
+
else if(is_attribute_name(&tk->scan, &length)) {
|
377
|
+
tokenizer_callback(tk, TOKEN_ATTRIBUTE_NAME, length);
|
378
|
+
push_context(tk, TOKENIZER_ATTRIBUTE_NAME);
|
379
|
+
return 1;
|
380
|
+
}
|
381
|
+
else if(is_char(&tk->scan, '\'') || is_char(&tk->scan, '"')) {
|
382
|
+
push_context(tk, TOKENIZER_ATTRIBUTE_VALUE);
|
383
|
+
return 1;
|
384
|
+
}
|
385
|
+
else if(is_char(&tk->scan, '=')) {
|
386
|
+
tokenizer_callback(tk, TOKEN_EQUAL, 1);
|
387
|
+
push_context(tk, TOKENIZER_ATTRIBUTE_VALUE);
|
388
|
+
return 1;
|
389
|
+
}
|
390
|
+
else if(is_char(&tk->scan, '/')) {
|
391
|
+
tokenizer_callback(tk, TOKEN_SOLIDUS, 1);
|
392
|
+
return 1;
|
393
|
+
}
|
394
|
+
else if(is_char(&tk->scan, '>')) {
|
395
|
+
tokenizer_callback(tk, TOKEN_TAG_END, 1);
|
396
|
+
pop_context(tk); // pop tag context
|
397
|
+
|
398
|
+
if(tk->current_tag && !tk->is_closing_tag) {
|
399
|
+
if(!strcasecmp("title", tk->current_tag) ||
|
400
|
+
!strcasecmp("textarea", tk->current_tag)) {
|
401
|
+
push_context(tk, TOKENIZER_RCDATA);
|
402
|
+
return 1;
|
403
|
+
}
|
404
|
+
else if(!strcasecmp("style", tk->current_tag) ||
|
405
|
+
!strcasecmp("xmp", tk->current_tag) || !strcasecmp("iframe", tk->current_tag) ||
|
406
|
+
!strcasecmp("noembed", tk->current_tag) || !strcasecmp("noframes", tk->current_tag) ||
|
407
|
+
!strcasecmp("listing", tk->current_tag)) {
|
408
|
+
push_context(tk, TOKENIZER_RAWTEXT);
|
409
|
+
return 1;
|
410
|
+
}
|
411
|
+
else if(!strcasecmp("script", tk->current_tag)) {
|
412
|
+
push_context(tk, TOKENIZER_SCRIPT_DATA);
|
413
|
+
return 1;
|
414
|
+
}
|
415
|
+
else if(!strcasecmp("plaintext", tk->current_tag)) {
|
416
|
+
push_context(tk, TOKENIZER_PLAINTEXT);
|
417
|
+
return 1;
|
418
|
+
}
|
419
|
+
}
|
420
|
+
return 1;
|
421
|
+
}
|
422
|
+
return 0;
|
423
|
+
}
|
424
|
+
|
425
|
+
static int scan_solidus_or_tag_name(struct tokenizer_t *tk)
|
426
|
+
{
|
427
|
+
if(tk->current_tag)
|
428
|
+
tk->current_tag[0] = '\0';
|
429
|
+
|
430
|
+
if(is_char(&tk->scan, '/')) {
|
431
|
+
tk->is_closing_tag = 1;
|
432
|
+
tokenizer_callback(tk, TOKEN_SOLIDUS, 1);
|
433
|
+
}
|
434
|
+
else {
|
435
|
+
tk->is_closing_tag = 0;
|
436
|
+
}
|
437
|
+
|
438
|
+
pop_context(tk);
|
439
|
+
push_context(tk, TOKENIZER_TAG_NAME);
|
440
|
+
return 1;
|
441
|
+
}
|
442
|
+
|
443
|
+
static int scan_tag_name(struct tokenizer_t *tk)
|
444
|
+
{
|
445
|
+
unsigned long int length = 0, tag_name_length = 0;
|
446
|
+
const char *tag_name = NULL;
|
447
|
+
void *old;
|
448
|
+
|
449
|
+
if(is_tag_name(&tk->scan, &tag_name, &tag_name_length)) {
|
450
|
+
length = (tk->current_tag ? strlen(tk->current_tag) : 0);
|
451
|
+
old = tk->current_tag;
|
452
|
+
REALLOC_N(tk->current_tag, char, length + tag_name_length + 1);
|
453
|
+
DBG_PRINT("tk=%p realloc(tk->current_tag) %p -> %p length=%lu", tk, old,
|
454
|
+
tk->current_tag, length + tag_name_length + 1);
|
455
|
+
tk->current_tag[length] = 0;
|
456
|
+
|
457
|
+
strncat(tk->current_tag, tag_name, tag_name_length);
|
458
|
+
|
459
|
+
tokenizer_callback(tk, TOKEN_TAG_NAME, tag_name_length);
|
460
|
+
return 1;
|
461
|
+
}
|
462
|
+
|
463
|
+
pop_context(tk); // back to open_tag
|
464
|
+
return 1;
|
465
|
+
}
|
466
|
+
|
467
|
+
static int scan_attribute_name(struct tokenizer_t *tk)
|
468
|
+
{
|
469
|
+
unsigned long int length = 0;
|
470
|
+
|
471
|
+
if(is_attribute_name(&tk->scan, &length)) {
|
472
|
+
tokenizer_callback(tk, TOKEN_ATTRIBUTE_NAME, length);
|
473
|
+
return 1;
|
474
|
+
}
|
475
|
+
|
476
|
+
pop_context(tk); // back to open tag
|
477
|
+
return 1;
|
478
|
+
}
|
479
|
+
|
480
|
+
static int scan_attribute_value(struct tokenizer_t *tk)
|
481
|
+
{
|
482
|
+
unsigned long int length = 0;
|
483
|
+
|
484
|
+
if(is_whitespace(&tk->scan, &length)) {
|
485
|
+
tokenizer_callback(tk, TOKEN_WHITESPACE, length);
|
486
|
+
return 1;
|
487
|
+
}
|
488
|
+
else if(is_char(&tk->scan, '\'') || is_char(&tk->scan, '"')) {
|
489
|
+
tk->attribute_value_start = tk->scan.string[tk->scan.cursor];
|
490
|
+
tokenizer_callback(tk, TOKEN_ATTRIBUTE_QUOTED_VALUE_START, 1);
|
491
|
+
pop_context(tk); // back to open tag
|
492
|
+
push_context(tk, TOKENIZER_ATTRIBUTE_QUOTED);
|
493
|
+
return 1;
|
494
|
+
}
|
495
|
+
|
496
|
+
pop_context(tk); // back to open tag
|
497
|
+
push_context(tk, TOKENIZER_ATTRIBUTE_UNQUOTED);
|
498
|
+
return 1;
|
499
|
+
}
|
500
|
+
|
501
|
+
static int scan_attribute_unquoted(struct tokenizer_t *tk)
|
502
|
+
{
|
503
|
+
unsigned long int length = 0;
|
504
|
+
|
505
|
+
if(is_unquoted_value(&tk->scan, &length)) {
|
506
|
+
tokenizer_callback(tk, TOKEN_ATTRIBUTE_UNQUOTED_VALUE, length);
|
507
|
+
return 1;
|
508
|
+
}
|
509
|
+
|
510
|
+
pop_context(tk); // back to open tag
|
511
|
+
return 1;
|
512
|
+
}
|
513
|
+
|
514
|
+
static int scan_attribute_quoted(struct tokenizer_t *tk)
|
515
|
+
{
|
516
|
+
unsigned long int length = 0;
|
517
|
+
|
518
|
+
if(is_char(&tk->scan, tk->attribute_value_start)) {
|
519
|
+
tokenizer_callback(tk, TOKEN_ATTRIBUTE_QUOTED_VALUE_END, 1);
|
520
|
+
pop_context(tk); // back to open tag
|
521
|
+
return 1;
|
522
|
+
}
|
523
|
+
else if(is_attribute_string(&tk->scan, &length, tk->attribute_value_start)) {
|
524
|
+
tokenizer_callback(tk, TOKEN_ATTRIBUTE_QUOTED_VALUE, length);
|
525
|
+
return 1;
|
526
|
+
}
|
527
|
+
return 0;
|
528
|
+
}
|
529
|
+
|
530
|
+
static int scan_comment(struct tokenizer_t *tk)
|
531
|
+
{
|
532
|
+
unsigned long int length = 0;
|
533
|
+
const char *comment_end = NULL;
|
534
|
+
|
535
|
+
if(is_comment_end(&tk->scan, &length, &comment_end)) {
|
536
|
+
tokenizer_callback(tk, TOKEN_TEXT, length);
|
537
|
+
if(comment_end) {
|
538
|
+
tokenizer_callback(tk, TOKEN_COMMENT_END, 3);
|
539
|
+
pop_context(tk); // back to document
|
540
|
+
}
|
541
|
+
return 1;
|
542
|
+
}
|
543
|
+
else {
|
544
|
+
tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
|
545
|
+
return 1;
|
546
|
+
}
|
547
|
+
return 0;
|
548
|
+
}
|
549
|
+
|
550
|
+
static int scan_cdata(struct tokenizer_t *tk)
|
551
|
+
{
|
552
|
+
unsigned long int length = 0;
|
553
|
+
const char *cdata_end = NULL;
|
554
|
+
|
555
|
+
if(is_cdata_end(&tk->scan, &length, &cdata_end)) {
|
556
|
+
tokenizer_callback(tk, TOKEN_TEXT, length);
|
557
|
+
if(cdata_end)
|
558
|
+
tokenizer_callback(tk, TOKEN_CDATA_END, 3);
|
559
|
+
return 1;
|
560
|
+
}
|
561
|
+
else {
|
562
|
+
tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
|
563
|
+
return 1;
|
564
|
+
}
|
565
|
+
return 0;
|
566
|
+
}
|
567
|
+
|
568
|
+
static int scan_rawtext(struct tokenizer_t *tk)
|
569
|
+
{
|
570
|
+
long unsigned int length = 0, tag_name_length = 0;
|
571
|
+
const char *tag_name = NULL;
|
572
|
+
int closing_tag = 0;
|
573
|
+
|
574
|
+
if(is_tag_start(&tk->scan, &length, &closing_tag, &tag_name, &tag_name_length)) {
|
575
|
+
if(closing_tag && tk->current_tag && !strncasecmp((const char *)tag_name, tk->current_tag, tag_name_length)) {
|
576
|
+
pop_context(tk);
|
577
|
+
} else {
|
578
|
+
tokenizer_callback(tk, TOKEN_TEXT, length);
|
579
|
+
}
|
580
|
+
return 1;
|
581
|
+
}
|
582
|
+
else if(is_text(&tk->scan, &length)) {
|
583
|
+
tokenizer_callback(tk, TOKEN_TEXT, length);
|
584
|
+
return 1;
|
585
|
+
}
|
586
|
+
else {
|
587
|
+
tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
|
588
|
+
return 1;
|
589
|
+
}
|
590
|
+
return 0;
|
591
|
+
}
|
592
|
+
|
593
|
+
static int scan_plaintext(struct tokenizer_t *tk)
|
594
|
+
{
|
595
|
+
tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
|
596
|
+
return 1;
|
597
|
+
}
|
598
|
+
|
599
|
+
static int scan_once(struct tokenizer_t *tk)
|
600
|
+
{
|
601
|
+
switch(tk->context[tk->current_context]) {
|
602
|
+
case TOKENIZER_NONE:
|
603
|
+
break;
|
604
|
+
case TOKENIZER_HTML:
|
605
|
+
return scan_html(tk);
|
606
|
+
case TOKENIZER_OPEN_TAG:
|
607
|
+
return scan_open_tag(tk);
|
608
|
+
case TOKENIZER_SOLIDUS_OR_TAG_NAME:
|
609
|
+
return scan_solidus_or_tag_name(tk);
|
610
|
+
case TOKENIZER_TAG_NAME:
|
611
|
+
return scan_tag_name(tk);
|
612
|
+
case TOKENIZER_COMMENT:
|
613
|
+
return scan_comment(tk);
|
614
|
+
case TOKENIZER_CDATA:
|
615
|
+
return scan_cdata(tk);
|
616
|
+
case TOKENIZER_RCDATA:
|
617
|
+
case TOKENIZER_RAWTEXT:
|
618
|
+
case TOKENIZER_SCRIPT_DATA:
|
619
|
+
/* we don't consume character references so all
|
620
|
+
of these states are effectively the same */
|
621
|
+
return scan_rawtext(tk);
|
622
|
+
case TOKENIZER_PLAINTEXT:
|
623
|
+
return scan_plaintext(tk);
|
624
|
+
case TOKENIZER_ATTRIBUTE_NAME:
|
625
|
+
return scan_attribute_name(tk);
|
626
|
+
case TOKENIZER_ATTRIBUTE_VALUE:
|
627
|
+
return scan_attribute_value(tk);
|
628
|
+
case TOKENIZER_ATTRIBUTE_UNQUOTED:
|
629
|
+
return scan_attribute_unquoted(tk);
|
630
|
+
case TOKENIZER_ATTRIBUTE_QUOTED:
|
631
|
+
return scan_attribute_quoted(tk);
|
632
|
+
}
|
633
|
+
return 0;
|
634
|
+
}
|
635
|
+
|
636
|
+
void tokenizer_scan_all(struct tokenizer_t *tk)
|
637
|
+
{
|
638
|
+
while(!eos(&tk->scan) && scan_once(tk)) {}
|
639
|
+
if(!eos(&tk->scan)) {
|
640
|
+
tokenizer_callback(tk, TOKEN_MALFORMED, length_remaining(&tk->scan));
|
641
|
+
}
|
642
|
+
return;
|
643
|
+
}
|
644
|
+
|
645
|
+
/*
 * Tokenizer#tokenize(source).
 * Copies `source` into the tokenizer's scan buffer, scans it from the
 * start (tokens reach the caller through the installed callback), then
 * releases the buffer.  The context stack is not reset here, so state
 * carries over from a previous call.
 *
 * Returns nil when `source` is nil, true otherwise.  Raises TypeError for
 * a non-String argument (Check_Type); StringValueCStr raises if the string
 * contains a NUL byte.
 *
 * The copy includes the terminating NUL.  The previous
 * strncpy(dst, src, length) never wrote it, leaving string[length]
 * uninitialised even though the buffer is allocated with room for it.
 *
 * NOTE(review): if the callback re-enters #tokenize on the same object, the
 * scan buffer is reallocated underneath the outer scan -- confirm whether
 * that needs guarding.
 */
static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
{
  struct tokenizer_t *tk = NULL;
  char *c_source;
  char *old;

  if(NIL_P(source))
    return Qnil;

  Check_Type(source, T_STRING);
  Tokenizer_Get_Struct(self, tk);

  c_source = StringValueCStr(source);
  tk->scan.cursor = 0;
  tk->scan.length = strlen(c_source);

  old = tk->scan.string;
  REALLOC_N(tk->scan.string, char, tk->scan.length+1);
  DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
    tk->scan.string, tk->scan.length+1);
  /* c_source is NUL-terminated, so length + 1 bytes copies the terminator */
  memcpy(tk->scan.string, c_source, tk->scan.length + 1);

  tokenizer_scan_all(tk);

  DBG_PRINT("tk=%p xfree(tk->scan.string) 0x%p", tk, tk->scan.string);
  xfree(tk->scan.string);
  tk->scan.string = NULL;

  return Qtrue;
}
|
675
|
+
|
676
|
+
/*
 * Defines the Tokenizer class under the given module and wires up its
 * allocator and its two instance methods:
 *   #initialize        (no arguments)
 *   #tokenize(source)  (one argument)
 */
void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer)
{
  cTokenizer = rb_define_class_under(mHtmlTokenizer, "Tokenizer", rb_cObject);

  rb_define_alloc_func(cTokenizer, tokenizer_allocate);

  rb_define_method(cTokenizer, "initialize", tokenizer_initialize_method, 0);
  rb_define_method(cTokenizer, "tokenize", tokenizer_tokenize_method, 1);
}
|