html_tokenizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,87 @@
1
+ #pragma once
2
+ #include "tokenizer.h"
3
+
4
/*
 * States of the parser's state machine.
 *
 * Enumerators are numbered consecutively from 0 in declaration order, so
 * entries must only ever be appended, never reordered.
 */
enum parser_context {
  PARSER_NONE = 0,
  PARSER_SOLIDUS_OR_TAG_NAME,
  PARSER_TAG_NAME,
  PARSER_TAG,
  PARSER_ATTRIBUTE_NAME,
  PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL,
  PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE,
  PARSER_ATTRIBUTE_QUOTED_VALUE,
  PARSER_SPACE_AFTER_ATTRIBUTE,
  PARSER_ATTRIBUTE_UNQUOTED_VALUE,
  PARSER_TAG_END,
  PARSER_COMMENT,
  PARSER_CDATA,
};
19
+
20
/* One error record: a message plus the line/column it refers to. */
struct parser_document_error_t {
  char *message;               /* error text; ownership is not visible here */
  unsigned long line_number;
  unsigned long column_number;
};
25
+
26
/* Document buffer held by the parser, with a line/column position. */
struct parser_document_t {
  unsigned long length;        /* presumably the byte count of data -- TODO confirm */
  char *data;
  unsigned long line_number;
  unsigned long column_number;
};
32
+
33
+ struct token_reference_t {
34
+ enum token_type type;
35
+ long unsigned int start;
36
+ long unsigned int length;
37
+ long unsigned int line_number;
38
+ long unsigned int column_number;
39
+ };
40
+
41
+ struct parser_tag_t {
42
+ struct token_reference_t name;
43
+ int self_closing;
44
+ };
45
+
46
+ struct parser_attribute_t {
47
+ struct token_reference_t name;
48
+ struct token_reference_t value;
49
+ int is_quoted;
50
+ };
51
+
52
+ struct parser_rawtext_t {
53
+ struct token_reference_t text;
54
+ };
55
+
56
+ struct parser_comment_t {
57
+ struct token_reference_t text;
58
+ };
59
+
60
+ struct parser_cdata_t {
61
+ struct token_reference_t text;
62
+ };
63
+
64
+ struct parser_t
65
+ {
66
+ struct tokenizer_t tk;
67
+
68
+ struct parser_document_t doc;
69
+
70
+ size_t errors_count;
71
+ struct parser_document_error_t *errors;
72
+
73
+ enum parser_context context;
74
+ struct parser_tag_t tag;
75
+ struct parser_attribute_t attribute;
76
+ struct parser_rawtext_t rawtext;
77
+ struct parser_comment_t comment;
78
+ struct parser_cdata_t cdata;
79
+ };
80
+
81
+ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer);
82
+
83
+ extern const rb_data_type_t ht_parser_data_type;
84
+ #define Parser_Get_Struct(obj, sval) TypedData_Get_Struct(obj, struct parser_t, &ht_parser_data_type, sval)
85
+
86
+ #define PARSE_AGAIN return 1
87
+ #define PARSE_DONE return 0
@@ -0,0 +1,682 @@
1
+ #include <ruby.h>
2
+ #include "html_tokenizer.h"
3
+ #include "tokenizer.h"
4
+
5
+ static VALUE cTokenizer = Qnil;
6
+
7
/* GC mark hook for wrapped tokenizers; it deliberately marks nothing. */
static void tokenizer_mark(void *ptr)
{
  (void)ptr;
}
9
+
10
+ static void tokenizer_free(void *ptr)
11
+ {
12
+ struct tokenizer_t *tk = ptr;
13
+ if(tk) {
14
+ if(tk->current_tag) {
15
+ DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
16
+ xfree(tk->current_tag);
17
+ tk->current_tag = NULL;
18
+ }
19
+ if(tk->scan.string) {
20
+ DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
21
+ xfree(tk->scan.string);
22
+ tk->scan.string = NULL;
23
+ }
24
+ DBG_PRINT("tk=%p xfree(tk)", tk);
25
+ xfree(tk);
26
+ }
27
+ }
28
+
29
+ static size_t tokenizer_memsize(const void *ptr)
30
+ {
31
+ return ptr ? sizeof(struct tokenizer_t) : 0;
32
+ }
33
+
34
+ const rb_data_type_t ht_tokenizer_data_type = {
35
+ "ht_tokenizer_data_type",
36
+ { tokenizer_mark, tokenizer_free, tokenizer_memsize, },
37
+ #if defined(RUBY_TYPED_FREE_IMMEDIATELY)
38
+ NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
39
+ #endif
40
+ };
41
+
42
+ static VALUE tokenizer_allocate(VALUE klass)
43
+ {
44
+ VALUE obj;
45
+ struct tokenizer_t *tokenizer = NULL;
46
+
47
+ obj = TypedData_Make_Struct(klass, struct tokenizer_t, &ht_tokenizer_data_type, tokenizer);
48
+ DBG_PRINT("tk=%p allocate", tokenizer);
49
+
50
+ memset((void *)&tokenizer->context, TOKENIZER_NONE, sizeof(struct tokenizer_t));
51
+
52
+ return obj;
53
+ }
54
+
55
+ void tokenizer_init(struct tokenizer_t *tk)
56
+ {
57
+ tk->current_context = 0;
58
+ tk->context[0] = TOKENIZER_HTML;
59
+
60
+ tk->scan.string = NULL;
61
+ tk->scan.cursor = 0;
62
+ tk->scan.length = 0;
63
+
64
+ tk->attribute_value_start = 0;
65
+ tk->found_attribute = 0;
66
+ tk->current_tag = NULL;
67
+ tk->is_closing_tag = 0;
68
+ tk->last_token = TOKEN_NONE;
69
+ tk->callback_data = NULL;
70
+ tk->f_callback = NULL;
71
+
72
+ return;
73
+ }
74
+
75
+ VALUE token_type_to_symbol(enum token_type type)
76
+ {
77
+ switch(type) {
78
+ case TOKEN_NONE:
79
+ return ID2SYM(rb_intern("none"));
80
+ case TOKEN_TEXT:
81
+ return ID2SYM(rb_intern("text"));
82
+ case TOKEN_WHITESPACE:
83
+ return ID2SYM(rb_intern("whitespace"));
84
+ case TOKEN_COMMENT_START:
85
+ return ID2SYM(rb_intern("comment_start"));
86
+ case TOKEN_COMMENT_END:
87
+ return ID2SYM(rb_intern("comment_end"));
88
+ case TOKEN_TAG_NAME:
89
+ return ID2SYM(rb_intern("tag_name"));
90
+ case TOKEN_TAG_START:
91
+ return ID2SYM(rb_intern("tag_start"));
92
+ case TOKEN_TAG_END:
93
+ return ID2SYM(rb_intern("tag_end"));
94
+ case TOKEN_ATTRIBUTE_NAME:
95
+ return ID2SYM(rb_intern("attribute_name"));
96
+ case TOKEN_ATTRIBUTE_QUOTED_VALUE_START:
97
+ return ID2SYM(rb_intern("attribute_quoted_value_start"));
98
+ case TOKEN_ATTRIBUTE_QUOTED_VALUE:
99
+ return ID2SYM(rb_intern("attribute_quoted_value"));
100
+ case TOKEN_ATTRIBUTE_QUOTED_VALUE_END:
101
+ return ID2SYM(rb_intern("attribute_quoted_value_end"));
102
+ case TOKEN_ATTRIBUTE_UNQUOTED_VALUE:
103
+ return ID2SYM(rb_intern("attribute_unquoted_value"));
104
+ case TOKEN_CDATA_START:
105
+ return ID2SYM(rb_intern("cdata_start"));
106
+ case TOKEN_CDATA_END:
107
+ return ID2SYM(rb_intern("cdata_end"));
108
+ case TOKEN_SOLIDUS:
109
+ return ID2SYM(rb_intern("solidus"));
110
+ case TOKEN_EQUAL:
111
+ return ID2SYM(rb_intern("equal"));
112
+ case TOKEN_MALFORMED:
113
+ return ID2SYM(rb_intern("malformed"));
114
+ }
115
+ return Qnil;
116
+ }
117
+
118
+ static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
119
+ {
120
+ tk->last_token = type;
121
+ rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.cursor), INT2NUM(tk->scan.cursor + length));
122
+ }
123
+
124
+ static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
125
+ {
126
+ if(tk->f_callback)
127
+ tk->f_callback(tk, type, length, tk->callback_data);
128
+ tk->scan.cursor += length;
129
+ }
130
+
131
+ static VALUE tokenizer_initialize_method(VALUE self)
132
+ {
133
+ struct tokenizer_t *tk = NULL;
134
+
135
+ Tokenizer_Get_Struct(self, tk);
136
+ DBG_PRINT("tk=%p initialize", tk);
137
+
138
+ tokenizer_init(tk);
139
+ tk->f_callback = tokenizer_yield_tag;
140
+
141
+ return Qnil;
142
+ }
143
+
144
+ static inline int eos(struct scan_t *scan)
145
+ {
146
+ return scan->cursor >= scan->length;
147
+ }
148
+
149
+ static inline long unsigned int length_remaining(struct scan_t *scan)
150
+ {
151
+ return scan->length - scan->cursor;
152
+ }
153
+
154
+ static inline void push_context(struct tokenizer_t *tk, enum tokenizer_context ctx)
155
+ {
156
+ tk->context[++tk->current_context] = ctx;
157
+ }
158
+
159
+ static inline void pop_context(struct tokenizer_t *tk)
160
+ {
161
+ tk->context[tk->current_context--] = TOKENIZER_NONE;
162
+ }
163
+
164
+ static int is_text(struct scan_t *scan, long unsigned int *length)
165
+ {
166
+ long unsigned int i;
167
+
168
+ *length = 0;
169
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
170
+ if(scan->string[i] == '<')
171
+ break;
172
+ }
173
+ return *length != 0;
174
+ }
175
+
176
+ static inline int is_comment_start(struct scan_t *scan)
177
+ {
178
+ return (length_remaining(scan) >= 4) &&
179
+ !strncmp((const char *)&scan->string[scan->cursor], "<!--", 4);
180
+ }
181
+
182
+ static inline int is_doctype(struct scan_t *scan)
183
+ {
184
+ return (length_remaining(scan) >= 9) &&
185
+ !strncasecmp((const char *)&scan->string[scan->cursor], "<!DOCTYPE", 9);
186
+ }
187
+
188
+ static inline int is_cdata_start(struct scan_t *scan)
189
+ {
190
+ return (length_remaining(scan) >= 9) &&
191
+ !strncasecmp((const char *)&scan->string[scan->cursor], "<![CDATA[", 9);
192
+ }
193
+
194
+ static inline int is_char(struct scan_t *scan, const char c)
195
+ {
196
+ return (length_remaining(scan) >= 1) && (scan->string[scan->cursor] == c);
197
+ }
198
+
199
/*
 * Locale-independent ASCII alphanumeric test.
 * Returns 1 for [0-9A-Za-z], 0 for everything else.
 */
static inline int is_alnum(const char c)
{
  if(c >= '0' && c <= '9')
    return 1;
  if(c >= 'A' && c <= 'Z')
    return 1;
  if(c >= 'a' && c <= 'z')
    return 1;
  return 0;
}
203
+
204
+ static int is_tag_start(struct scan_t *scan, long unsigned int *length,
205
+ int *closing_tag, const char **tag_name, long unsigned int *tag_name_length)
206
+ {
207
+ long unsigned int i, start;
208
+
209
+ if(scan->string[scan->cursor] != '<')
210
+ return 0;
211
+
212
+ *length = 1;
213
+
214
+ if(scan->string[scan->cursor+1] == '/') {
215
+ *closing_tag = 1;
216
+ (*length)++;
217
+ } else {
218
+ *closing_tag = 0;
219
+ }
220
+
221
+ *tag_name = &scan->string[scan->cursor + (*length)];
222
+ start = *length;
223
+ for(i = scan->cursor + (*length);i < scan->length; i++, (*length)++) {
224
+ if(!is_alnum(scan->string[i]) && scan->string[i] != ':')
225
+ break;
226
+ }
227
+
228
+ *tag_name_length = *length - start;
229
+ return 1;
230
+ }
231
+
232
+ static int is_tag_name(struct scan_t *scan, const char **tag_name, unsigned long int *tag_name_length)
233
+ {
234
+ long unsigned int i;
235
+
236
+ *tag_name_length = 0;
237
+ *tag_name = &scan->string[scan->cursor];
238
+ for(i = scan->cursor;i < scan->length; i++, (*tag_name_length)++) {
239
+ if(scan->string[i] == ' ' || scan->string[i] == '\t' ||
240
+ scan->string[i] == '\r' || scan->string[i] == '\n' ||
241
+ scan->string[i] == '>' || scan->string[i] == '/')
242
+ break;
243
+ }
244
+
245
+ return *tag_name_length != 0;
246
+ }
247
+
248
+ static int is_whitespace(struct scan_t *scan, unsigned long int *length)
249
+ {
250
+ long unsigned int i;
251
+
252
+ *length = 0;
253
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
254
+ if(scan->string[i] != ' ' && scan->string[i] != '\t' &&
255
+ scan->string[i] != '\r' && scan->string[i] != '\n')
256
+ break;
257
+ }
258
+ return *length != 0;
259
+ }
260
+
261
+ static int is_attribute_name(struct scan_t *scan, unsigned long int *length)
262
+ {
263
+ long unsigned int i;
264
+
265
+ *length = 0;
266
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
267
+ if(!is_alnum(scan->string[i]) && scan->string[i] != ':' &&
268
+ scan->string[i] != '-' && scan->string[i] != '_' &&
269
+ scan->string[i] != '.')
270
+ break;
271
+ }
272
+ return *length != 0;
273
+ }
274
+
275
+ static int is_unquoted_value(struct scan_t *scan, unsigned long int *length)
276
+ {
277
+ long unsigned int i;
278
+
279
+ *length = 0;
280
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
281
+ if(scan->string[i] == ' ' || scan->string[i] == '\r' ||
282
+ scan->string[i] == '\n' || scan->string[i] == '\t' ||
283
+ scan->string[i] == '>')
284
+ break;
285
+ }
286
+ return *length != 0;
287
+ }
288
+
289
+ static int is_attribute_string(struct scan_t *scan, unsigned long int *length, const char attribute_value_start)
290
+ {
291
+ long unsigned int i;
292
+
293
+ *length = 0;
294
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
295
+ if(scan->string[i] == attribute_value_start)
296
+ break;
297
+ }
298
+ return *length != 0;
299
+ }
300
+
301
+ static int is_comment_end(struct scan_t *scan, unsigned long int *length, const char **end)
302
+ {
303
+ long unsigned int i;
304
+
305
+ *length = 0;
306
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
307
+ if(i < (scan->length - 2) && scan->string[i] == '-' && scan->string[i+1] == '-' &&
308
+ scan->string[i+2] == '>') {
309
+ *end = &scan->string[i];
310
+ break;
311
+ }
312
+ }
313
+ return *length != 0;
314
+ }
315
+
316
+ static int is_cdata_end(struct scan_t *scan, unsigned long int *length, const char **end)
317
+ {
318
+ long unsigned int i;
319
+
320
+ *length = 0;
321
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
322
+ if(i < (scan->length-2) && scan->string[i] == ']' && scan->string[i+1] == ']' &&
323
+ scan->string[i+2] == '>') {
324
+ *end = &scan->string[i];
325
+ break;
326
+ }
327
+ }
328
+ return *length != 0;
329
+ }
330
+
331
+ static int scan_html(struct tokenizer_t *tk)
332
+ {
333
+ long unsigned int length = 0;
334
+
335
+ if(is_char(&tk->scan, '<')) {
336
+ push_context(tk, TOKENIZER_OPEN_TAG);
337
+ return 1;
338
+ }
339
+ else if(is_text(&tk->scan, &length)) {
340
+ tokenizer_callback(tk, TOKEN_TEXT, length);
341
+ return 1;
342
+ }
343
+ return 0;
344
+ }
345
+
346
+ static int scan_open_tag(struct tokenizer_t *tk)
347
+ {
348
+ unsigned long int length = 0;
349
+
350
+ if(is_comment_start(&tk->scan)) {
351
+ tokenizer_callback(tk, TOKEN_COMMENT_START, 4);
352
+ pop_context(tk); // back to html
353
+ push_context(tk, TOKENIZER_COMMENT);
354
+ return 1;
355
+ }
356
+ else if(is_doctype(&tk->scan)) {
357
+ tokenizer_callback(tk, TOKEN_TAG_START, 1);
358
+ tokenizer_callback(tk, TOKEN_TAG_NAME, 8);
359
+ return 1;
360
+ }
361
+ else if(is_cdata_start(&tk->scan)) {
362
+ tokenizer_callback(tk, TOKEN_CDATA_START, 9);
363
+ pop_context(tk); // back to html
364
+ push_context(tk, TOKENIZER_CDATA);
365
+ return 1;
366
+ }
367
+ else if(is_char(&tk->scan, '<')) {
368
+ tokenizer_callback(tk, TOKEN_TAG_START, 1);
369
+ push_context(tk, TOKENIZER_SOLIDUS_OR_TAG_NAME);
370
+ return 1;
371
+ }
372
+ else if(is_whitespace(&tk->scan, &length)) {
373
+ tokenizer_callback(tk, TOKEN_WHITESPACE, length);
374
+ return 1;
375
+ }
376
+ else if(is_attribute_name(&tk->scan, &length)) {
377
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_NAME, length);
378
+ push_context(tk, TOKENIZER_ATTRIBUTE_NAME);
379
+ return 1;
380
+ }
381
+ else if(is_char(&tk->scan, '\'') || is_char(&tk->scan, '"')) {
382
+ push_context(tk, TOKENIZER_ATTRIBUTE_VALUE);
383
+ return 1;
384
+ }
385
+ else if(is_char(&tk->scan, '=')) {
386
+ tokenizer_callback(tk, TOKEN_EQUAL, 1);
387
+ push_context(tk, TOKENIZER_ATTRIBUTE_VALUE);
388
+ return 1;
389
+ }
390
+ else if(is_char(&tk->scan, '/')) {
391
+ tokenizer_callback(tk, TOKEN_SOLIDUS, 1);
392
+ return 1;
393
+ }
394
+ else if(is_char(&tk->scan, '>')) {
395
+ tokenizer_callback(tk, TOKEN_TAG_END, 1);
396
+ pop_context(tk); // pop tag context
397
+
398
+ if(tk->current_tag && !tk->is_closing_tag) {
399
+ if(!strcasecmp("title", tk->current_tag) ||
400
+ !strcasecmp("textarea", tk->current_tag)) {
401
+ push_context(tk, TOKENIZER_RCDATA);
402
+ return 1;
403
+ }
404
+ else if(!strcasecmp("style", tk->current_tag) ||
405
+ !strcasecmp("xmp", tk->current_tag) || !strcasecmp("iframe", tk->current_tag) ||
406
+ !strcasecmp("noembed", tk->current_tag) || !strcasecmp("noframes", tk->current_tag) ||
407
+ !strcasecmp("listing", tk->current_tag)) {
408
+ push_context(tk, TOKENIZER_RAWTEXT);
409
+ return 1;
410
+ }
411
+ else if(!strcasecmp("script", tk->current_tag)) {
412
+ push_context(tk, TOKENIZER_SCRIPT_DATA);
413
+ return 1;
414
+ }
415
+ else if(!strcasecmp("plaintext", tk->current_tag)) {
416
+ push_context(tk, TOKENIZER_PLAINTEXT);
417
+ return 1;
418
+ }
419
+ }
420
+ return 1;
421
+ }
422
+ return 0;
423
+ }
424
+
425
+ static int scan_solidus_or_tag_name(struct tokenizer_t *tk)
426
+ {
427
+ if(tk->current_tag)
428
+ tk->current_tag[0] = '\0';
429
+
430
+ if(is_char(&tk->scan, '/')) {
431
+ tk->is_closing_tag = 1;
432
+ tokenizer_callback(tk, TOKEN_SOLIDUS, 1);
433
+ }
434
+ else {
435
+ tk->is_closing_tag = 0;
436
+ }
437
+
438
+ pop_context(tk);
439
+ push_context(tk, TOKENIZER_TAG_NAME);
440
+ return 1;
441
+ }
442
+
443
+ static int scan_tag_name(struct tokenizer_t *tk)
444
+ {
445
+ unsigned long int length = 0, tag_name_length = 0;
446
+ const char *tag_name = NULL;
447
+ void *old;
448
+
449
+ if(is_tag_name(&tk->scan, &tag_name, &tag_name_length)) {
450
+ length = (tk->current_tag ? strlen(tk->current_tag) : 0);
451
+ old = tk->current_tag;
452
+ REALLOC_N(tk->current_tag, char, length + tag_name_length + 1);
453
+ DBG_PRINT("tk=%p realloc(tk->current_tag) %p -> %p length=%lu", tk, old,
454
+ tk->current_tag, length + tag_name_length + 1);
455
+ tk->current_tag[length] = 0;
456
+
457
+ strncat(tk->current_tag, tag_name, tag_name_length);
458
+
459
+ tokenizer_callback(tk, TOKEN_TAG_NAME, tag_name_length);
460
+ return 1;
461
+ }
462
+
463
+ pop_context(tk); // back to open_tag
464
+ return 1;
465
+ }
466
+
467
+ static int scan_attribute_name(struct tokenizer_t *tk)
468
+ {
469
+ unsigned long int length = 0;
470
+
471
+ if(is_attribute_name(&tk->scan, &length)) {
472
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_NAME, length);
473
+ return 1;
474
+ }
475
+
476
+ pop_context(tk); // back to open tag
477
+ return 1;
478
+ }
479
+
480
+ static int scan_attribute_value(struct tokenizer_t *tk)
481
+ {
482
+ unsigned long int length = 0;
483
+
484
+ if(is_whitespace(&tk->scan, &length)) {
485
+ tokenizer_callback(tk, TOKEN_WHITESPACE, length);
486
+ return 1;
487
+ }
488
+ else if(is_char(&tk->scan, '\'') || is_char(&tk->scan, '"')) {
489
+ tk->attribute_value_start = tk->scan.string[tk->scan.cursor];
490
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_QUOTED_VALUE_START, 1);
491
+ pop_context(tk); // back to open tag
492
+ push_context(tk, TOKENIZER_ATTRIBUTE_QUOTED);
493
+ return 1;
494
+ }
495
+
496
+ pop_context(tk); // back to open tag
497
+ push_context(tk, TOKENIZER_ATTRIBUTE_UNQUOTED);
498
+ return 1;
499
+ }
500
+
501
+ static int scan_attribute_unquoted(struct tokenizer_t *tk)
502
+ {
503
+ unsigned long int length = 0;
504
+
505
+ if(is_unquoted_value(&tk->scan, &length)) {
506
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_UNQUOTED_VALUE, length);
507
+ return 1;
508
+ }
509
+
510
+ pop_context(tk); // back to open tag
511
+ return 1;
512
+ }
513
+
514
+ static int scan_attribute_quoted(struct tokenizer_t *tk)
515
+ {
516
+ unsigned long int length = 0;
517
+
518
+ if(is_char(&tk->scan, tk->attribute_value_start)) {
519
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_QUOTED_VALUE_END, 1);
520
+ pop_context(tk); // back to open tag
521
+ return 1;
522
+ }
523
+ else if(is_attribute_string(&tk->scan, &length, tk->attribute_value_start)) {
524
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_QUOTED_VALUE, length);
525
+ return 1;
526
+ }
527
+ return 0;
528
+ }
529
+
530
+ static int scan_comment(struct tokenizer_t *tk)
531
+ {
532
+ unsigned long int length = 0;
533
+ const char *comment_end = NULL;
534
+
535
+ if(is_comment_end(&tk->scan, &length, &comment_end)) {
536
+ tokenizer_callback(tk, TOKEN_TEXT, length);
537
+ if(comment_end) {
538
+ tokenizer_callback(tk, TOKEN_COMMENT_END, 3);
539
+ pop_context(tk); // back to document
540
+ }
541
+ return 1;
542
+ }
543
+ else {
544
+ tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
545
+ return 1;
546
+ }
547
+ return 0;
548
+ }
549
+
550
+ static int scan_cdata(struct tokenizer_t *tk)
551
+ {
552
+ unsigned long int length = 0;
553
+ const char *cdata_end = NULL;
554
+
555
+ if(is_cdata_end(&tk->scan, &length, &cdata_end)) {
556
+ tokenizer_callback(tk, TOKEN_TEXT, length);
557
+ if(cdata_end)
558
+ tokenizer_callback(tk, TOKEN_CDATA_END, 3);
559
+ return 1;
560
+ }
561
+ else {
562
+ tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
563
+ return 1;
564
+ }
565
+ return 0;
566
+ }
567
+
568
+ static int scan_rawtext(struct tokenizer_t *tk)
569
+ {
570
+ long unsigned int length = 0, tag_name_length = 0;
571
+ const char *tag_name = NULL;
572
+ int closing_tag = 0;
573
+
574
+ if(is_tag_start(&tk->scan, &length, &closing_tag, &tag_name, &tag_name_length)) {
575
+ if(closing_tag && tk->current_tag && !strncasecmp((const char *)tag_name, tk->current_tag, tag_name_length)) {
576
+ pop_context(tk);
577
+ } else {
578
+ tokenizer_callback(tk, TOKEN_TEXT, length);
579
+ }
580
+ return 1;
581
+ }
582
+ else if(is_text(&tk->scan, &length)) {
583
+ tokenizer_callback(tk, TOKEN_TEXT, length);
584
+ return 1;
585
+ }
586
+ else {
587
+ tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
588
+ return 1;
589
+ }
590
+ return 0;
591
+ }
592
+
593
+ static int scan_plaintext(struct tokenizer_t *tk)
594
+ {
595
+ tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
596
+ return 1;
597
+ }
598
+
599
+ static int scan_once(struct tokenizer_t *tk)
600
+ {
601
+ switch(tk->context[tk->current_context]) {
602
+ case TOKENIZER_NONE:
603
+ break;
604
+ case TOKENIZER_HTML:
605
+ return scan_html(tk);
606
+ case TOKENIZER_OPEN_TAG:
607
+ return scan_open_tag(tk);
608
+ case TOKENIZER_SOLIDUS_OR_TAG_NAME:
609
+ return scan_solidus_or_tag_name(tk);
610
+ case TOKENIZER_TAG_NAME:
611
+ return scan_tag_name(tk);
612
+ case TOKENIZER_COMMENT:
613
+ return scan_comment(tk);
614
+ case TOKENIZER_CDATA:
615
+ return scan_cdata(tk);
616
+ case TOKENIZER_RCDATA:
617
+ case TOKENIZER_RAWTEXT:
618
+ case TOKENIZER_SCRIPT_DATA:
619
+ /* we don't consume character references so all
620
+ of these states are effectively the same */
621
+ return scan_rawtext(tk);
622
+ case TOKENIZER_PLAINTEXT:
623
+ return scan_plaintext(tk);
624
+ case TOKENIZER_ATTRIBUTE_NAME:
625
+ return scan_attribute_name(tk);
626
+ case TOKENIZER_ATTRIBUTE_VALUE:
627
+ return scan_attribute_value(tk);
628
+ case TOKENIZER_ATTRIBUTE_UNQUOTED:
629
+ return scan_attribute_unquoted(tk);
630
+ case TOKENIZER_ATTRIBUTE_QUOTED:
631
+ return scan_attribute_quoted(tk);
632
+ }
633
+ return 0;
634
+ }
635
+
636
+ void tokenizer_scan_all(struct tokenizer_t *tk)
637
+ {
638
+ while(!eos(&tk->scan) && scan_once(tk)) {}
639
+ if(!eos(&tk->scan)) {
640
+ tokenizer_callback(tk, TOKEN_MALFORMED, length_remaining(&tk->scan));
641
+ }
642
+ return;
643
+ }
644
+
645
+ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
646
+ {
647
+ struct tokenizer_t *tk = NULL;
648
+ char *c_source;
649
+ char *old;
650
+
651
+ if(NIL_P(source))
652
+ return Qnil;
653
+
654
+ Check_Type(source, T_STRING);
655
+ Tokenizer_Get_Struct(self, tk);
656
+
657
+ c_source = StringValueCStr(source);
658
+ tk->scan.cursor = 0;
659
+ tk->scan.length = strlen(c_source);
660
+
661
+ old = tk->scan.string;
662
+ REALLOC_N(tk->scan.string, char, tk->scan.length+1);
663
+ DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
664
+ tk->scan.string, tk->scan.length+1);
665
+ strncpy(tk->scan.string, c_source, tk->scan.length);
666
+
667
+ tokenizer_scan_all(tk);
668
+
669
+ DBG_PRINT("tk=%p xfree(tk->scan.string) 0x%p", tk, tk->scan.string);
670
+ xfree(tk->scan.string);
671
+ tk->scan.string = NULL;
672
+
673
+ return Qtrue;
674
+ }
675
+
676
+ void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer)
677
+ {
678
+ cTokenizer = rb_define_class_under(mHtmlTokenizer, "Tokenizer", rb_cObject);
679
+ rb_define_alloc_func(cTokenizer, tokenizer_allocate);
680
+ rb_define_method(cTokenizer, "initialize", tokenizer_initialize_method, 0);
681
+ rb_define_method(cTokenizer, "tokenize", tokenizer_tokenize_method, 1);
682
+ }