better_html 1.0.16 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99)
  1. checksums.yaml +4 -4
  2. data/MIT-LICENSE +1 -0
  3. data/Rakefile +19 -14
  4. data/ext/better_html_ext/better_html.h +1 -0
  5. data/ext/better_html_ext/extconf.rb +16 -0
  6. data/ext/better_html_ext/html_tokenizer.c +12 -0
  7. data/ext/better_html_ext/html_tokenizer.h +7 -0
  8. data/ext/better_html_ext/parser.c +793 -0
  9. data/ext/better_html_ext/parser.h +93 -0
  10. data/ext/better_html_ext/tokenizer.c +717 -0
  11. data/ext/better_html_ext/tokenizer.h +80 -0
  12. data/lib/better_html/ast/iterator.rb +14 -9
  13. data/lib/better_html/ast/node.rb +4 -2
  14. data/lib/better_html/better_erb/erubi_implementation.rb +43 -39
  15. data/lib/better_html/better_erb/runtime_checks.rb +140 -133
  16. data/lib/better_html/better_erb/validated_output_buffer.rb +30 -22
  17. data/lib/better_html/better_erb.rb +58 -54
  18. data/lib/better_html/config.rb +7 -4
  19. data/lib/better_html/errors.rb +4 -2
  20. data/lib/better_html/helpers.rb +7 -3
  21. data/lib/better_html/html_attributes.rb +6 -2
  22. data/lib/better_html/parser.rb +21 -14
  23. data/lib/better_html/railtie.rb +8 -4
  24. data/lib/better_html/test_helper/ruby_node.rb +15 -10
  25. data/lib/better_html/test_helper/safe_erb/allowed_script_type.rb +8 -4
  26. data/lib/better_html/test_helper/safe_erb/base.rb +12 -9
  27. data/lib/better_html/test_helper/safe_erb/no_javascript_tag_helper.rb +7 -3
  28. data/lib/better_html/test_helper/safe_erb/no_statements.rb +7 -3
  29. data/lib/better_html/test_helper/safe_erb/script_interpolation.rb +9 -4
  30. data/lib/better_html/test_helper/safe_erb/tag_interpolation.rb +23 -20
  31. data/lib/better_html/test_helper/safe_erb_tester.rb +33 -31
  32. data/lib/better_html/test_helper/safe_lodash_tester.rb +36 -35
  33. data/lib/better_html/test_helper/safety_error.rb +2 -0
  34. data/lib/better_html/tokenizer/base_erb.rb +14 -10
  35. data/lib/better_html/tokenizer/html_erb.rb +3 -2
  36. data/lib/better_html/tokenizer/html_lodash.rb +22 -14
  37. data/lib/better_html/tokenizer/javascript_erb.rb +3 -1
  38. data/lib/better_html/tokenizer/location.rb +17 -6
  39. data/lib/better_html/tokenizer/token.rb +2 -0
  40. data/lib/better_html/tokenizer/token_array.rb +8 -8
  41. data/lib/better_html/tree/attribute.rb +10 -6
  42. data/lib/better_html/tree/attributes_list.rb +9 -5
  43. data/lib/better_html/tree/tag.rb +10 -6
  44. data/lib/better_html/version.rb +3 -1
  45. data/lib/better_html.rb +19 -17
  46. data/lib/tasks/better_html_tasks.rake +1 -0
  47. metadata +39 -147
  48. data/lib/better_html/better_erb/erubis_implementation.rb +0 -44
  49. data/test/better_html/better_erb/implementation_test.rb +0 -406
  50. data/test/better_html/errors_test.rb +0 -13
  51. data/test/better_html/helpers_test.rb +0 -49
  52. data/test/better_html/parser_test.rb +0 -314
  53. data/test/better_html/test_helper/ruby_node_test.rb +0 -288
  54. data/test/better_html/test_helper/safe_erb/allowed_script_type_test.rb +0 -46
  55. data/test/better_html/test_helper/safe_erb/no_javascript_tag_helper_test.rb +0 -37
  56. data/test/better_html/test_helper/safe_erb/no_statements_test.rb +0 -129
  57. data/test/better_html/test_helper/safe_erb/script_interpolation_test.rb +0 -149
  58. data/test/better_html/test_helper/safe_erb/tag_interpolation_test.rb +0 -303
  59. data/test/better_html/test_helper/safe_lodash_tester_test.rb +0 -90
  60. data/test/better_html/tokenizer/html_erb_test.rb +0 -180
  61. data/test/better_html/tokenizer/html_lodash_test.rb +0 -98
  62. data/test/better_html/tokenizer/location_test.rb +0 -75
  63. data/test/better_html/tokenizer/token_array_test.rb +0 -146
  64. data/test/better_html/tokenizer/token_test.rb +0 -15
  65. data/test/dummy/README.rdoc +0 -28
  66. data/test/dummy/Rakefile +0 -6
  67. data/test/dummy/app/assets/javascripts/application.js +0 -13
  68. data/test/dummy/app/assets/stylesheets/application.css +0 -15
  69. data/test/dummy/app/controllers/application_controller.rb +0 -5
  70. data/test/dummy/app/helpers/application_helper.rb +0 -2
  71. data/test/dummy/app/views/layouts/application.html.erb +0 -14
  72. data/test/dummy/bin/bundle +0 -3
  73. data/test/dummy/bin/rails +0 -4
  74. data/test/dummy/bin/rake +0 -4
  75. data/test/dummy/bin/setup +0 -29
  76. data/test/dummy/config/application.rb +0 -26
  77. data/test/dummy/config/boot.rb +0 -5
  78. data/test/dummy/config/database.yml +0 -25
  79. data/test/dummy/config/environment.rb +0 -5
  80. data/test/dummy/config/environments/development.rb +0 -41
  81. data/test/dummy/config/environments/production.rb +0 -79
  82. data/test/dummy/config/environments/test.rb +0 -42
  83. data/test/dummy/config/initializers/assets.rb +0 -11
  84. data/test/dummy/config/initializers/backtrace_silencers.rb +0 -7
  85. data/test/dummy/config/initializers/cookies_serializer.rb +0 -3
  86. data/test/dummy/config/initializers/filter_parameter_logging.rb +0 -4
  87. data/test/dummy/config/initializers/inflections.rb +0 -16
  88. data/test/dummy/config/initializers/mime_types.rb +0 -4
  89. data/test/dummy/config/initializers/session_store.rb +0 -3
  90. data/test/dummy/config/initializers/wrap_parameters.rb +0 -14
  91. data/test/dummy/config/locales/en.yml +0 -23
  92. data/test/dummy/config/routes.rb +0 -56
  93. data/test/dummy/config/secrets.yml +0 -22
  94. data/test/dummy/config.ru +0 -4
  95. data/test/dummy/public/404.html +0 -67
  96. data/test/dummy/public/422.html +0 -67
  97. data/test/dummy/public/500.html +0 -66
  98. data/test/dummy/public/favicon.ico +0 -0
  99. data/test/test_helper.rb +0 -29
@@ -0,0 +1,717 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include "html_tokenizer.h"
4
+ #include "tokenizer.h"
5
+
6
+ static VALUE cTokenizer = Qnil; /* Ruby class handle for "Tokenizer"; stays Qnil until Init_html_tokenizer_tokenizer assigns it. */
7
+
8
/*
 * GC mark callback registered in ht_tokenizer_data_type.
 * Intentionally does nothing: this function marks no Ruby objects.
 */
static void tokenizer_mark(void *ptr)
{
  (void)ptr;
}
10
+
11
+ static void tokenizer_free(void *ptr)
12
+ {
13
+ struct tokenizer_t *tk = ptr;
14
+ if(tk) {
15
+ tokenizer_free_members(tk);
16
+ DBG_PRINT("tk=%p xfree(tk)", tk);
17
+ xfree(tk);
18
+ }
19
+ }
20
+
21
+ static size_t tokenizer_memsize(const void *ptr)
22
+ {
23
+ return ptr ? sizeof(struct tokenizer_t) : 0;
24
+ }
25
+
26
+ const rb_data_type_t ht_tokenizer_data_type = {
27
+ "ht_tokenizer_data_type",
28
+ { tokenizer_mark, tokenizer_free, tokenizer_memsize, },
29
+ #if defined(RUBY_TYPED_FREE_IMMEDIATELY)
30
+ NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
31
+ #endif
32
+ };
33
+
34
+ static VALUE tokenizer_allocate(VALUE klass)
35
+ {
36
+ VALUE obj;
37
+ struct tokenizer_t *tokenizer = NULL;
38
+
39
+ obj = TypedData_Make_Struct(klass, struct tokenizer_t, &ht_tokenizer_data_type, tokenizer);
40
+ DBG_PRINT("tk=%p allocate", tokenizer);
41
+
42
+ memset((void *)&tokenizer->context, TOKENIZER_NONE, sizeof(struct tokenizer_t));
43
+
44
+ return obj;
45
+ }
46
+
47
+ void tokenizer_init(struct tokenizer_t *tk)
48
+ {
49
+ tk->current_context = 0;
50
+ tk->context[0] = TOKENIZER_HTML;
51
+
52
+ tk->scan.string = NULL;
53
+ tk->scan.cursor = 0;
54
+ tk->scan.length = 0;
55
+ tk->scan.mb_cursor = 0;
56
+ tk->scan.enc_index = 0;
57
+
58
+ tk->attribute_value_start = 0;
59
+ tk->found_attribute = 0;
60
+ tk->current_tag = NULL;
61
+ tk->is_closing_tag = 0;
62
+ tk->last_token = TOKEN_NONE;
63
+ tk->callback_data = NULL;
64
+ tk->f_callback = NULL;
65
+
66
+ return;
67
+ }
68
+
69
+ void tokenizer_free_members(struct tokenizer_t *tk)
70
+ {
71
+ if(tk->current_tag) {
72
+ DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
73
+ xfree(tk->current_tag);
74
+ tk->current_tag = NULL;
75
+ }
76
+ if(tk->scan.string) {
77
+ DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
78
+ xfree(tk->scan.string);
79
+ tk->scan.string = NULL;
80
+ }
81
+ return;
82
+ }
83
+
84
+ VALUE token_type_to_symbol(enum token_type type)
85
+ {
86
+ switch(type) {
87
+ case TOKEN_NONE:
88
+ return ID2SYM(rb_intern("none"));
89
+ case TOKEN_TEXT:
90
+ return ID2SYM(rb_intern("text"));
91
+ case TOKEN_WHITESPACE:
92
+ return ID2SYM(rb_intern("whitespace"));
93
+ case TOKEN_COMMENT_START:
94
+ return ID2SYM(rb_intern("comment_start"));
95
+ case TOKEN_COMMENT_END:
96
+ return ID2SYM(rb_intern("comment_end"));
97
+ case TOKEN_TAG_NAME:
98
+ return ID2SYM(rb_intern("tag_name"));
99
+ case TOKEN_TAG_START:
100
+ return ID2SYM(rb_intern("tag_start"));
101
+ case TOKEN_TAG_END:
102
+ return ID2SYM(rb_intern("tag_end"));
103
+ case TOKEN_ATTRIBUTE_NAME:
104
+ return ID2SYM(rb_intern("attribute_name"));
105
+ case TOKEN_ATTRIBUTE_QUOTED_VALUE_START:
106
+ return ID2SYM(rb_intern("attribute_quoted_value_start"));
107
+ case TOKEN_ATTRIBUTE_QUOTED_VALUE:
108
+ return ID2SYM(rb_intern("attribute_quoted_value"));
109
+ case TOKEN_ATTRIBUTE_QUOTED_VALUE_END:
110
+ return ID2SYM(rb_intern("attribute_quoted_value_end"));
111
+ case TOKEN_ATTRIBUTE_UNQUOTED_VALUE:
112
+ return ID2SYM(rb_intern("attribute_unquoted_value"));
113
+ case TOKEN_CDATA_START:
114
+ return ID2SYM(rb_intern("cdata_start"));
115
+ case TOKEN_CDATA_END:
116
+ return ID2SYM(rb_intern("cdata_end"));
117
+ case TOKEN_SOLIDUS:
118
+ return ID2SYM(rb_intern("solidus"));
119
+ case TOKEN_EQUAL:
120
+ return ID2SYM(rb_intern("equal"));
121
+ case TOKEN_MALFORMED:
122
+ return ID2SYM(rb_intern("malformed"));
123
+ }
124
+ return Qnil;
125
+ }
126
+
127
+ static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
128
+ {
129
+ rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
130
+ const char *buf = tk->scan.string + tk->scan.cursor;
131
+ return rb_enc_strlen(buf, buf + length, enc);
132
+ }
133
+
134
+ static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
135
+ {
136
+ long unsigned int mb_length = tokenizer_mblength(tk, length);
137
+ tk->last_token = type;
138
+ rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
139
+ }
140
+
141
+ static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
142
+ {
143
+ long unsigned int mb_length = tokenizer_mblength(tk, length);
144
+ if(tk->f_callback)
145
+ tk->f_callback(tk, type, length, tk->callback_data);
146
+ tk->scan.cursor += length;
147
+ tk->scan.mb_cursor += mb_length;
148
+ }
149
+
150
+ static VALUE tokenizer_initialize_method(VALUE self)
151
+ {
152
+ struct tokenizer_t *tk = NULL;
153
+
154
+ Tokenizer_Get_Struct(self, tk);
155
+ DBG_PRINT("tk=%p initialize", tk);
156
+
157
+ tokenizer_init(tk);
158
+ tk->f_callback = tokenizer_yield_tag;
159
+
160
+ return Qnil;
161
+ }
162
+
163
+ static inline int eos(struct scan_t *scan)
164
+ {
165
+ return scan->cursor >= scan->length;
166
+ }
167
+
168
+ static inline long unsigned int length_remaining(struct scan_t *scan)
169
+ {
170
+ return scan->length - scan->cursor;
171
+ }
172
+
173
+ static inline void push_context(struct tokenizer_t *tk, enum tokenizer_context ctx)
174
+ {
175
+ tk->context[++tk->current_context] = ctx;
176
+ }
177
+
178
+ static inline void pop_context(struct tokenizer_t *tk)
179
+ {
180
+ tk->context[tk->current_context--] = TOKENIZER_NONE;
181
+ }
182
+
183
+ static int is_text(struct scan_t *scan, long unsigned int *length)
184
+ {
185
+ long unsigned int i;
186
+
187
+ *length = 0;
188
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
189
+ if(scan->string[i] == '<')
190
+ break;
191
+ }
192
+ return *length != 0;
193
+ }
194
+
195
+ static inline int is_comment_start(struct scan_t *scan)
196
+ {
197
+ return (length_remaining(scan) >= 4) &&
198
+ !strncmp((const char *)&scan->string[scan->cursor], "<!--", 4);
199
+ }
200
+
201
+ static inline int is_doctype(struct scan_t *scan)
202
+ {
203
+ return (length_remaining(scan) >= 9) &&
204
+ !strncasecmp((const char *)&scan->string[scan->cursor], "<!DOCTYPE", 9);
205
+ }
206
+
207
+ static inline int is_cdata_start(struct scan_t *scan)
208
+ {
209
+ return (length_remaining(scan) >= 9) &&
210
+ !strncasecmp((const char *)&scan->string[scan->cursor], "<![CDATA[", 9);
211
+ }
212
+
213
+ static inline int is_char(struct scan_t *scan, const char c)
214
+ {
215
+ return (length_remaining(scan) >= 1) && (scan->string[scan->cursor] == c);
216
+ }
217
+
218
/*
 * ASCII-only alphanumeric test: returns 1 for [0-9A-Za-z], 0 otherwise.
 * Locale independent, unlike isalnum().
 */
static inline int is_alnum(const char c)
{
  if(c >= '0' && c <= '9') {
    return 1;
  }
  if(c >= 'A' && c <= 'Z') {
    return 1;
  }
  return c >= 'a' && c <= 'z';
}
222
+
223
+ static int is_tag_start(struct scan_t *scan, long unsigned int *length,
224
+ int *closing_tag, const char **tag_name, long unsigned int *tag_name_length)
225
+ {
226
+ long unsigned int i, start;
227
+
228
+ if(scan->string[scan->cursor] != '<')
229
+ return 0;
230
+
231
+ *length = 1;
232
+
233
+ if(scan->string[scan->cursor+1] == '/') {
234
+ *closing_tag = 1;
235
+ (*length)++;
236
+ } else {
237
+ *closing_tag = 0;
238
+ }
239
+
240
+ *tag_name = &scan->string[scan->cursor + (*length)];
241
+ start = *length;
242
+ for(i = scan->cursor + (*length);i < scan->length; i++, (*length)++) {
243
+ if(!is_alnum(scan->string[i]) && scan->string[i] != ':')
244
+ break;
245
+ }
246
+
247
+ *tag_name_length = *length - start;
248
+ return 1;
249
+ }
250
+
251
+ static int is_tag_name(struct scan_t *scan, const char **tag_name, unsigned long int *tag_name_length)
252
+ {
253
+ long unsigned int i;
254
+
255
+ *tag_name_length = 0;
256
+ *tag_name = &scan->string[scan->cursor];
257
+ for(i = scan->cursor;i < scan->length; i++, (*tag_name_length)++) {
258
+ if(scan->string[i] == ' ' || scan->string[i] == '\t' ||
259
+ scan->string[i] == '\r' || scan->string[i] == '\n' ||
260
+ scan->string[i] == '>' || scan->string[i] == '/')
261
+ break;
262
+ }
263
+
264
+ return *tag_name_length != 0;
265
+ }
266
+
267
+ static int is_whitespace(struct scan_t *scan, unsigned long int *length)
268
+ {
269
+ long unsigned int i;
270
+
271
+ *length = 0;
272
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
273
+ if(scan->string[i] != ' ' && scan->string[i] != '\t' &&
274
+ scan->string[i] != '\r' && scan->string[i] != '\n')
275
+ break;
276
+ }
277
+ return *length != 0;
278
+ }
279
+
280
+ static int is_attribute_name(struct scan_t *scan, unsigned long int *length)
281
+ {
282
+ long unsigned int i;
283
+
284
+ *length = 0;
285
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
286
+ if(!is_alnum(scan->string[i]) && scan->string[i] != ':' &&
287
+ scan->string[i] != '-' && scan->string[i] != '_' &&
288
+ scan->string[i] != '.')
289
+ break;
290
+ }
291
+ return *length != 0;
292
+ }
293
+
294
+ static int is_unquoted_value(struct scan_t *scan, unsigned long int *length)
295
+ {
296
+ long unsigned int i;
297
+
298
+ *length = 0;
299
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
300
+ if(scan->string[i] == ' ' || scan->string[i] == '\r' ||
301
+ scan->string[i] == '\n' || scan->string[i] == '\t' ||
302
+ scan->string[i] == '>')
303
+ break;
304
+ }
305
+ return *length != 0;
306
+ }
307
+
308
+ static int is_attribute_string(struct scan_t *scan, unsigned long int *length, const char attribute_value_start)
309
+ {
310
+ long unsigned int i;
311
+
312
+ *length = 0;
313
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
314
+ if(scan->string[i] == attribute_value_start)
315
+ break;
316
+ }
317
+ return *length != 0;
318
+ }
319
+
320
+ static int is_comment_end(struct scan_t *scan, unsigned long int *length, const char **end)
321
+ {
322
+ long unsigned int i;
323
+
324
+ *length = 0;
325
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
326
+ if(i < (scan->length - 2) && scan->string[i] == '-' && scan->string[i+1] == '-' &&
327
+ scan->string[i+2] == '>') {
328
+ *end = &scan->string[i];
329
+ break;
330
+ }
331
+ }
332
+ return *length != 0;
333
+ }
334
+
335
+ static int is_cdata_end(struct scan_t *scan, unsigned long int *length, const char **end)
336
+ {
337
+ long unsigned int i;
338
+
339
+ *length = 0;
340
+ for(i = scan->cursor;i < scan->length; i++, (*length)++) {
341
+ if(i < (scan->length-2) && scan->string[i] == ']' && scan->string[i+1] == ']' &&
342
+ scan->string[i+2] == '>') {
343
+ *end = &scan->string[i];
344
+ break;
345
+ }
346
+ }
347
+ return *length != 0;
348
+ }
349
+
350
+ static int scan_html(struct tokenizer_t *tk)
351
+ {
352
+ long unsigned int length = 0;
353
+
354
+ if(is_char(&tk->scan, '<')) {
355
+ push_context(tk, TOKENIZER_OPEN_TAG);
356
+ return 1;
357
+ }
358
+ else if(is_text(&tk->scan, &length)) {
359
+ tokenizer_callback(tk, TOKEN_TEXT, length);
360
+ return 1;
361
+ }
362
+ return 0;
363
+ }
364
+
365
+ static int scan_open_tag(struct tokenizer_t *tk)
366
+ {
367
+ unsigned long int length = 0;
368
+
369
+ if(is_comment_start(&tk->scan)) {
370
+ tokenizer_callback(tk, TOKEN_COMMENT_START, 4);
371
+ pop_context(tk); // back to html
372
+ push_context(tk, TOKENIZER_COMMENT);
373
+ return 1;
374
+ }
375
+ else if(is_doctype(&tk->scan)) {
376
+ tokenizer_callback(tk, TOKEN_TAG_START, 1);
377
+ tokenizer_callback(tk, TOKEN_TAG_NAME, 8);
378
+ push_context(tk, TOKENIZER_TAG_NAME);
379
+ return 1;
380
+ }
381
+ else if(is_cdata_start(&tk->scan)) {
382
+ tokenizer_callback(tk, TOKEN_CDATA_START, 9);
383
+ pop_context(tk); // back to html
384
+ push_context(tk, TOKENIZER_CDATA);
385
+ return 1;
386
+ }
387
+ else if(is_char(&tk->scan, '<')) {
388
+ tokenizer_callback(tk, TOKEN_TAG_START, 1);
389
+ push_context(tk, TOKENIZER_SOLIDUS_OR_TAG_NAME);
390
+ return 1;
391
+ }
392
+ else if(is_whitespace(&tk->scan, &length)) {
393
+ tokenizer_callback(tk, TOKEN_WHITESPACE, length);
394
+ return 1;
395
+ }
396
+ else if(is_attribute_name(&tk->scan, &length)) {
397
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_NAME, length);
398
+ push_context(tk, TOKENIZER_ATTRIBUTE_NAME);
399
+ return 1;
400
+ }
401
+ else if(is_char(&tk->scan, '\'') || is_char(&tk->scan, '"')) {
402
+ push_context(tk, TOKENIZER_ATTRIBUTE_VALUE);
403
+ return 1;
404
+ }
405
+ else if(is_char(&tk->scan, '=')) {
406
+ tokenizer_callback(tk, TOKEN_EQUAL, 1);
407
+ push_context(tk, TOKENIZER_ATTRIBUTE_VALUE);
408
+ return 1;
409
+ }
410
+ else if(is_char(&tk->scan, '/')) {
411
+ tokenizer_callback(tk, TOKEN_SOLIDUS, 1);
412
+ return 1;
413
+ }
414
+ else if(is_char(&tk->scan, '>')) {
415
+ tokenizer_callback(tk, TOKEN_TAG_END, 1);
416
+ pop_context(tk); // pop tag context
417
+
418
+ if(tk->current_tag && !tk->is_closing_tag) {
419
+ if(!strcasecmp("title", tk->current_tag) ||
420
+ !strcasecmp("textarea", tk->current_tag)) {
421
+ push_context(tk, TOKENIZER_RCDATA);
422
+ return 1;
423
+ }
424
+ else if(!strcasecmp("style", tk->current_tag) ||
425
+ !strcasecmp("xmp", tk->current_tag) || !strcasecmp("iframe", tk->current_tag) ||
426
+ !strcasecmp("noembed", tk->current_tag) || !strcasecmp("noframes", tk->current_tag) ||
427
+ !strcasecmp("listing", tk->current_tag)) {
428
+ push_context(tk, TOKENIZER_RAWTEXT);
429
+ return 1;
430
+ }
431
+ else if(!strcasecmp("script", tk->current_tag)) {
432
+ push_context(tk, TOKENIZER_SCRIPT_DATA);
433
+ return 1;
434
+ }
435
+ else if(!strcasecmp("plaintext", tk->current_tag)) {
436
+ push_context(tk, TOKENIZER_PLAINTEXT);
437
+ return 1;
438
+ }
439
+ }
440
+ return 1;
441
+ }
442
+ return 0;
443
+ }
444
+
445
+ static int scan_solidus_or_tag_name(struct tokenizer_t *tk)
446
+ {
447
+ if(tk->current_tag)
448
+ tk->current_tag[0] = '\0';
449
+
450
+ if(is_char(&tk->scan, '/')) {
451
+ tk->is_closing_tag = 1;
452
+ tokenizer_callback(tk, TOKEN_SOLIDUS, 1);
453
+ }
454
+ else {
455
+ tk->is_closing_tag = 0;
456
+ }
457
+
458
+ pop_context(tk);
459
+ push_context(tk, TOKENIZER_TAG_NAME);
460
+ return 1;
461
+ }
462
+
463
+ static int scan_tag_name(struct tokenizer_t *tk)
464
+ {
465
+ unsigned long int length = 0, tag_name_length = 0;
466
+ const char *tag_name = NULL;
467
+ void *old;
468
+
469
+ if(is_tag_name(&tk->scan, &tag_name, &tag_name_length)) {
470
+ length = (tk->current_tag ? strlen(tk->current_tag) : 0);
471
+ old = tk->current_tag;
472
+ REALLOC_N(tk->current_tag, char, length + tag_name_length + 1);
473
+ DBG_PRINT("tk=%p realloc(tk->current_tag) %p -> %p length=%lu", tk, old,
474
+ tk->current_tag, length + tag_name_length + 1);
475
+ tk->current_tag[length] = 0;
476
+
477
+ strncat(tk->current_tag, tag_name, tag_name_length);
478
+
479
+ tokenizer_callback(tk, TOKEN_TAG_NAME, tag_name_length);
480
+ return 1;
481
+ }
482
+
483
+ pop_context(tk); // back to open_tag
484
+ return 1;
485
+ }
486
+
487
+ static int scan_attribute_name(struct tokenizer_t *tk)
488
+ {
489
+ unsigned long int length = 0;
490
+
491
+ if(is_attribute_name(&tk->scan, &length)) {
492
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_NAME, length);
493
+ return 1;
494
+ }
495
+
496
+ pop_context(tk); // back to open tag
497
+ return 1;
498
+ }
499
+
500
+ static int scan_attribute_value(struct tokenizer_t *tk)
501
+ {
502
+ unsigned long int length = 0;
503
+
504
+ if(is_whitespace(&tk->scan, &length)) {
505
+ tokenizer_callback(tk, TOKEN_WHITESPACE, length);
506
+ return 1;
507
+ }
508
+ else if(is_char(&tk->scan, '\'') || is_char(&tk->scan, '"')) {
509
+ tk->attribute_value_start = tk->scan.string[tk->scan.cursor];
510
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_QUOTED_VALUE_START, 1);
511
+ pop_context(tk); // back to open tag
512
+ push_context(tk, TOKENIZER_ATTRIBUTE_QUOTED);
513
+ return 1;
514
+ }
515
+
516
+ pop_context(tk); // back to open tag
517
+ push_context(tk, TOKENIZER_ATTRIBUTE_UNQUOTED);
518
+ return 1;
519
+ }
520
+
521
+ static int scan_attribute_unquoted(struct tokenizer_t *tk)
522
+ {
523
+ unsigned long int length = 0;
524
+
525
+ if(is_unquoted_value(&tk->scan, &length)) {
526
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_UNQUOTED_VALUE, length);
527
+ return 1;
528
+ }
529
+
530
+ pop_context(tk); // back to open tag
531
+ return 1;
532
+ }
533
+
534
+ static int scan_attribute_quoted(struct tokenizer_t *tk)
535
+ {
536
+ unsigned long int length = 0;
537
+
538
+ if(is_char(&tk->scan, tk->attribute_value_start)) {
539
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_QUOTED_VALUE_END, 1);
540
+ pop_context(tk); // back to open tag
541
+ return 1;
542
+ }
543
+ else if(is_attribute_string(&tk->scan, &length, tk->attribute_value_start)) {
544
+ tokenizer_callback(tk, TOKEN_ATTRIBUTE_QUOTED_VALUE, length);
545
+ return 1;
546
+ }
547
+ return 0;
548
+ }
549
+
550
+ static int scan_comment(struct tokenizer_t *tk)
551
+ {
552
+ unsigned long int length = 0;
553
+ const char *comment_end = NULL;
554
+
555
+ if(is_comment_end(&tk->scan, &length, &comment_end)) {
556
+ tokenizer_callback(tk, TOKEN_TEXT, length);
557
+ if(comment_end) {
558
+ tokenizer_callback(tk, TOKEN_COMMENT_END, 3);
559
+ pop_context(tk); // back to document
560
+ }
561
+ return 1;
562
+ }
563
+ else {
564
+ tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
565
+ return 1;
566
+ }
567
+ return 0;
568
+ }
569
+
570
+ static int scan_cdata(struct tokenizer_t *tk)
571
+ {
572
+ unsigned long int length = 0;
573
+ const char *cdata_end = NULL;
574
+
575
+ if(is_cdata_end(&tk->scan, &length, &cdata_end)) {
576
+ tokenizer_callback(tk, TOKEN_TEXT, length);
577
+ if(cdata_end) {
578
+ tokenizer_callback(tk, TOKEN_CDATA_END, 3);
579
+ pop_context(tk); // back to document
580
+ }
581
+ return 1;
582
+ }
583
+ else {
584
+ tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
585
+ return 1;
586
+ }
587
+ return 0;
588
+ }
589
+
590
+ static int scan_rawtext(struct tokenizer_t *tk)
591
+ {
592
+ long unsigned int length = 0, tag_name_length = 0;
593
+ const char *tag_name = NULL;
594
+ int closing_tag = 0;
595
+
596
+ if(is_tag_start(&tk->scan, &length, &closing_tag, &tag_name, &tag_name_length)) {
597
+ if(closing_tag && tk->current_tag && !strncasecmp((const char *)tag_name, tk->current_tag, tag_name_length)) {
598
+ pop_context(tk);
599
+ } else {
600
+ tokenizer_callback(tk, TOKEN_TEXT, length);
601
+ }
602
+ return 1;
603
+ }
604
+ else if(is_text(&tk->scan, &length)) {
605
+ tokenizer_callback(tk, TOKEN_TEXT, length);
606
+ return 1;
607
+ }
608
+ else {
609
+ tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
610
+ return 1;
611
+ }
612
+ return 0;
613
+ }
614
+
615
+ static int scan_plaintext(struct tokenizer_t *tk)
616
+ {
617
+ tokenizer_callback(tk, TOKEN_TEXT, length_remaining(&tk->scan));
618
+ return 1;
619
+ }
620
+
621
+ static int scan_once(struct tokenizer_t *tk)
622
+ {
623
+ switch(tk->context[tk->current_context]) {
624
+ case TOKENIZER_NONE:
625
+ break;
626
+ case TOKENIZER_HTML:
627
+ return scan_html(tk);
628
+ case TOKENIZER_OPEN_TAG:
629
+ return scan_open_tag(tk);
630
+ case TOKENIZER_SOLIDUS_OR_TAG_NAME:
631
+ return scan_solidus_or_tag_name(tk);
632
+ case TOKENIZER_TAG_NAME:
633
+ return scan_tag_name(tk);
634
+ case TOKENIZER_COMMENT:
635
+ return scan_comment(tk);
636
+ case TOKENIZER_CDATA:
637
+ return scan_cdata(tk);
638
+ case TOKENIZER_RCDATA:
639
+ case TOKENIZER_RAWTEXT:
640
+ case TOKENIZER_SCRIPT_DATA:
641
+ /* we don't consume character references so all
642
+ of these states are effectively the same */
643
+ return scan_rawtext(tk);
644
+ case TOKENIZER_PLAINTEXT:
645
+ return scan_plaintext(tk);
646
+ case TOKENIZER_ATTRIBUTE_NAME:
647
+ return scan_attribute_name(tk);
648
+ case TOKENIZER_ATTRIBUTE_VALUE:
649
+ return scan_attribute_value(tk);
650
+ case TOKENIZER_ATTRIBUTE_UNQUOTED:
651
+ return scan_attribute_unquoted(tk);
652
+ case TOKENIZER_ATTRIBUTE_QUOTED:
653
+ return scan_attribute_quoted(tk);
654
+ }
655
+ return 0;
656
+ }
657
+
658
+ void tokenizer_scan_all(struct tokenizer_t *tk)
659
+ {
660
+ while(!eos(&tk->scan) && scan_once(tk)) {}
661
+ if(!eos(&tk->scan)) {
662
+ tokenizer_callback(tk, TOKEN_MALFORMED, length_remaining(&tk->scan));
663
+ }
664
+ return;
665
+ }
666
+
667
+ void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
668
+ {
669
+ const char *old = tk->scan.string;
670
+ REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
671
+ DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
672
+ tk->scan.string, length + 1);
673
+ if(string && length > 0) {
674
+ strncpy(tk->scan.string, string, length);
675
+ tk->scan.string[length] = 0;
676
+ }
677
+ tk->scan.length = length;
678
+ return;
679
+ }
680
+
681
/* Release the scan buffer by setting an empty (NULL, 0) scan string. */
void tokenizer_free_scan_string(struct tokenizer_t *tk)
{
  const char *const no_string = NULL;

  tokenizer_set_scan_string(tk, no_string, 0);
}
686
+
687
+ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
688
+ {
689
+ struct tokenizer_t *tk = NULL;
690
+ char *c_source;
691
+
692
+ if(NIL_P(source))
693
+ return Qnil;
694
+
695
+ Check_Type(source, T_STRING);
696
+ Tokenizer_Get_Struct(self, tk);
697
+
698
+ c_source = StringValueCStr(source);
699
+ tk->scan.cursor = 0;
700
+ tokenizer_set_scan_string(tk, c_source, strlen(c_source));
701
+ tk->scan.enc_index = rb_enc_get_index(source);
702
+ tk->scan.mb_cursor = 0;
703
+
704
+ tokenizer_scan_all(tk);
705
+
706
+ tokenizer_free_scan_string(tk);
707
+
708
+ return Qtrue;
709
+ }
710
+
711
+ void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer)
712
+ {
713
+ cTokenizer = rb_define_class_under(mHtmlTokenizer, "Tokenizer", rb_cObject);
714
+ rb_define_alloc_func(cTokenizer, tokenizer_allocate);
715
+ rb_define_method(cTokenizer, "initialize", tokenizer_initialize_method, 0);
716
+ rb_define_method(cTokenizer, "tokenize", tokenizer_tokenize_method, 1);
717
+ }