better_html 1.0.16 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (99) hide show
  1. checksums.yaml +4 -4
  2. data/MIT-LICENSE +1 -0
  3. data/Rakefile +19 -14
  4. data/ext/better_html_ext/better_html.h +1 -0
  5. data/ext/better_html_ext/extconf.rb +16 -0
  6. data/ext/better_html_ext/html_tokenizer.c +12 -0
  7. data/ext/better_html_ext/html_tokenizer.h +7 -0
  8. data/ext/better_html_ext/parser.c +793 -0
  9. data/ext/better_html_ext/parser.h +93 -0
  10. data/ext/better_html_ext/tokenizer.c +717 -0
  11. data/ext/better_html_ext/tokenizer.h +80 -0
  12. data/lib/better_html/ast/iterator.rb +14 -9
  13. data/lib/better_html/ast/node.rb +4 -2
  14. data/lib/better_html/better_erb/erubi_implementation.rb +43 -39
  15. data/lib/better_html/better_erb/runtime_checks.rb +140 -133
  16. data/lib/better_html/better_erb/validated_output_buffer.rb +30 -22
  17. data/lib/better_html/better_erb.rb +58 -54
  18. data/lib/better_html/config.rb +7 -4
  19. data/lib/better_html/errors.rb +4 -2
  20. data/lib/better_html/helpers.rb +7 -3
  21. data/lib/better_html/html_attributes.rb +6 -2
  22. data/lib/better_html/parser.rb +21 -14
  23. data/lib/better_html/railtie.rb +8 -4
  24. data/lib/better_html/test_helper/ruby_node.rb +15 -10
  25. data/lib/better_html/test_helper/safe_erb/allowed_script_type.rb +8 -4
  26. data/lib/better_html/test_helper/safe_erb/base.rb +12 -9
  27. data/lib/better_html/test_helper/safe_erb/no_javascript_tag_helper.rb +7 -3
  28. data/lib/better_html/test_helper/safe_erb/no_statements.rb +7 -3
  29. data/lib/better_html/test_helper/safe_erb/script_interpolation.rb +9 -4
  30. data/lib/better_html/test_helper/safe_erb/tag_interpolation.rb +23 -20
  31. data/lib/better_html/test_helper/safe_erb_tester.rb +33 -31
  32. data/lib/better_html/test_helper/safe_lodash_tester.rb +36 -35
  33. data/lib/better_html/test_helper/safety_error.rb +2 -0
  34. data/lib/better_html/tokenizer/base_erb.rb +14 -10
  35. data/lib/better_html/tokenizer/html_erb.rb +3 -2
  36. data/lib/better_html/tokenizer/html_lodash.rb +22 -14
  37. data/lib/better_html/tokenizer/javascript_erb.rb +3 -1
  38. data/lib/better_html/tokenizer/location.rb +17 -6
  39. data/lib/better_html/tokenizer/token.rb +2 -0
  40. data/lib/better_html/tokenizer/token_array.rb +8 -8
  41. data/lib/better_html/tree/attribute.rb +10 -6
  42. data/lib/better_html/tree/attributes_list.rb +9 -5
  43. data/lib/better_html/tree/tag.rb +10 -6
  44. data/lib/better_html/version.rb +3 -1
  45. data/lib/better_html.rb +19 -17
  46. data/lib/tasks/better_html_tasks.rake +1 -0
  47. metadata +39 -147
  48. data/lib/better_html/better_erb/erubis_implementation.rb +0 -44
  49. data/test/better_html/better_erb/implementation_test.rb +0 -406
  50. data/test/better_html/errors_test.rb +0 -13
  51. data/test/better_html/helpers_test.rb +0 -49
  52. data/test/better_html/parser_test.rb +0 -314
  53. data/test/better_html/test_helper/ruby_node_test.rb +0 -288
  54. data/test/better_html/test_helper/safe_erb/allowed_script_type_test.rb +0 -46
  55. data/test/better_html/test_helper/safe_erb/no_javascript_tag_helper_test.rb +0 -37
  56. data/test/better_html/test_helper/safe_erb/no_statements_test.rb +0 -129
  57. data/test/better_html/test_helper/safe_erb/script_interpolation_test.rb +0 -149
  58. data/test/better_html/test_helper/safe_erb/tag_interpolation_test.rb +0 -303
  59. data/test/better_html/test_helper/safe_lodash_tester_test.rb +0 -90
  60. data/test/better_html/tokenizer/html_erb_test.rb +0 -180
  61. data/test/better_html/tokenizer/html_lodash_test.rb +0 -98
  62. data/test/better_html/tokenizer/location_test.rb +0 -75
  63. data/test/better_html/tokenizer/token_array_test.rb +0 -146
  64. data/test/better_html/tokenizer/token_test.rb +0 -15
  65. data/test/dummy/README.rdoc +0 -28
  66. data/test/dummy/Rakefile +0 -6
  67. data/test/dummy/app/assets/javascripts/application.js +0 -13
  68. data/test/dummy/app/assets/stylesheets/application.css +0 -15
  69. data/test/dummy/app/controllers/application_controller.rb +0 -5
  70. data/test/dummy/app/helpers/application_helper.rb +0 -2
  71. data/test/dummy/app/views/layouts/application.html.erb +0 -14
  72. data/test/dummy/bin/bundle +0 -3
  73. data/test/dummy/bin/rails +0 -4
  74. data/test/dummy/bin/rake +0 -4
  75. data/test/dummy/bin/setup +0 -29
  76. data/test/dummy/config/application.rb +0 -26
  77. data/test/dummy/config/boot.rb +0 -5
  78. data/test/dummy/config/database.yml +0 -25
  79. data/test/dummy/config/environment.rb +0 -5
  80. data/test/dummy/config/environments/development.rb +0 -41
  81. data/test/dummy/config/environments/production.rb +0 -79
  82. data/test/dummy/config/environments/test.rb +0 -42
  83. data/test/dummy/config/initializers/assets.rb +0 -11
  84. data/test/dummy/config/initializers/backtrace_silencers.rb +0 -7
  85. data/test/dummy/config/initializers/cookies_serializer.rb +0 -3
  86. data/test/dummy/config/initializers/filter_parameter_logging.rb +0 -4
  87. data/test/dummy/config/initializers/inflections.rb +0 -16
  88. data/test/dummy/config/initializers/mime_types.rb +0 -4
  89. data/test/dummy/config/initializers/session_store.rb +0 -3
  90. data/test/dummy/config/initializers/wrap_parameters.rb +0 -14
  91. data/test/dummy/config/locales/en.yml +0 -23
  92. data/test/dummy/config/routes.rb +0 -56
  93. data/test/dummy/config/secrets.yml +0 -22
  94. data/test/dummy/config.ru +0 -4
  95. data/test/dummy/public/404.html +0 -67
  96. data/test/dummy/public/422.html +0 -67
  97. data/test/dummy/public/500.html +0 -66
  98. data/test/dummy/public/favicon.ico +0 -0
  99. data/test/test_helper.rb +0 -29
@@ -0,0 +1,793 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ #include "html_tokenizer.h"
4
+ #include "parser.h"
5
+
6
+ static VALUE cParser = Qnil;
7
+
8
/*
 * GC mark callback registered in ht_parser_data_type.
 * Deliberately a no-op: this function marks nothing.
 */
static void parser_mark(void *ptr)
{
  (void)ptr;
}
10
+
11
+ static void parser_free(void *ptr)
12
+ {
13
+ struct parser_t *parser = ptr;
14
+ size_t i;
15
+
16
+ if(parser) {
17
+ tokenizer_free_members(&parser->tk);
18
+ if(parser->doc.data) {
19
+ DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
20
+ xfree(parser->doc.data);
21
+ parser->doc.data = NULL;
22
+ }
23
+ if(parser->errors_count && parser->errors) {
24
+ for(i=0; i<parser->errors_count; i++) {
25
+ if(!parser->errors[i].message)
26
+ continue;
27
+ DBG_PRINT("parser=%p xfree(parser->errors.messages[%u]) %p", parser, i, parser->errors[i].message);
28
+ xfree(parser->errors[i].message);
29
+ parser->errors[i].message = NULL;
30
+ }
31
+ DBG_PRINT("parser=%p xfree(parser->errors.messages) %p", parser, parser->errors);
32
+ xfree(parser->errors);
33
+ parser->errors = NULL;
34
+ parser->errors_count = 0;
35
+ }
36
+ DBG_PRINT("parser=%p xfree(parser)", parser);
37
+ xfree(parser);
38
+ }
39
+ }
40
+
41
+ static size_t parser_memsize(const void *ptr)
42
+ {
43
+ return ptr ? sizeof(struct parser_t) : 0;
44
+ }
45
+
46
+ const rb_data_type_t ht_parser_data_type = {
47
+ "ht_parser_data_type",
48
+ { parser_mark, parser_free, parser_memsize, },
49
+ #if defined(RUBY_TYPED_FREE_IMMEDIATELY)
50
+ NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
51
+ #endif
52
+ };
53
+
54
+ static VALUE parser_allocate(VALUE klass)
55
+ {
56
+ VALUE obj;
57
+ struct parser_t *parser = NULL;
58
+
59
+ obj = TypedData_Make_Struct(klass, struct parser_t, &ht_parser_data_type, parser);
60
+ DBG_PRINT("parser=%p allocate", parser);
61
+
62
+ return obj;
63
+ }
64
+
65
+ static inline void parser_append_ref(struct token_reference_t *dest, struct token_reference_t *src)
66
+ {
67
+ if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
68
+ dest->type = src->type;
69
+ dest->start = src->start;
70
+ dest->mb_start = src->mb_start;
71
+ dest->length = src->length;
72
+ dest->line_number = src->line_number;
73
+ dest->column_number = src->column_number;
74
+ }
75
+ else {
76
+ dest->type = src->type;
77
+ dest->length += src->length;
78
+ }
79
+ }
80
+
81
+ static void parser_add_error(struct parser_t *parser, const char *message)
82
+ {
83
+ REALLOC_N(parser->errors, struct parser_document_error_t, parser->errors_count + 1);
84
+ parser->errors[parser->errors_count].message = strdup(message);
85
+ parser->errors[parser->errors_count].pos = parser->tk.scan.cursor;
86
+ parser->errors[parser->errors_count].mb_pos = parser->tk.scan.mb_cursor;
87
+ parser->errors[parser->errors_count].line_number = parser->doc.line_number;
88
+ parser->errors[parser->errors_count].column_number = parser->doc.column_number;
89
+ parser->errors_count += 1;
90
+ return;
91
+ }
92
+
93
+ static int parse_none(struct parser_t *parser, struct token_reference_t *ref)
94
+ {
95
+ if(ref->type == TOKEN_TAG_START) {
96
+ parser->tag.self_closing = 0;
97
+ parser->context = PARSER_SOLIDUS_OR_TAG_NAME;
98
+ parser->tag.name.type = TOKEN_NONE;
99
+ }
100
+ else if(ref->type == TOKEN_COMMENT_START) {
101
+ parser->context = PARSER_COMMENT;
102
+ parser->comment.text.type = TOKEN_NONE;
103
+ }
104
+ else if(ref->type == TOKEN_CDATA_START) {
105
+ parser->context = PARSER_CDATA;
106
+ parser->cdata.text.type = TOKEN_NONE;
107
+ }
108
+ PARSE_DONE;
109
+ }
110
+
111
+ static int parse_rawtext(struct parser_t *parser, struct token_reference_t *ref)
112
+ {
113
+ if(ref->type == TOKEN_TEXT) {
114
+ parser_append_ref(&parser->rawtext.text, ref);
115
+ }
116
+ else {
117
+ parser->context = PARSER_NONE;
118
+ parse_none(parser, ref);
119
+ }
120
+ PARSE_DONE;
121
+ }
122
+
123
+ static int parse_comment(struct parser_t *parser, struct token_reference_t *ref)
124
+ {
125
+ if(ref->type == TOKEN_COMMENT_END) {
126
+ parser->context = PARSER_NONE;
127
+ }
128
+ else if(ref->type == TOKEN_TEXT) {
129
+ parser_append_ref(&parser->comment.text, ref);
130
+ }
131
+ PARSE_DONE;
132
+ }
133
+
134
+ static int parse_cdata(struct parser_t *parser, struct token_reference_t *ref)
135
+ {
136
+ if(ref->type == TOKEN_CDATA_END) {
137
+ parser->context = PARSER_NONE;
138
+ }
139
+ else if(ref->type == TOKEN_TEXT) {
140
+ parser_append_ref(&parser->cdata.text, ref);
141
+ }
142
+ PARSE_DONE;
143
+ }
144
+
145
+ static int parse_solidus_or_tag_name(struct parser_t *parser, struct token_reference_t *ref)
146
+ {
147
+ if(ref->type == TOKEN_SOLIDUS) {
148
+ // ignore solidus before tag name
149
+ parser->context = PARSER_TAG_NAME;
150
+ }
151
+ else if(ref->type == TOKEN_TAG_NAME) {
152
+ parser->context = PARSER_TAG_NAME;
153
+ PARSE_AGAIN;
154
+ }
155
+ else {
156
+ parser_add_error(parser, "expected '/' or tag name");
157
+ parser->context = PARSER_TAG;
158
+ PARSE_AGAIN;
159
+ }
160
+ PARSE_DONE;
161
+ }
162
+
163
+ static int parse_tag_name(struct parser_t *parser, struct token_reference_t *ref)
164
+ {
165
+ if(ref->type == TOKEN_TAG_NAME) {
166
+ parser_append_ref(&parser->tag.name, ref);
167
+ }
168
+ else if(ref->type == TOKEN_WHITESPACE) {
169
+ parser->context = PARSER_TAG;
170
+ }
171
+ else if(ref->type == TOKEN_TAG_END) {
172
+ parser->context = PARSER_NONE;
173
+ }
174
+ else if(ref->type == TOKEN_SOLIDUS) {
175
+ parser->context = PARSER_TAG;
176
+ PARSE_AGAIN;
177
+ }
178
+ else {
179
+ // not reachable
180
+ rb_raise(rb_eArgError, "expected whitespace, '/' or '>' after tag name");
181
+ }
182
+ PARSE_DONE;
183
+ }
184
+
185
+ static int parse_tag(struct parser_t *parser, struct token_reference_t *ref)
186
+ {
187
+ if(ref->type == TOKEN_TAG_END) {
188
+ parser->context = PARSER_NONE;
189
+ }
190
+ else if(ref->type == TOKEN_WHITESPACE) {
191
+ // ignore whitespaces
192
+ }
193
+ else if(ref->type == TOKEN_SOLIDUS) {
194
+ parser->context = PARSER_TAG_END;
195
+ }
196
+ else if(ref->type == TOKEN_ATTRIBUTE_NAME) {
197
+ parser->context = PARSER_ATTRIBUTE_NAME;
198
+ parser->attribute.name.type = TOKEN_NONE;
199
+ parser->attribute.value.type = TOKEN_NONE;
200
+ parser->attribute.is_quoted = 0;
201
+ PARSE_AGAIN;
202
+ }
203
+ else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_START) {
204
+ parser->context = PARSER_ATTRIBUTE_QUOTED_VALUE;
205
+ parser->attribute.name.type = TOKEN_NONE;
206
+ parser->attribute.value.type = TOKEN_NONE;
207
+ parser->attribute.is_quoted = 1;
208
+ }
209
+ else {
210
+ // unexpected
211
+ parser_add_error(parser, "expected whitespace, '>', attribute name or value");
212
+ }
213
+ PARSE_DONE;
214
+ }
215
+
216
+ static int parse_tag_end(struct parser_t *parser, struct token_reference_t *ref)
217
+ {
218
+ if(ref->type == TOKEN_TAG_END) {
219
+ parser->tag.self_closing = 1;
220
+ parser->context = PARSER_NONE;
221
+ }
222
+ else {
223
+ parser_add_error(parser, "expected '>' after '/'");
224
+ parser->context = PARSER_TAG;
225
+ PARSE_AGAIN;
226
+ }
227
+ PARSE_DONE;
228
+ }
229
+
230
+ static int parse_attribute_name(struct parser_t *parser, struct token_reference_t *ref)
231
+ {
232
+ if(ref->type == TOKEN_ATTRIBUTE_NAME) {
233
+ parser_append_ref(&parser->attribute.name, ref);
234
+ }
235
+ else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
236
+ parser->context = PARSER_TAG;
237
+ PARSE_AGAIN;
238
+ }
239
+ else if(ref->type == TOKEN_WHITESPACE) {
240
+ parser->context = PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL;
241
+ PARSE_AGAIN;
242
+ }
243
+ else if(ref->type == TOKEN_EQUAL) {
244
+ parser->context = PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE;
245
+ }
246
+ else {
247
+ parser_add_error(parser, "expected whitespace, '>' or '=' after attribute name");
248
+ parser->context = PARSER_TAG;
249
+ PARSE_AGAIN;
250
+ }
251
+ PARSE_DONE;
252
+ }
253
+
254
+ static int parse_attribute_whitespace_or_equal(struct parser_t *parser, struct token_reference_t *ref)
255
+ {
256
+ if(ref->type == TOKEN_WHITESPACE) {
257
+ // swallow whitespace after attribute name
258
+ }
259
+ else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
260
+ parser->context = PARSER_TAG;
261
+ PARSE_AGAIN;
262
+ }
263
+ else if(ref->type == TOKEN_EQUAL) {
264
+ parser->context = PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE;
265
+ }
266
+ else if(ref->type == TOKEN_ATTRIBUTE_NAME) {
267
+ // start new attribute after whitespace
268
+ parser->context = PARSER_TAG;
269
+ PARSE_AGAIN;
270
+ }
271
+ else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_START) {
272
+ // start quoted value after whitespace
273
+ parser->context = PARSER_TAG;
274
+ PARSE_AGAIN;
275
+ }
276
+ else {
277
+ parser_add_error(parser, "expected '/', '>', \", ' or '=' after attribute name");
278
+ parser->context = PARSER_TAG;
279
+ PARSE_AGAIN;
280
+ }
281
+
282
+ PARSE_DONE;
283
+ }
284
+
285
+ static int parse_attribute_whitespace_or_value(struct parser_t *parser, struct token_reference_t *ref)
286
+ {
287
+ if(ref->type == TOKEN_WHITESPACE) {
288
+ // swallow whitespace after equal sign
289
+ }
290
+ else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_START) {
291
+ parser->context = PARSER_ATTRIBUTE_QUOTED_VALUE;
292
+ parser->attribute.is_quoted = 1;
293
+ }
294
+ else if(ref->type == TOKEN_ATTRIBUTE_UNQUOTED_VALUE) {
295
+ parser->context = PARSER_ATTRIBUTE_UNQUOTED_VALUE;
296
+ PARSE_AGAIN;
297
+ }
298
+ else {
299
+ parser_add_error(parser, "expected attribute value after '='");
300
+ parser->context = PARSER_TAG;
301
+ PARSE_AGAIN;
302
+ }
303
+
304
+ PARSE_DONE;
305
+ }
306
+
307
+ static int parse_attribute_quoted_value(struct parser_t *parser, struct token_reference_t *ref)
308
+ {
309
+ if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE) {
310
+ parser_append_ref(&parser->attribute.value, ref);
311
+ }
312
+ else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_END) {
313
+ parser->context = PARSER_SPACE_AFTER_ATTRIBUTE;
314
+ }
315
+ else {
316
+ // not reachable
317
+ rb_raise(rb_eArgError, "expected end-quote after quoted value");
318
+ }
319
+
320
+ PARSE_DONE;
321
+ }
322
+
323
+ static int parse_space_after_attribute(struct parser_t *parser, struct token_reference_t *ref)
324
+ {
325
+ if(ref->type == TOKEN_WHITESPACE) {
326
+ parser->context = PARSER_TAG;
327
+ }
328
+ else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
329
+ parser->context = PARSER_TAG;
330
+ PARSE_AGAIN;
331
+ }
332
+ else {
333
+ parser_add_error(parser, "expected space after attribute value");
334
+ parser->context = PARSER_TAG;
335
+ PARSE_AGAIN;
336
+ }
337
+
338
+ PARSE_DONE;
339
+ }
340
+
341
+ static int parse_attribute_unquoted_value(struct parser_t *parser, struct token_reference_t *ref)
342
+ {
343
+ if(ref->type == TOKEN_ATTRIBUTE_UNQUOTED_VALUE) {
344
+ parser_append_ref(&parser->attribute.value, ref);
345
+ }
346
+ else if(ref->type == TOKEN_WHITESPACE) {
347
+ parser->context = PARSER_TAG;
348
+ }
349
+ else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
350
+ parser->context = PARSER_TAG;
351
+ PARSE_AGAIN;
352
+ }
353
+ else {
354
+ // not reachable
355
+ rb_raise(rb_eArgError, "expected space or end-of-tag after unquoted value");
356
+ }
357
+
358
+ PARSE_DONE;
359
+ }
360
+
361
+ static inline int rawtext_context(struct parser_t *parser)
362
+ {
363
+ enum tokenizer_context ctx = parser->tk.context[parser->tk.current_context];
364
+ return (ctx == TOKENIZER_RCDATA || ctx == TOKENIZER_RAWTEXT ||
365
+ ctx == TOKENIZER_SCRIPT_DATA || ctx == TOKENIZER_PLAINTEXT);
366
+ }
367
+
368
+ static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
369
+ {
370
+ rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
371
+ long unsigned int i;
372
+ const char *buf, *nextlf;
373
+
374
+ for(i = 0; i < length;) {
375
+ buf = &parser->doc.data[start + i];
376
+ nextlf = memchr(buf, '\n', length - i);
377
+ if(nextlf) {
378
+ parser->doc.column_number = 0;
379
+ parser->doc.line_number += 1;
380
+ i += (nextlf - buf) + 1;
381
+ }
382
+ else {
383
+ parser->doc.column_number += rb_enc_strlen(buf, buf + length - i, enc);
384
+ break;
385
+ }
386
+ }
387
+
388
+ return;
389
+ }
390
+
391
+ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type type, unsigned long int length, void *data)
392
+ {
393
+ struct parser_t *parser = (struct parser_t *)data;
394
+ struct token_reference_t ref = {
395
+ .type = type,
396
+ .start = tk->scan.cursor,
397
+ .mb_start = tk->scan.mb_cursor,
398
+ .length = length,
399
+ .line_number = parser->doc.line_number,
400
+ .column_number = parser->doc.column_number,
401
+ };
402
+ int parse_again = 1;
403
+ long unsigned int mb_strlen;
404
+ rb_encoding *enc;
405
+
406
+ while(parse_again) {
407
+ switch(parser->context)
408
+ {
409
+ case PARSER_NONE:
410
+ if(rawtext_context(parser))
411
+ parse_again = parse_rawtext(parser, &ref);
412
+ else
413
+ parse_again = parse_none(parser, &ref);
414
+ break;
415
+ case PARSER_SOLIDUS_OR_TAG_NAME:
416
+ parse_again = parse_solidus_or_tag_name(parser, &ref);
417
+ break;
418
+ case PARSER_TAG_NAME:
419
+ parse_again = parse_tag_name(parser, &ref);
420
+ break;
421
+ case PARSER_TAG:
422
+ parse_again = parse_tag(parser, &ref);
423
+ break;
424
+ case PARSER_ATTRIBUTE_NAME:
425
+ parse_again = parse_attribute_name(parser, &ref);
426
+ break;
427
+ case PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL:
428
+ parse_again = parse_attribute_whitespace_or_equal(parser, &ref);
429
+ break;
430
+ case PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE:
431
+ parse_again = parse_attribute_whitespace_or_value(parser, &ref);
432
+ break;
433
+ case PARSER_ATTRIBUTE_QUOTED_VALUE:
434
+ parse_again = parse_attribute_quoted_value(parser, &ref);
435
+ break;
436
+ case PARSER_SPACE_AFTER_ATTRIBUTE:
437
+ parse_again = parse_space_after_attribute(parser, &ref);
438
+ break;
439
+ case PARSER_ATTRIBUTE_UNQUOTED_VALUE:
440
+ parse_again = parse_attribute_unquoted_value(parser, &ref);
441
+ break;
442
+ case PARSER_TAG_END:
443
+ parse_again = parse_tag_end(parser, &ref);
444
+ break;
445
+ case PARSER_CDATA:
446
+ parse_again = parse_cdata(parser, &ref);
447
+ break;
448
+ case PARSER_COMMENT:
449
+ parse_again = parse_comment(parser, &ref);
450
+ break;
451
+ }
452
+ }
453
+
454
+ if(rb_block_given_p()) {
455
+ enc = rb_enc_from_index(parser->doc.enc_index);
456
+ mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
457
+ rb_yield_values(5, token_type_to_symbol(type),
458
+ INT2NUM(ref.mb_start), INT2NUM(ref.mb_start + mb_strlen),
459
+ INT2NUM(ref.line_number), INT2NUM(ref.column_number));
460
+ }
461
+
462
+ parser_adjust_line_number(parser, ref.start, ref.length);
463
+
464
+ return;
465
+ }
466
+
467
+ static VALUE parser_initialize_method(VALUE self)
468
+ {
469
+ struct parser_t *parser = NULL;
470
+
471
+ Parser_Get_Struct(self, parser);
472
+ DBG_PRINT("parser=%p initialize", parser);
473
+
474
+ memset(parser, 0, sizeof(struct parser_t));
475
+
476
+ parser->context = PARSER_NONE;
477
+
478
+ tokenizer_init(&parser->tk);
479
+ parser->tk.callback_data = parser;
480
+ parser->tk.f_callback = parser_tokenize_callback;
481
+
482
+ parser->doc.length = 0;
483
+ parser->doc.data = NULL;
484
+ parser->doc.enc_index = 0;
485
+ parser->doc.mb_length = 0;
486
+
487
+ parser->doc.line_number = 1;
488
+ parser->doc.column_number = 0;
489
+
490
+ parser->errors_count = 0;
491
+ parser->errors = NULL;
492
+
493
+ return Qnil;
494
+ }
495
+
496
+ static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
497
+ {
498
+ void *old = parser->doc.data;
499
+ unsigned long int mb_length;
500
+ char *buf;
501
+ rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
502
+ REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
503
+ DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
504
+ parser->doc.data, parser->doc.length + length + 1);
505
+ buf = parser->doc.data + parser->doc.length;
506
+ strcpy(buf, string);
507
+ mb_length = rb_enc_strlen(buf, buf + length, enc);
508
+ parser->doc.length += length;
509
+ parser->doc.mb_length += mb_length;
510
+ return 1;
511
+ }
512
+
513
+ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
514
+ {
515
+ struct parser_t *parser = NULL;
516
+ char *string = NULL;
517
+ long unsigned int length = 0, cursor = 0, mb_cursor = 0;
518
+
519
+ if(NIL_P(source))
520
+ return Qnil;
521
+
522
+ Check_Type(source, T_STRING);
523
+ Parser_Get_Struct(self, parser);
524
+
525
+ string = StringValueCStr(source);
526
+ length = strlen(string);
527
+
528
+ cursor = parser->doc.length;
529
+ mb_cursor = parser->doc.mb_length;
530
+
531
+ if(parser->doc.data == NULL) {
532
+ parser->doc.enc_index = rb_enc_get_index(source);
533
+ }
534
+ else if(parser->doc.enc_index != rb_enc_get_index(source)) {
535
+ rb_raise(rb_eArgError, "cannot append %s string to %s document",
536
+ rb_enc_name(rb_enc_get(source)), rb_enc_name(rb_enc_from_index(parser->doc.enc_index)));
537
+ }
538
+
539
+ if(!parser_document_append(parser, string, length)) {
540
+ // error
541
+ return Qnil;
542
+ }
543
+
544
+ if(is_placeholder) {
545
+ parser_adjust_line_number(parser, cursor, length);
546
+ }
547
+ else {
548
+ parser->tk.scan.cursor = cursor;
549
+ tokenizer_set_scan_string(&parser->tk, parser->doc.data, parser->doc.length);
550
+ parser->tk.scan.enc_index = parser->doc.enc_index;
551
+ parser->tk.scan.mb_cursor = mb_cursor;
552
+
553
+ tokenizer_scan_all(&parser->tk);
554
+ tokenizer_free_scan_string(&parser->tk);
555
+ }
556
+
557
+ return Qtrue;
558
+ }
559
+
560
+ static VALUE parser_parse_method(VALUE self, VALUE source)
561
+ {
562
+ return parser_append_data(self, source, 0);
563
+ }
564
+
565
+ static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
566
+ {
567
+ return parser_append_data(self, source, 1);
568
+ }
569
+
570
+ static VALUE parser_document_method(VALUE self)
571
+ {
572
+ struct parser_t *parser = NULL;
573
+ rb_encoding *enc;
574
+ Parser_Get_Struct(self, parser);
575
+ if(!parser->doc.data)
576
+ return Qnil;
577
+ enc = rb_enc_from_index(parser->doc.enc_index);
578
+ return rb_enc_str_new(parser->doc.data, parser->doc.length, enc);
579
+ }
580
+
581
+ static VALUE parser_document_length_method(VALUE self)
582
+ {
583
+ struct parser_t *parser = NULL;
584
+ rb_encoding *enc;
585
+ const char *buf;
586
+
587
+ Parser_Get_Struct(self, parser);
588
+
589
+ if(parser->doc.data == NULL) {
590
+ return ULONG2NUM(0);
591
+ }
592
+ else {
593
+ buf = parser->doc.data;
594
+ enc = rb_enc_from_index(parser->doc.enc_index);
595
+ return ULONG2NUM(rb_enc_strlen(buf, buf + parser->doc.length, enc));
596
+ }
597
+ }
598
+
599
+ static VALUE parser_context_method(VALUE self)
600
+ {
601
+ struct parser_t *parser = NULL;
602
+
603
+ Parser_Get_Struct(self, parser);
604
+
605
+ switch(parser->context) {
606
+ case PARSER_NONE:
607
+ return rawtext_context(parser) ? ID2SYM(rb_intern("rawtext")) : ID2SYM(rb_intern("none"));
608
+ case PARSER_SOLIDUS_OR_TAG_NAME:
609
+ return ID2SYM(rb_intern("solidus_or_tag_name"));
610
+ case PARSER_TAG_NAME:
611
+ return ID2SYM(rb_intern("tag_name"));
612
+ case PARSER_TAG:
613
+ return ID2SYM(rb_intern("tag"));
614
+ case PARSER_ATTRIBUTE_NAME:
615
+ return ID2SYM(rb_intern("attribute_name"));
616
+ case PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL:
617
+ return ID2SYM(rb_intern("after_attribute_name"));
618
+ case PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE:
619
+ return ID2SYM(rb_intern("after_equal"));
620
+ case PARSER_ATTRIBUTE_QUOTED_VALUE:
621
+ return ID2SYM(rb_intern("quoted_value"));
622
+ case PARSER_SPACE_AFTER_ATTRIBUTE:
623
+ return ID2SYM(rb_intern("space_after_attribute"));
624
+ case PARSER_ATTRIBUTE_UNQUOTED_VALUE:
625
+ return ID2SYM(rb_intern("unquoted_value"));
626
+ case PARSER_TAG_END:
627
+ return ID2SYM(rb_intern("tag_end"));
628
+ case PARSER_COMMENT:
629
+ return ID2SYM(rb_intern("comment"));
630
+ case PARSER_CDATA:
631
+ return ID2SYM(rb_intern("cdata"));
632
+ }
633
+
634
+ return Qnil;
635
+ }
636
+
637
+ static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
638
+ {
639
+ rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
640
+ if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
641
+ return Qnil;
642
+ return rb_enc_str_new(parser->doc.data+ref->start, ref->length, enc);
643
+ }
644
+
645
+ static VALUE parser_tag_name_method(VALUE self)
646
+ {
647
+ struct parser_t *parser = NULL;
648
+ Parser_Get_Struct(self, parser);
649
+ return ref_to_str(parser, &parser->tag.name);
650
+ }
651
+
652
+ static VALUE parser_closing_tag_method(VALUE self)
653
+ {
654
+ struct parser_t *parser = NULL;
655
+ Parser_Get_Struct(self, parser);
656
+ return parser->tk.is_closing_tag ? Qtrue : Qfalse;
657
+ }
658
+
659
+ static VALUE parser_self_closing_tag_method(VALUE self)
660
+ {
661
+ struct parser_t *parser = NULL;
662
+ Parser_Get_Struct(self, parser);
663
+ return parser->tag.self_closing ? Qtrue : Qfalse;
664
+ }
665
+
666
+ static VALUE parser_attribute_name_method(VALUE self)
667
+ {
668
+ struct parser_t *parser = NULL;
669
+ Parser_Get_Struct(self, parser);
670
+ return ref_to_str(parser, &parser->attribute.name);
671
+ }
672
+
673
+ static VALUE parser_attribute_value_method(VALUE self)
674
+ {
675
+ struct parser_t *parser = NULL;
676
+ Parser_Get_Struct(self, parser);
677
+ return ref_to_str(parser, &parser->attribute.value);
678
+ }
679
+
680
+ static VALUE parser_quote_character_method(VALUE self)
681
+ {
682
+ struct parser_t *parser = NULL;
683
+ Parser_Get_Struct(self, parser);
684
+ return parser->attribute.is_quoted ?
685
+ rb_str_new(&parser->tk.attribute_value_start, 1) :
686
+ Qnil;
687
+ }
688
+
689
+ static VALUE parser_attribute_is_quoted_method(VALUE self)
690
+ {
691
+ struct parser_t *parser = NULL;
692
+ Parser_Get_Struct(self, parser);
693
+ return parser->attribute.is_quoted ? Qtrue : Qfalse;
694
+ }
695
+
696
+ static VALUE parser_comment_text_method(VALUE self)
697
+ {
698
+ struct parser_t *parser = NULL;
699
+ Parser_Get_Struct(self, parser);
700
+ return ref_to_str(parser, &parser->comment.text);
701
+ }
702
+
703
+ static VALUE parser_cdata_text_method(VALUE self)
704
+ {
705
+ struct parser_t *parser = NULL;
706
+ Parser_Get_Struct(self, parser);
707
+ return ref_to_str(parser, &parser->cdata.text);
708
+ }
709
+
710
+ static VALUE parser_rawtext_text_method(VALUE self)
711
+ {
712
+ struct parser_t *parser = NULL;
713
+ Parser_Get_Struct(self, parser);
714
+ return ref_to_str(parser, &parser->rawtext.text);
715
+ }
716
+
717
+ static VALUE parser_errors_count_method(VALUE self)
718
+ {
719
+ struct parser_t *parser = NULL;
720
+ Parser_Get_Struct(self, parser);
721
+ return INT2NUM(parser->errors_count);
722
+ }
723
+
724
+ static VALUE create_parser_error(struct parser_document_error_t *error)
725
+ {
726
+ VALUE module = rb_const_get(rb_cObject, rb_intern("BetterHtml"));
727
+ VALUE klass = rb_const_get(module, rb_intern("ParserError"));
728
+ VALUE args[4] = {
729
+ rb_str_new2(error->message),
730
+ ULONG2NUM(error->mb_pos),
731
+ ULONG2NUM(error->line_number),
732
+ ULONG2NUM(error->column_number),
733
+ };
734
+ return rb_class_new_instance(4, args, klass);
735
+ }
736
+
737
+ static VALUE parser_errors_method(VALUE self)
738
+ {
739
+ struct parser_t *parser = NULL;
740
+ VALUE list;
741
+ size_t i;
742
+ Parser_Get_Struct(self, parser);
743
+
744
+ list = rb_ary_new();
745
+ for(i=0; i<parser->errors_count; i++) {
746
+ if(parser->errors[i].message) {
747
+ rb_ary_push(list, create_parser_error(&parser->errors[i]));
748
+ }
749
+ }
750
+
751
+ return list;
752
+ }
753
+
754
+ static VALUE parser_line_number_method(VALUE self)
755
+ {
756
+ struct parser_t *parser = NULL;
757
+ Parser_Get_Struct(self, parser);
758
+ return ULONG2NUM(parser->doc.line_number);
759
+ }
760
+
761
+ static VALUE parser_column_number_method(VALUE self)
762
+ {
763
+ struct parser_t *parser = NULL;
764
+ Parser_Get_Struct(self, parser);
765
+ return ULONG2NUM(parser->doc.column_number);
766
+ }
767
+
768
+ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
769
+ {
770
+ cParser = rb_define_class_under(mHtmlTokenizer, "Parser", rb_cObject);
771
+ rb_define_alloc_func(cParser, parser_allocate);
772
+ rb_define_method(cParser, "initialize", parser_initialize_method, 0);
773
+ rb_define_method(cParser, "document", parser_document_method, 0);
774
+ rb_define_method(cParser, "document_length", parser_document_length_method, 0);
775
+ rb_define_method(cParser, "line_number", parser_line_number_method, 0);
776
+ rb_define_method(cParser, "column_number", parser_column_number_method, 0);
777
+ rb_define_method(cParser, "parse", parser_parse_method, 1);
778
+ rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
779
+ rb_define_method(cParser, "context", parser_context_method, 0);
780
+ rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
781
+ rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
782
+ rb_define_method(cParser, "self_closing_tag?", parser_self_closing_tag_method, 0);
783
+ rb_define_method(cParser, "attribute_name", parser_attribute_name_method, 0);
784
+ rb_define_method(cParser, "attribute_value", parser_attribute_value_method, 0);
785
+ rb_define_method(cParser, "quote_character", parser_quote_character_method, 0);
786
+ rb_define_method(cParser, "attribute_quoted?", parser_attribute_is_quoted_method, 0);
787
+ rb_define_method(cParser, "comment_text", parser_comment_text_method, 0);
788
+ rb_define_method(cParser, "cdata_text", parser_cdata_text_method, 0);
789
+ rb_define_method(cParser, "rawtext_text", parser_rawtext_text_method, 0);
790
+
791
+ rb_define_method(cParser, "errors_count", parser_errors_count_method, 0);
792
+ rb_define_method(cParser, "errors", parser_errors_method, 0);
793
+ }