html_tokenizer 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a3d58539284af566692b81cc4633af1137baabea
4
+ data.tar.gz: 1877010598cbadadb27212eae39346769fa2afde
5
+ SHA512:
6
+ metadata.gz: 9d6e46dfd48e7bb4967cfa13e90bdd213331b6dd38f9a57739ac1317f4b8147a9f11c5a39001ebf1c36554350ec444eaccc19dfe04040dadf2fc71d92435d5d5
7
+ data.tar.gz: f8fad88c25ff9404d710d4609ffc89fe80b08a71ba864cdca91a50cb9c5c98844befb75c43f5d4c6e0c56641fba57b4fda35ddf1c485abdcaf6bcd121bb4b0be
data/.autotest ADDED
@@ -0,0 +1,3 @@
1
+ # -*- ruby -*-
2
+
3
+ require "autotest/restart"
data/.gitignore ADDED
@@ -0,0 +1,35 @@
1
+ *.bundle
2
+ tmp/
3
+
4
+ # Object files
5
+ *.o
6
+ *.ko
7
+ *.obj
8
+ *.elf
9
+
10
+ # Precompiled Headers
11
+ *.gch
12
+ *.pch
13
+
14
+ # Libraries
15
+ *.lib
16
+ *.a
17
+ *.la
18
+ *.lo
19
+
20
+ # Shared objects (inc. Windows DLLs)
21
+ *.dll
22
+ *.so
23
+ *.so.*
24
+ *.dylib
25
+
26
+ # Executables
27
+ *.exe
28
+ *.out
29
+ *.app
30
+ *.i*86
31
+ *.x86_64
32
+ *.hex
33
+
34
+ # Debug files
35
+ *.dSYM/
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
5
+ gem 'minitest'
6
+
7
+ gem 'rake'
8
+ gem 'rake-compiler'
data/Gemfile.lock ADDED
@@ -0,0 +1,24 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ html_tokenizer (0.0.1)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ minitest (5.9.0)
10
+ rake (11.1.2)
11
+ rake-compiler (0.9.9)
12
+ rake
13
+
14
+ PLATFORMS
15
+ ruby
16
+
17
+ DEPENDENCIES
18
+ html_tokenizer!
19
+ minitest
20
+ rake
21
+ rake-compiler
22
+
23
+ BUNDLED WITH
24
+ 1.12.3
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Francois Chagnon
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/Manifest.txt ADDED
@@ -0,0 +1,8 @@
1
+ .autotest
2
+ History.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ bin/html_tokenizer
7
+ lib/html_tokenizer.rb
8
+ test/test_html_tokenizer.rb
data/README.md ADDED
@@ -0,0 +1,2 @@
1
+ # html_tokenizer
2
+ An HTML tokenizer.
data/Rakefile ADDED
@@ -0,0 +1,20 @@
1
+ # -*- ruby -*-
2
+
3
+ require "rubygems"
4
+ require 'rake'
5
+ require 'rake/testtask'
6
+ require 'bundler/gem_tasks'
7
+ require 'rake/extensiontask'
8
+
9
+ Rake::ExtensionTask.new("html_tokenizer_ext")
10
+
11
+ task :default => :test
12
+
13
+ task :test => ['test:unit']
14
+
15
+ namespace :test do
16
+ Rake::TestTask.new(:unit => :compile) do |t|
17
+ t.libs << 'lib' << 'test'
18
+ t.test_files = FileList['test/unit/**/*_test.rb']
19
+ end
20
+ end
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ abort "you need to write me"
@@ -0,0 +1,6 @@
1
+ require 'mkmf'
2
+
3
+ $CXXFLAGS += " -std=c++11 "
4
+ $CXXFLAGS += " -g -Og -ggdb "
5
+
6
+ create_makefile('html_tokenizer_ext')
@@ -0,0 +1,12 @@
1
+ #include <ruby.h>
2
+ #include "tokenizer.h"
3
+ #include "parser.h"
4
+
5
+ static VALUE mHtmlTokenizer = Qnil;
6
+
7
+ void Init_html_tokenizer_ext()
8
+ {
9
+ mHtmlTokenizer = rb_define_module("HtmlTokenizer");
10
+ Init_html_tokenizer_tokenizer(mHtmlTokenizer);
11
+ Init_html_tokenizer_parser(mHtmlTokenizer);
12
+ }
@@ -0,0 +1,7 @@
1
+ #pragma once
2
+
3
/*
 * DBG_PRINT(fmt, args...) - printf-style trace helper.
 *
 * With DEBUG defined it prints "<function>:<line>: <formatted message>\n";
 * otherwise it expands to a no-op expression and its arguments are not
 * evaluated. At least one argument after the format string is required.
 *
 * The expansion deliberately has NO trailing semicolon: the caller supplies
 * it, so `if(x) DBG_PRINT(...); else ...` parses as a single statement.
 */
#ifdef DEBUG
#include <stdio.h>
#define DBG_PRINT(msg, ...) \
  do { printf("%s:%d: " msg "\n", __func__, __LINE__, __VA_ARGS__); } while(0)
#else
#define DBG_PRINT(msg, ...) ((void)0)
#endif
@@ -0,0 +1,767 @@
1
+ #include <ruby.h>
2
+ #include "html_tokenizer.h"
3
+ #include "parser.h"
4
+
5
+ static VALUE cParser = Qnil;
6
+
7
/*
 * GC mark callback for the parser wrapper. It marks nothing.
 * NOTE(review): this is only correct if struct parser_t holds no Ruby
 * VALUE references -- confirm against parser.h.
 */
static void parser_mark(void *ptr)
{
  (void)ptr;
}
9
+
10
+ static void parser_free(void *ptr)
11
+ {
12
+ struct parser_t *parser = ptr;
13
+ size_t i;
14
+
15
+ if(parser) {
16
+ if(parser->doc.data) {
17
+ DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
18
+ xfree(parser->doc.data);
19
+ parser->doc.data = NULL;
20
+ }
21
+ if(parser->errors_count && parser->errors) {
22
+ for(i=0; i<parser->errors_count; i++) {
23
+ if(!parser->errors[i].message)
24
+ continue;
25
+ DBG_PRINT("parser=%p xfree(parser->errors.messages[%u]) %p", parser, i, parser->errors[i].message);
26
+ xfree(parser->errors[i].message);
27
+ parser->errors[i].message = NULL;
28
+ }
29
+ DBG_PRINT("parser=%p xfree(parser->errors.messages) %p", parser, parser->errors);
30
+ xfree(parser->errors);
31
+ parser->errors = NULL;
32
+ parser->errors_count = 0;
33
+ }
34
+ DBG_PRINT("parser=%p xfree(parser)", parser);
35
+ xfree(parser);
36
+ }
37
+ }
38
+
39
+ static size_t parser_memsize(const void *ptr)
40
+ {
41
+ return ptr ? sizeof(struct parser_t) : 0;
42
+ }
43
+
44
/*
 * Typed-data descriptor for HtmlTokenizer::Parser instances: the type name
 * followed by the mark / free / memsize callbacks defined above.
 * When the Ruby headers provide RUBY_TYPED_FREE_IMMEDIATELY, the three
 * trailing positional initialisers are filled in and that flag is set;
 * otherwise the remaining members are left zero-initialised.
 */
const rb_data_type_t ht_parser_data_type = {
  "ht_parser_data_type",
  { parser_mark, parser_free, parser_memsize, },
#if defined(RUBY_TYPED_FREE_IMMEDIATELY)
  NULL, NULL, RUBY_TYPED_FREE_IMMEDIATELY
#endif
};
51
+
52
+ static VALUE parser_allocate(VALUE klass)
53
+ {
54
+ VALUE obj;
55
+ struct parser_t *parser = NULL;
56
+
57
+ obj = TypedData_Make_Struct(klass, struct parser_t, &ht_parser_data_type, parser);
58
+ DBG_PRINT("parser=%p allocate", parser);
59
+
60
+ return obj;
61
+ }
62
+
63
+ static inline void parser_append_ref(struct token_reference_t *dest, struct token_reference_t *src)
64
+ {
65
+ if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
66
+ dest->type = src->type;
67
+ dest->start = src->start;
68
+ dest->length = src->length;
69
+ dest->line_number = src->line_number;
70
+ dest->column_number = src->column_number;
71
+ }
72
+ else {
73
+ dest->type = src->type;
74
+ dest->length += src->length;
75
+ }
76
+ }
77
+
78
+ static void parser_add_error(struct parser_t *parser, const char *message)
79
+ {
80
+ REALLOC_N(parser->errors, struct parser_document_error_t, parser->errors_count + 1);
81
+ parser->errors[parser->errors_count].message = strdup(message);
82
+ parser->errors[parser->errors_count].line_number = parser->doc.line_number;
83
+ parser->errors[parser->errors_count].column_number = parser->doc.column_number;
84
+ parser->errors_count += 1;
85
+ return;
86
+ }
87
+
88
+ static int parse_none(struct parser_t *parser, struct token_reference_t *ref)
89
+ {
90
+ if(ref->type == TOKEN_TAG_START) {
91
+ parser->tag.self_closing = 0;
92
+ parser->context = PARSER_SOLIDUS_OR_TAG_NAME;
93
+ parser->tag.name.type = TOKEN_NONE;
94
+ }
95
+ else if(ref->type == TOKEN_COMMENT_START) {
96
+ parser->context = PARSER_COMMENT;
97
+ parser->comment.text.type = TOKEN_NONE;
98
+ }
99
+ else if(ref->type == TOKEN_CDATA_START) {
100
+ parser->context = PARSER_CDATA;
101
+ parser->cdata.text.type = TOKEN_NONE;
102
+ }
103
+ PARSE_DONE;
104
+ }
105
+
106
+ static int parse_rawtext(struct parser_t *parser, struct token_reference_t *ref)
107
+ {
108
+ if(ref->type == TOKEN_TEXT) {
109
+ parser_append_ref(&parser->rawtext.text, ref);
110
+ }
111
+ else {
112
+ parser->context = PARSER_NONE;
113
+ parse_none(parser, ref);
114
+ }
115
+ PARSE_DONE;
116
+ }
117
+
118
+ static int parse_comment(struct parser_t *parser, struct token_reference_t *ref)
119
+ {
120
+ if(ref->type == TOKEN_COMMENT_END) {
121
+ parser->context = PARSER_NONE;
122
+ }
123
+ else if(ref->type == TOKEN_TEXT) {
124
+ parser_append_ref(&parser->comment.text, ref);
125
+ }
126
+ PARSE_DONE;
127
+ }
128
+
129
+ static int parse_cdata(struct parser_t *parser, struct token_reference_t *ref)
130
+ {
131
+ if(ref->type == TOKEN_CDATA_END) {
132
+ parser->context = PARSER_NONE;
133
+ }
134
+ else if(ref->type == TOKEN_TEXT) {
135
+ parser_append_ref(&parser->cdata.text, ref);
136
+ }
137
+ PARSE_DONE;
138
+ }
139
+
140
+ static int parse_solidus_or_tag_name(struct parser_t *parser, struct token_reference_t *ref)
141
+ {
142
+ if(ref->type == TOKEN_SOLIDUS) {
143
+ // ignore solidus before tag name
144
+ parser->context = PARSER_TAG_NAME;
145
+ }
146
+ else if(ref->type == TOKEN_TAG_NAME) {
147
+ parser->context = PARSER_TAG_NAME;
148
+ PARSE_AGAIN;
149
+ }
150
+ else {
151
+ parser_add_error(parser, "expected '/' or tag name");
152
+ parser->context = PARSER_TAG;
153
+ PARSE_AGAIN;
154
+ }
155
+ PARSE_DONE;
156
+ }
157
+
158
+ static int parse_tag_name(struct parser_t *parser, struct token_reference_t *ref)
159
+ {
160
+ if(ref->type == TOKEN_TAG_NAME) {
161
+ parser_append_ref(&parser->tag.name, ref);
162
+ }
163
+ else if(ref->type == TOKEN_WHITESPACE) {
164
+ parser->context = PARSER_TAG;
165
+ }
166
+ else if(ref->type == TOKEN_TAG_END) {
167
+ parser->context = PARSER_NONE;
168
+ }
169
+ else if(ref->type == TOKEN_SOLIDUS) {
170
+ parser->context = PARSER_TAG;
171
+ PARSE_AGAIN;
172
+ }
173
+ else {
174
+ // not reachable
175
+ rb_raise(rb_eArgError, "expected whitespace, '/' or '>' after tag name");
176
+ }
177
+ PARSE_DONE;
178
+ }
179
+
180
+ static int parse_tag(struct parser_t *parser, struct token_reference_t *ref)
181
+ {
182
+ if(ref->type == TOKEN_TAG_END) {
183
+ parser->context = PARSER_NONE;
184
+ }
185
+ else if(ref->type == TOKEN_WHITESPACE) {
186
+ // ignore whitespaces
187
+ }
188
+ else if(ref->type == TOKEN_SOLIDUS) {
189
+ parser->context = PARSER_TAG_END;
190
+ }
191
+ else if(ref->type == TOKEN_ATTRIBUTE_NAME) {
192
+ parser->context = PARSER_ATTRIBUTE_NAME;
193
+ parser->attribute.name.type = TOKEN_NONE;
194
+ parser->attribute.value.type = TOKEN_NONE;
195
+ parser->attribute.is_quoted = 0;
196
+ PARSE_AGAIN;
197
+ }
198
+ else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_START) {
199
+ parser->context = PARSER_ATTRIBUTE_QUOTED_VALUE;
200
+ parser->attribute.name.type = TOKEN_NONE;
201
+ parser->attribute.value.type = TOKEN_NONE;
202
+ parser->attribute.is_quoted = 1;
203
+ }
204
+ else {
205
+ // unexpected
206
+ parser_add_error(parser, "expected whitespace, '>', attribute name or value");
207
+ }
208
+ PARSE_DONE;
209
+ }
210
+
211
+ static int parse_tag_end(struct parser_t *parser, struct token_reference_t *ref)
212
+ {
213
+ if(ref->type == TOKEN_TAG_END) {
214
+ parser->tag.self_closing = 1;
215
+ parser->context = PARSER_NONE;
216
+ }
217
+ else {
218
+ parser_add_error(parser, "expected '>' after '/'");
219
+ parser->context = PARSER_TAG;
220
+ PARSE_AGAIN;
221
+ }
222
+ PARSE_DONE;
223
+ }
224
+
225
+ static int parse_attribute_name(struct parser_t *parser, struct token_reference_t *ref)
226
+ {
227
+ if(ref->type == TOKEN_ATTRIBUTE_NAME) {
228
+ parser_append_ref(&parser->attribute.name, ref);
229
+ }
230
+ else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
231
+ parser->context = PARSER_TAG;
232
+ PARSE_AGAIN;
233
+ }
234
+ else if(ref->type == TOKEN_WHITESPACE) {
235
+ parser->context = PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL;
236
+ PARSE_AGAIN;
237
+ }
238
+ else if(ref->type == TOKEN_EQUAL) {
239
+ parser->context = PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE;
240
+ }
241
+ else {
242
+ parser_add_error(parser, "expected whitespace, '>' or '=' after attribute name");
243
+ parser->context = PARSER_TAG;
244
+ PARSE_AGAIN;
245
+ }
246
+ PARSE_DONE;
247
+ }
248
+
249
+ static int parse_attribute_whitespace_or_equal(struct parser_t *parser, struct token_reference_t *ref)
250
+ {
251
+ if(ref->type == TOKEN_WHITESPACE) {
252
+ // swallow whitespace after attribute name
253
+ }
254
+ else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
255
+ parser->context = PARSER_TAG;
256
+ PARSE_AGAIN;
257
+ }
258
+ else if(ref->type == TOKEN_EQUAL) {
259
+ parser->context = PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE;
260
+ }
261
+ else if(ref->type == TOKEN_ATTRIBUTE_NAME) {
262
+ // start new attribute after whitespace
263
+ parser->context = PARSER_TAG;
264
+ PARSE_AGAIN;
265
+ }
266
+ else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_START) {
267
+ // start quoted value after whitespace
268
+ parser->context = PARSER_TAG;
269
+ PARSE_AGAIN;
270
+ }
271
+ else {
272
+ parser_add_error(parser, "expected '/', '>', \", ' or '=' after attribute name");
273
+ parser->context = PARSER_TAG;
274
+ PARSE_AGAIN;
275
+ }
276
+
277
+ PARSE_DONE;
278
+ }
279
+
280
+ static int parse_attribute_whitespace_or_value(struct parser_t *parser, struct token_reference_t *ref)
281
+ {
282
+ if(ref->type == TOKEN_WHITESPACE) {
283
+ // swallow whitespace after equal sign
284
+ }
285
+ else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_START) {
286
+ parser->context = PARSER_ATTRIBUTE_QUOTED_VALUE;
287
+ parser->attribute.is_quoted = 1;
288
+ }
289
+ else if(ref->type == TOKEN_ATTRIBUTE_UNQUOTED_VALUE) {
290
+ parser->context = PARSER_ATTRIBUTE_UNQUOTED_VALUE;
291
+ PARSE_AGAIN;
292
+ }
293
+ else {
294
+ parser_add_error(parser, "expected attribute value after '='");
295
+ parser->context = PARSER_TAG;
296
+ PARSE_AGAIN;
297
+ }
298
+
299
+ PARSE_DONE;
300
+ }
301
+
302
+ static int parse_attribute_quoted_value(struct parser_t *parser, struct token_reference_t *ref)
303
+ {
304
+ if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE) {
305
+ parser_append_ref(&parser->attribute.value, ref);
306
+ }
307
+ else if(ref->type == TOKEN_ATTRIBUTE_QUOTED_VALUE_END) {
308
+ parser->context = PARSER_SPACE_AFTER_ATTRIBUTE;
309
+ }
310
+ else {
311
+ // not reachable
312
+ rb_raise(rb_eArgError, "expected end-quote after quoted value");
313
+ }
314
+
315
+ PARSE_DONE;
316
+ }
317
+
318
+ static int parse_space_after_attribute(struct parser_t *parser, struct token_reference_t *ref)
319
+ {
320
+ if(ref->type == TOKEN_WHITESPACE) {
321
+ parser->context = PARSER_TAG;
322
+ }
323
+ else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
324
+ parser->context = PARSER_TAG;
325
+ PARSE_AGAIN;
326
+ }
327
+ else {
328
+ parser_add_error(parser, "expected space after attribute value");
329
+ parser->context = PARSER_TAG;
330
+ PARSE_AGAIN;
331
+ }
332
+
333
+ PARSE_DONE;
334
+ }
335
+
336
+ static int parse_attribute_unquoted_value(struct parser_t *parser, struct token_reference_t *ref)
337
+ {
338
+ if(ref->type == TOKEN_ATTRIBUTE_UNQUOTED_VALUE) {
339
+ parser_append_ref(&parser->attribute.value, ref);
340
+ }
341
+ else if(ref->type == TOKEN_WHITESPACE) {
342
+ parser->context = PARSER_TAG;
343
+ }
344
+ else if(ref->type == TOKEN_TAG_END || ref->type == TOKEN_SOLIDUS) {
345
+ parser->context = PARSER_TAG;
346
+ PARSE_AGAIN;
347
+ }
348
+ else {
349
+ // not reachable
350
+ rb_raise(rb_eArgError, "expected space or end-of-tag after unquoted value");
351
+ }
352
+
353
+ PARSE_DONE;
354
+ }
355
+
356
+ static inline int rawtext_context(struct parser_t *parser)
357
+ {
358
+ enum tokenizer_context ctx = parser->tk.context[parser->tk.current_context];
359
+ return (ctx == TOKENIZER_RCDATA || ctx == TOKENIZER_RAWTEXT ||
360
+ ctx == TOKENIZER_SCRIPT_DATA || ctx == TOKENIZER_PLAINTEXT);
361
+ }
362
+
363
+ static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
364
+ {
365
+ long unsigned int i;
366
+
367
+ for(i = start;i < (start + length); i++) {
368
+ if(parser->doc.data[i] == '\n') {
369
+ parser->doc.column_number = 0;
370
+ parser->doc.line_number += 1;
371
+ }
372
+ else {
373
+ parser->doc.column_number += 1;
374
+ }
375
+ }
376
+
377
+ return;
378
+ }
379
+
380
+ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type type, unsigned long int length, void *data)
381
+ {
382
+ struct parser_t *parser = (struct parser_t *)data;
383
+ struct token_reference_t ref = {
384
+ .type = type,
385
+ .start = tk->scan.cursor,
386
+ .length = length,
387
+ .line_number = parser->doc.line_number,
388
+ .column_number = parser->doc.column_number,
389
+ };
390
+ int parse_again = 1;
391
+
392
+ while(parse_again) {
393
+ switch(parser->context)
394
+ {
395
+ case PARSER_NONE:
396
+ if(rawtext_context(parser))
397
+ parse_again = parse_rawtext(parser, &ref);
398
+ else
399
+ parse_again = parse_none(parser, &ref);
400
+ break;
401
+ case PARSER_SOLIDUS_OR_TAG_NAME:
402
+ parse_again = parse_solidus_or_tag_name(parser, &ref);
403
+ break;
404
+ case PARSER_TAG_NAME:
405
+ parse_again = parse_tag_name(parser, &ref);
406
+ break;
407
+ case PARSER_TAG:
408
+ parse_again = parse_tag(parser, &ref);
409
+ break;
410
+ case PARSER_ATTRIBUTE_NAME:
411
+ parse_again = parse_attribute_name(parser, &ref);
412
+ break;
413
+ case PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL:
414
+ parse_again = parse_attribute_whitespace_or_equal(parser, &ref);
415
+ break;
416
+ case PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE:
417
+ parse_again = parse_attribute_whitespace_or_value(parser, &ref);
418
+ break;
419
+ case PARSER_ATTRIBUTE_QUOTED_VALUE:
420
+ parse_again = parse_attribute_quoted_value(parser, &ref);
421
+ break;
422
+ case PARSER_SPACE_AFTER_ATTRIBUTE:
423
+ parse_again = parse_space_after_attribute(parser, &ref);
424
+ break;
425
+ case PARSER_ATTRIBUTE_UNQUOTED_VALUE:
426
+ parse_again = parse_attribute_unquoted_value(parser, &ref);
427
+ break;
428
+ case PARSER_TAG_END:
429
+ parse_again = parse_tag_end(parser, &ref);
430
+ break;
431
+ case PARSER_CDATA:
432
+ parse_again = parse_cdata(parser, &ref);
433
+ break;
434
+ case PARSER_COMMENT:
435
+ parse_again = parse_comment(parser, &ref);
436
+ break;
437
+ }
438
+ }
439
+
440
+ if(rb_block_given_p()) {
441
+ rb_yield_values(5, token_type_to_symbol(type),
442
+ INT2NUM(ref.start), INT2NUM(ref.start + ref.length),
443
+ INT2NUM(ref.line_number), INT2NUM(ref.column_number));
444
+ }
445
+
446
+ parser_adjust_line_number(parser, ref.start, ref.length);
447
+
448
+ return;
449
+ }
450
+
451
+ static VALUE parser_initialize_method(VALUE self)
452
+ {
453
+ struct parser_t *parser = NULL;
454
+
455
+ Parser_Get_Struct(self, parser);
456
+ DBG_PRINT("parser=%p initialize", parser);
457
+
458
+ memset(parser, 0, sizeof(struct parser_t));
459
+
460
+ parser->context = PARSER_NONE;
461
+
462
+ tokenizer_init(&parser->tk);
463
+ parser->tk.callback_data = parser;
464
+ parser->tk.f_callback = parser_tokenize_callback;
465
+
466
+ parser->doc.length = 0;
467
+ parser->doc.data = NULL;
468
+
469
+ parser->doc.line_number = 1;
470
+ parser->doc.column_number = 0;
471
+
472
+ parser->errors_count = 0;
473
+ parser->errors = NULL;
474
+
475
+ return Qnil;
476
+ }
477
+
478
+ static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
479
+ {
480
+ void *old = parser->doc.data;
481
+ REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
482
+ DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
483
+ parser->doc.data, parser->doc.length + length + 1);
484
+ strcpy(parser->doc.data+parser->doc.length, string);
485
+ parser->doc.length += length;
486
+ return 1;
487
+ }
488
+
489
+ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
490
+ {
491
+ struct parser_t *parser = NULL;
492
+ char *string = NULL;
493
+ long unsigned int length = 0, cursor = 0;
494
+
495
+ if(NIL_P(source))
496
+ return Qnil;
497
+
498
+ Check_Type(source, T_STRING);
499
+ Parser_Get_Struct(self, parser);
500
+
501
+ string = StringValueCStr(source);
502
+ length = strlen(string);
503
+
504
+ cursor = parser->doc.length;
505
+
506
+ if(!parser_document_append(parser, string, length)) {
507
+ // error
508
+ return Qnil;
509
+ }
510
+
511
+ if(is_placeholder) {
512
+ parser_adjust_line_number(parser, cursor, length);
513
+ }
514
+ else {
515
+ parser->tk.scan.cursor = cursor;
516
+ parser->tk.scan.string = parser->doc.data;
517
+ parser->tk.scan.length = parser->doc.length;
518
+
519
+ tokenizer_scan_all(&parser->tk);
520
+ }
521
+
522
+ return Qtrue;
523
+ }
524
+
525
+ static VALUE parser_parse_method(VALUE self, VALUE source)
526
+ {
527
+ return parser_append_data(self, source, 0);
528
+ }
529
+
530
+ static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
531
+ {
532
+ return parser_append_data(self, source, 1);
533
+ }
534
+
535
+ static VALUE parser_document_method(VALUE self)
536
+ {
537
+ struct parser_t *parser = NULL;
538
+ Parser_Get_Struct(self, parser);
539
+ if(!parser->doc.data)
540
+ return Qnil;
541
+ return rb_str_new(parser->doc.data, parser->doc.length);
542
+ }
543
+
544
+ static VALUE parser_document_length_method(VALUE self)
545
+ {
546
+ struct parser_t *parser = NULL;
547
+ Parser_Get_Struct(self, parser);
548
+ return ULONG2NUM(parser->doc.length);
549
+ }
550
+
551
+ static VALUE parser_context_method(VALUE self)
552
+ {
553
+ struct parser_t *parser = NULL;
554
+
555
+ Parser_Get_Struct(self, parser);
556
+
557
+ switch(parser->context) {
558
+ case PARSER_NONE:
559
+ return rawtext_context(parser) ? ID2SYM(rb_intern("rawtext")) : ID2SYM(rb_intern("none"));
560
+ case PARSER_SOLIDUS_OR_TAG_NAME:
561
+ return ID2SYM(rb_intern("solidus_or_tag_name"));
562
+ case PARSER_TAG_NAME:
563
+ return ID2SYM(rb_intern("tag_name"));
564
+ case PARSER_TAG:
565
+ return ID2SYM(rb_intern("tag"));
566
+ case PARSER_ATTRIBUTE_NAME:
567
+ return ID2SYM(rb_intern("attribute_name"));
568
+ case PARSER_ATTRIBUTE_WHITESPACE_OR_EQUAL:
569
+ return ID2SYM(rb_intern("after_attribute_name"));
570
+ case PARSER_ATTRIBUTE_WHITESPACE_OR_VALUE:
571
+ return ID2SYM(rb_intern("after_equal"));
572
+ case PARSER_ATTRIBUTE_QUOTED_VALUE:
573
+ return ID2SYM(rb_intern("quoted_value"));
574
+ case PARSER_SPACE_AFTER_ATTRIBUTE:
575
+ return ID2SYM(rb_intern("space_after_attribute"));
576
+ case PARSER_ATTRIBUTE_UNQUOTED_VALUE:
577
+ return ID2SYM(rb_intern("unquoted_value"));
578
+ case PARSER_TAG_END:
579
+ return ID2SYM(rb_intern("tag_end"));
580
+ case PARSER_COMMENT:
581
+ return ID2SYM(rb_intern("comment"));
582
+ case PARSER_CDATA:
583
+ return ID2SYM(rb_intern("cdata"));
584
+ }
585
+
586
+ return Qnil;
587
+ }
588
+
589
+ static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
590
+ {
591
+ if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
592
+ return Qnil;
593
+ return rb_str_new(parser->doc.data+ref->start, ref->length);
594
+ }
595
+
596
+ static VALUE parser_tag_name_method(VALUE self)
597
+ {
598
+ struct parser_t *parser = NULL;
599
+ Parser_Get_Struct(self, parser);
600
+ return ref_to_str(parser, &parser->tag.name);
601
+ }
602
+
603
+ static VALUE parser_closing_tag_method(VALUE self)
604
+ {
605
+ struct parser_t *parser = NULL;
606
+ Parser_Get_Struct(self, parser);
607
+ return parser->tk.is_closing_tag ? Qtrue : Qfalse;
608
+ }
609
+
610
+ static VALUE parser_self_closing_tag_method(VALUE self)
611
+ {
612
+ struct parser_t *parser = NULL;
613
+ Parser_Get_Struct(self, parser);
614
+ return parser->tag.self_closing ? Qtrue : Qfalse;
615
+ }
616
+
617
+ static VALUE parser_attribute_name_method(VALUE self)
618
+ {
619
+ struct parser_t *parser = NULL;
620
+ Parser_Get_Struct(self, parser);
621
+ return ref_to_str(parser, &parser->attribute.name);
622
+ }
623
+
624
+ static VALUE parser_attribute_value_method(VALUE self)
625
+ {
626
+ struct parser_t *parser = NULL;
627
+ Parser_Get_Struct(self, parser);
628
+ return ref_to_str(parser, &parser->attribute.value);
629
+ }
630
+
631
+ static VALUE parser_quote_character_method(VALUE self)
632
+ {
633
+ struct parser_t *parser = NULL;
634
+ Parser_Get_Struct(self, parser);
635
+ return parser->attribute.is_quoted ?
636
+ rb_str_new(&parser->tk.attribute_value_start, 1) :
637
+ Qnil;
638
+ }
639
+
640
+ static VALUE parser_attribute_is_quoted_method(VALUE self)
641
+ {
642
+ struct parser_t *parser = NULL;
643
+ Parser_Get_Struct(self, parser);
644
+ return parser->attribute.is_quoted ? Qtrue : Qfalse;
645
+ }
646
+
647
+ static VALUE parser_comment_text_method(VALUE self)
648
+ {
649
+ struct parser_t *parser = NULL;
650
+ Parser_Get_Struct(self, parser);
651
+ return ref_to_str(parser, &parser->comment.text);
652
+ }
653
+
654
+ static VALUE parser_cdata_text_method(VALUE self)
655
+ {
656
+ struct parser_t *parser = NULL;
657
+ Parser_Get_Struct(self, parser);
658
+ return ref_to_str(parser, &parser->cdata.text);
659
+ }
660
+
661
+ static VALUE parser_rawtext_text_method(VALUE self)
662
+ {
663
+ struct parser_t *parser = NULL;
664
+ Parser_Get_Struct(self, parser);
665
+ return ref_to_str(parser, &parser->rawtext.text);
666
+ }
667
+
668
+ static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
669
+ {
670
+ struct parser_t *parser = NULL;
671
+ unsigned long int start, end;
672
+ struct token_reference_t ref;
673
+
674
+ Parser_Get_Struct(self, parser);
675
+
676
+ start = NUM2ULONG(start_p);
677
+ end = NUM2ULONG(end_p);
678
+ if(end < start) {
679
+ rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
680
+ }
681
+ if(end > parser->doc.length) {
682
+ rb_raise(rb_eArgError, "'end' argument not in range of document");
683
+ }
684
+
685
+ ref.type = TOKEN_TEXT; // anything not NONE
686
+ ref.start = start;
687
+ ref.length = end - start;
688
+ return ref_to_str(parser, &ref);
689
+ }
690
+
691
+ static VALUE parser_errors_count_method(VALUE self)
692
+ {
693
+ struct parser_t *parser = NULL;
694
+ Parser_Get_Struct(self, parser);
695
+ return INT2NUM(parser->errors_count);
696
+ }
697
+
698
+ static VALUE create_parser_error(struct parser_document_error_t *error)
699
+ {
700
+ VALUE module = rb_const_get(rb_cObject, rb_intern("HtmlTokenizer"));
701
+ VALUE klass = rb_const_get(module, rb_intern("ParserError"));
702
+ VALUE args[3] = {
703
+ rb_str_new2(error->message),
704
+ ULONG2NUM(error->line_number),
705
+ ULONG2NUM(error->column_number),
706
+ };
707
+ return rb_class_new_instance(3, args, klass);
708
+ }
709
+
710
+ static VALUE parser_errors_method(VALUE self, VALUE error_p)
711
+ {
712
+ struct parser_t *parser = NULL;
713
+ VALUE list;
714
+ size_t i;
715
+ Parser_Get_Struct(self, parser);
716
+
717
+ list = rb_ary_new();
718
+ for(i=0; i<parser->errors_count; i++) {
719
+ if(parser->errors[i].message) {
720
+ rb_ary_push(list, create_parser_error(&parser->errors[i]));
721
+ }
722
+ }
723
+
724
+ return list;
725
+ }
726
+
727
+ static VALUE parser_line_number_method(VALUE self)
728
+ {
729
+ struct parser_t *parser = NULL;
730
+ Parser_Get_Struct(self, parser);
731
+ return ULONG2NUM(parser->doc.line_number);
732
+ }
733
+
734
+ static VALUE parser_column_number_method(VALUE self)
735
+ {
736
+ struct parser_t *parser = NULL;
737
+ Parser_Get_Struct(self, parser);
738
+ return ULONG2NUM(parser->doc.column_number);
739
+ }
740
+
741
/*
 * Defines the Parser class under the given HtmlTokenizer module, installs
 * its allocator and registers every instance method. The trailing integer
 * of each rb_define_method call is the number of Ruby arguments the method
 * accepts (the receiver is not counted).
 */
void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
{
  cParser = rb_define_class_under(mHtmlTokenizer, "Parser", rb_cObject);
  rb_define_alloc_func(cParser, parser_allocate);
  /* lifecycle and document access */
  rb_define_method(cParser, "initialize", parser_initialize_method, 0);
  rb_define_method(cParser, "document", parser_document_method, 0);
  rb_define_method(cParser, "document_length", parser_document_length_method, 0);
  rb_define_method(cParser, "line_number", parser_line_number_method, 0);
  rb_define_method(cParser, "column_number", parser_column_number_method, 0);
  rb_define_method(cParser, "parse", parser_parse_method, 1);
  rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
  rb_define_method(cParser, "extract", parser_extract_method, 2);
  /* parser state inspection */
  rb_define_method(cParser, "context", parser_context_method, 0);
  rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
  rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
  rb_define_method(cParser, "self_closing_tag?", parser_self_closing_tag_method, 0);
  rb_define_method(cParser, "attribute_name", parser_attribute_name_method, 0);
  rb_define_method(cParser, "attribute_value", parser_attribute_value_method, 0);
  rb_define_method(cParser, "quote_character", parser_quote_character_method, 0);
  rb_define_method(cParser, "attribute_quoted?", parser_attribute_is_quoted_method, 0);
  rb_define_method(cParser, "comment_text", parser_comment_text_method, 0);
  rb_define_method(cParser, "cdata_text", parser_cdata_text_method, 0);
  rb_define_method(cParser, "rawtext_text", parser_rawtext_text_method, 0);

  /* error reporting */
  rb_define_method(cParser, "errors_count", parser_errors_count_method, 0);
  rb_define_method(cParser, "errors", parser_errors_method, 0);
}