html_tokenizer 0.0.1 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/Gemfile.lock +3 -3
- data/ext/html_tokenizer_ext/extconf.rb +7 -1
- data/ext/html_tokenizer_ext/parser.c +64 -38
- data/ext/html_tokenizer_ext/parser.h +6 -0
- data/ext/html_tokenizer_ext/tokenizer.c +55 -22
- data/ext/html_tokenizer_ext/tokenizer.h +6 -0
- data/html_tokenizer.gemspec +1 -1
- data/lib/html_tokenizer.rb +3 -2
- data/test/unit/parser_test.rb +48 -29
- data/test/unit/tokenizer_test.rb +22 -1
- metadata +13 -13
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 7f63f5699a8e9dc129392fa0d554196d9c2322c43f19cd21b353194b90d1c0f1
|
4
|
+
data.tar.gz: db308d4bb26d5181da91f9353e29d12d7aa822f02d1f0214959663516891781b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7f70e313d9206393e094b38569584f2f5e95bf3cb8abbd840fd063652da7d66b4c4c3a29e55b900f8da4f3c672d312dedb3895cebc5727292287e60b41b3049d
|
7
|
+
data.tar.gz: 9999d3a9e5c51ac426cb33551f14dcba980b7ee45543c000422ea7c80fe6884b34297790f977f53b95d6d128a35d49bcfa052b8b005f52d33d1f1a675d599ec8
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
html_tokenizer (0.0.
|
4
|
+
html_tokenizer (0.0.7)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
minitest (5.9.0)
|
10
|
-
rake (
|
10
|
+
rake (12.3.0)
|
11
11
|
rake-compiler (0.9.9)
|
12
12
|
rake
|
13
13
|
|
@@ -21,4 +21,4 @@ DEPENDENCIES
|
|
21
21
|
rake-compiler
|
22
22
|
|
23
23
|
BUNDLED WITH
|
24
|
-
1.
|
24
|
+
1.16.1
|
@@ -1,6 +1,12 @@
|
|
1
1
|
require 'mkmf'
|
2
2
|
|
3
3
|
$CXXFLAGS += " -std=c++11 "
|
4
|
-
$CXXFLAGS += " -g -
|
4
|
+
$CXXFLAGS += " -g -O1 -ggdb "
|
5
|
+
$CFLAGS += " -g -O1 -ggdb "
|
6
|
+
|
7
|
+
if ENV['DEBUG']
|
8
|
+
$CXXFLAGS += " -DDEBUG "
|
9
|
+
$CFLAGS += " -DDEBUG "
|
10
|
+
end
|
5
11
|
|
6
12
|
create_makefile('html_tokenizer_ext')
|
@@ -1,4 +1,5 @@
|
|
1
1
|
#include <ruby.h>
|
2
|
+
#include <ruby/encoding.h>
|
2
3
|
#include "html_tokenizer.h"
|
3
4
|
#include "parser.h"
|
4
5
|
|
@@ -13,6 +14,7 @@ static void parser_free(void *ptr)
|
|
13
14
|
size_t i;
|
14
15
|
|
15
16
|
if(parser) {
|
17
|
+
tokenizer_free_members(&parser->tk);
|
16
18
|
if(parser->doc.data) {
|
17
19
|
DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
|
18
20
|
xfree(parser->doc.data);
|
@@ -65,6 +67,7 @@ static inline void parser_append_ref(struct token_reference_t *dest, struct toke
|
|
65
67
|
if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
|
66
68
|
dest->type = src->type;
|
67
69
|
dest->start = src->start;
|
70
|
+
dest->mb_start = src->mb_start;
|
68
71
|
dest->length = src->length;
|
69
72
|
dest->line_number = src->line_number;
|
70
73
|
dest->column_number = src->column_number;
|
@@ -79,6 +82,8 @@ static void parser_add_error(struct parser_t *parser, const char *message)
|
|
79
82
|
{
|
80
83
|
REALLOC_N(parser->errors, struct parser_document_error_t, parser->errors_count + 1);
|
81
84
|
parser->errors[parser->errors_count].message = strdup(message);
|
85
|
+
parser->errors[parser->errors_count].pos = parser->tk.scan.cursor;
|
86
|
+
parser->errors[parser->errors_count].mb_pos = parser->tk.scan.mb_cursor;
|
82
87
|
parser->errors[parser->errors_count].line_number = parser->doc.line_number;
|
83
88
|
parser->errors[parser->errors_count].column_number = parser->doc.column_number;
|
84
89
|
parser->errors_count += 1;
|
@@ -362,15 +367,21 @@ static inline int rawtext_context(struct parser_t *parser)
|
|
362
367
|
|
363
368
|
static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
|
364
369
|
{
|
370
|
+
rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
|
365
371
|
long unsigned int i;
|
372
|
+
const char *buf, *nextlf;
|
366
373
|
|
367
|
-
for(i =
|
368
|
-
|
374
|
+
for(i = 0; i < length;) {
|
375
|
+
buf = &parser->doc.data[start + i];
|
376
|
+
nextlf = memchr(buf, '\n', length - i);
|
377
|
+
if(nextlf) {
|
369
378
|
parser->doc.column_number = 0;
|
370
379
|
parser->doc.line_number += 1;
|
380
|
+
i += (nextlf - buf) + 1;
|
371
381
|
}
|
372
382
|
else {
|
373
|
-
parser->doc.column_number +=
|
383
|
+
parser->doc.column_number += rb_enc_strlen(buf, buf + length - i, enc);
|
384
|
+
break;
|
374
385
|
}
|
375
386
|
}
|
376
387
|
|
@@ -383,11 +394,14 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
|
|
383
394
|
struct token_reference_t ref = {
|
384
395
|
.type = type,
|
385
396
|
.start = tk->scan.cursor,
|
397
|
+
.mb_start = tk->scan.mb_cursor,
|
386
398
|
.length = length,
|
387
399
|
.line_number = parser->doc.line_number,
|
388
400
|
.column_number = parser->doc.column_number,
|
389
401
|
};
|
390
402
|
int parse_again = 1;
|
403
|
+
long unsigned int mb_strlen;
|
404
|
+
rb_encoding *enc;
|
391
405
|
|
392
406
|
while(parse_again) {
|
393
407
|
switch(parser->context)
|
@@ -438,8 +452,10 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
|
|
438
452
|
}
|
439
453
|
|
440
454
|
if(rb_block_given_p()) {
|
455
|
+
enc = rb_enc_from_index(parser->doc.enc_index);
|
456
|
+
mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
|
441
457
|
rb_yield_values(5, token_type_to_symbol(type),
|
442
|
-
INT2NUM(ref.
|
458
|
+
INT2NUM(ref.mb_start), INT2NUM(ref.mb_start + mb_strlen),
|
443
459
|
INT2NUM(ref.line_number), INT2NUM(ref.column_number));
|
444
460
|
}
|
445
461
|
|
@@ -465,6 +481,8 @@ static VALUE parser_initialize_method(VALUE self)
|
|
465
481
|
|
466
482
|
parser->doc.length = 0;
|
467
483
|
parser->doc.data = NULL;
|
484
|
+
parser->doc.enc_index = 0;
|
485
|
+
parser->doc.mb_length = 0;
|
468
486
|
|
469
487
|
parser->doc.line_number = 1;
|
470
488
|
parser->doc.column_number = 0;
|
@@ -478,11 +496,17 @@ static VALUE parser_initialize_method(VALUE self)
|
|
478
496
|
static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
|
479
497
|
{
|
480
498
|
void *old = parser->doc.data;
|
499
|
+
unsigned long int mb_length;
|
500
|
+
char *buf;
|
501
|
+
rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
|
481
502
|
REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
|
482
503
|
DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
|
483
|
-
parser->doc.data,
|
484
|
-
|
504
|
+
parser->doc.data, parser->doc.length + length + 1);
|
505
|
+
buf = parser->doc.data + parser->doc.length;
|
506
|
+
strcpy(buf, string);
|
507
|
+
mb_length = rb_enc_strlen(buf, buf + length, enc);
|
485
508
|
parser->doc.length += length;
|
509
|
+
parser->doc.mb_length += mb_length;
|
486
510
|
return 1;
|
487
511
|
}
|
488
512
|
|
@@ -490,7 +514,7 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
|
|
490
514
|
{
|
491
515
|
struct parser_t *parser = NULL;
|
492
516
|
char *string = NULL;
|
493
|
-
long unsigned int length = 0, cursor = 0;
|
517
|
+
long unsigned int length = 0, cursor = 0, mb_cursor = 0;
|
494
518
|
|
495
519
|
if(NIL_P(source))
|
496
520
|
return Qnil;
|
@@ -502,6 +526,15 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
|
|
502
526
|
length = strlen(string);
|
503
527
|
|
504
528
|
cursor = parser->doc.length;
|
529
|
+
mb_cursor = parser->doc.mb_length;
|
530
|
+
|
531
|
+
if(parser->doc.data == NULL) {
|
532
|
+
parser->doc.enc_index = rb_enc_get_index(source);
|
533
|
+
}
|
534
|
+
else if(parser->doc.enc_index != rb_enc_get_index(source)) {
|
535
|
+
rb_raise(rb_eArgError, "cannot append %s string to %s document",
|
536
|
+
rb_enc_name(rb_enc_get(source)), rb_enc_name(rb_enc_from_index(parser->doc.enc_index)));
|
537
|
+
}
|
505
538
|
|
506
539
|
if(!parser_document_append(parser, string, length)) {
|
507
540
|
// error
|
@@ -513,10 +546,12 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
|
|
513
546
|
}
|
514
547
|
else {
|
515
548
|
parser->tk.scan.cursor = cursor;
|
516
|
-
parser->tk.
|
517
|
-
parser->tk.scan.
|
549
|
+
tokenizer_set_scan_string(&parser->tk, parser->doc.data, parser->doc.length);
|
550
|
+
parser->tk.scan.enc_index = parser->doc.enc_index;
|
551
|
+
parser->tk.scan.mb_cursor = mb_cursor;
|
518
552
|
|
519
553
|
tokenizer_scan_all(&parser->tk);
|
554
|
+
tokenizer_free_scan_string(&parser->tk);
|
520
555
|
}
|
521
556
|
|
522
557
|
return Qtrue;
|
@@ -535,17 +570,30 @@ static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
|
|
535
570
|
static VALUE parser_document_method(VALUE self)
|
536
571
|
{
|
537
572
|
struct parser_t *parser = NULL;
|
573
|
+
rb_encoding *enc;
|
538
574
|
Parser_Get_Struct(self, parser);
|
539
575
|
if(!parser->doc.data)
|
540
576
|
return Qnil;
|
541
|
-
|
577
|
+
enc = rb_enc_from_index(parser->doc.enc_index);
|
578
|
+
return rb_enc_str_new(parser->doc.data, parser->doc.length, enc);
|
542
579
|
}
|
543
580
|
|
544
581
|
static VALUE parser_document_length_method(VALUE self)
|
545
582
|
{
|
546
583
|
struct parser_t *parser = NULL;
|
584
|
+
rb_encoding *enc;
|
585
|
+
const char *buf;
|
586
|
+
|
547
587
|
Parser_Get_Struct(self, parser);
|
548
|
-
|
588
|
+
|
589
|
+
if(parser->doc.data == NULL) {
|
590
|
+
return ULONG2NUM(0);
|
591
|
+
}
|
592
|
+
else {
|
593
|
+
buf = parser->doc.data;
|
594
|
+
enc = rb_enc_from_index(parser->doc.enc_index);
|
595
|
+
return ULONG2NUM(rb_enc_strlen(buf, buf + parser->doc.length, enc));
|
596
|
+
}
|
549
597
|
}
|
550
598
|
|
551
599
|
static VALUE parser_context_method(VALUE self)
|
@@ -588,9 +636,10 @@ static VALUE parser_context_method(VALUE self)
|
|
588
636
|
|
589
637
|
static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
|
590
638
|
{
|
639
|
+
rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
|
591
640
|
if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
|
592
641
|
return Qnil;
|
593
|
-
return
|
642
|
+
return rb_enc_str_new(parser->doc.data+ref->start, ref->length, enc);
|
594
643
|
}
|
595
644
|
|
596
645
|
static VALUE parser_tag_name_method(VALUE self)
|
@@ -665,29 +714,6 @@ static VALUE parser_rawtext_text_method(VALUE self)
|
|
665
714
|
return ref_to_str(parser, &parser->rawtext.text);
|
666
715
|
}
|
667
716
|
|
668
|
-
static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
|
669
|
-
{
|
670
|
-
struct parser_t *parser = NULL;
|
671
|
-
unsigned long int start, end;
|
672
|
-
struct token_reference_t ref;
|
673
|
-
|
674
|
-
Parser_Get_Struct(self, parser);
|
675
|
-
|
676
|
-
start = NUM2ULONG(start_p);
|
677
|
-
end = NUM2ULONG(end_p);
|
678
|
-
if(end < start) {
|
679
|
-
rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
|
680
|
-
}
|
681
|
-
if(end > parser->doc.length) {
|
682
|
-
rb_raise(rb_eArgError, "'end' argument not in range of document");
|
683
|
-
}
|
684
|
-
|
685
|
-
ref.type = TOKEN_TEXT; // anything not NONE
|
686
|
-
ref.start = start;
|
687
|
-
ref.length = end - start;
|
688
|
-
return ref_to_str(parser, &ref);
|
689
|
-
}
|
690
|
-
|
691
717
|
static VALUE parser_errors_count_method(VALUE self)
|
692
718
|
{
|
693
719
|
struct parser_t *parser = NULL;
|
@@ -699,12 +725,13 @@ static VALUE create_parser_error(struct parser_document_error_t *error)
|
|
699
725
|
{
|
700
726
|
VALUE module = rb_const_get(rb_cObject, rb_intern("HtmlTokenizer"));
|
701
727
|
VALUE klass = rb_const_get(module, rb_intern("ParserError"));
|
702
|
-
VALUE args[
|
728
|
+
VALUE args[4] = {
|
703
729
|
rb_str_new2(error->message),
|
730
|
+
ULONG2NUM(error->mb_pos),
|
704
731
|
ULONG2NUM(error->line_number),
|
705
732
|
ULONG2NUM(error->column_number),
|
706
733
|
};
|
707
|
-
return rb_class_new_instance(
|
734
|
+
return rb_class_new_instance(4, args, klass);
|
708
735
|
}
|
709
736
|
|
710
737
|
static VALUE parser_errors_method(VALUE self, VALUE error_p)
|
@@ -749,7 +776,6 @@ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
|
|
749
776
|
rb_define_method(cParser, "column_number", parser_column_number_method, 0);
|
750
777
|
rb_define_method(cParser, "parse", parser_parse_method, 1);
|
751
778
|
rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
|
752
|
-
rb_define_method(cParser, "extract", parser_extract_method, 2);
|
753
779
|
rb_define_method(cParser, "context", parser_context_method, 0);
|
754
780
|
rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
|
755
781
|
rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
|
@@ -19,6 +19,8 @@ enum parser_context {
|
|
19
19
|
|
20
20
|
struct parser_document_error_t {
|
21
21
|
char *message;
|
22
|
+
long unsigned int pos;
|
23
|
+
long unsigned int mb_pos;
|
22
24
|
long unsigned int line_number;
|
23
25
|
long unsigned int column_number;
|
24
26
|
};
|
@@ -28,11 +30,15 @@ struct parser_document_t {
|
|
28
30
|
char *data;
|
29
31
|
long unsigned int line_number;
|
30
32
|
long unsigned int column_number;
|
33
|
+
|
34
|
+
int enc_index;
|
35
|
+
long unsigned int mb_length;
|
31
36
|
};
|
32
37
|
|
33
38
|
struct token_reference_t {
|
34
39
|
enum token_type type;
|
35
40
|
long unsigned int start;
|
41
|
+
long unsigned int mb_start;
|
36
42
|
long unsigned int length;
|
37
43
|
long unsigned int line_number;
|
38
44
|
long unsigned int column_number;
|
@@ -1,4 +1,5 @@
|
|
1
1
|
#include <ruby.h>
|
2
|
+
#include <ruby/encoding.h>
|
2
3
|
#include "html_tokenizer.h"
|
3
4
|
#include "tokenizer.h"
|
4
5
|
|
@@ -11,16 +12,7 @@ static void tokenizer_free(void *ptr)
|
|
11
12
|
{
|
12
13
|
struct tokenizer_t *tk = ptr;
|
13
14
|
if(tk) {
|
14
|
-
|
15
|
-
DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
|
16
|
-
xfree(tk->current_tag);
|
17
|
-
tk->current_tag = NULL;
|
18
|
-
}
|
19
|
-
if(tk->scan.string) {
|
20
|
-
DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
|
21
|
-
xfree(tk->scan.string);
|
22
|
-
tk->scan.string = NULL;
|
23
|
-
}
|
15
|
+
tokenizer_free_members(tk);
|
24
16
|
DBG_PRINT("tk=%p xfree(tk)", tk);
|
25
17
|
xfree(tk);
|
26
18
|
}
|
@@ -60,6 +52,8 @@ void tokenizer_init(struct tokenizer_t *tk)
|
|
60
52
|
tk->scan.string = NULL;
|
61
53
|
tk->scan.cursor = 0;
|
62
54
|
tk->scan.length = 0;
|
55
|
+
tk->scan.mb_cursor = 0;
|
56
|
+
tk->scan.enc_index = 0;
|
63
57
|
|
64
58
|
tk->attribute_value_start = 0;
|
65
59
|
tk->found_attribute = 0;
|
@@ -72,6 +66,21 @@ void tokenizer_init(struct tokenizer_t *tk)
|
|
72
66
|
return;
|
73
67
|
}
|
74
68
|
|
69
|
+
void tokenizer_free_members(struct tokenizer_t *tk)
|
70
|
+
{
|
71
|
+
if(tk->current_tag) {
|
72
|
+
DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
|
73
|
+
xfree(tk->current_tag);
|
74
|
+
tk->current_tag = NULL;
|
75
|
+
}
|
76
|
+
if(tk->scan.string) {
|
77
|
+
DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
|
78
|
+
xfree(tk->scan.string);
|
79
|
+
tk->scan.string = NULL;
|
80
|
+
}
|
81
|
+
return;
|
82
|
+
}
|
83
|
+
|
75
84
|
VALUE token_type_to_symbol(enum token_type type)
|
76
85
|
{
|
77
86
|
switch(type) {
|
@@ -115,17 +124,27 @@ VALUE token_type_to_symbol(enum token_type type)
|
|
115
124
|
return Qnil;
|
116
125
|
}
|
117
126
|
|
127
|
+
static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
|
128
|
+
{
|
129
|
+
rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
|
130
|
+
const char *buf = tk->scan.string + tk->scan.cursor;
|
131
|
+
return rb_enc_strlen(buf, buf + length, enc);
|
132
|
+
}
|
133
|
+
|
118
134
|
static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
|
119
135
|
{
|
136
|
+
long unsigned int mb_length = tokenizer_mblength(tk, length);
|
120
137
|
tk->last_token = type;
|
121
|
-
rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.
|
138
|
+
rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
|
122
139
|
}
|
123
140
|
|
124
141
|
static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
|
125
142
|
{
|
143
|
+
long unsigned int mb_length = tokenizer_mblength(tk, length);
|
126
144
|
if(tk->f_callback)
|
127
145
|
tk->f_callback(tk, type, length, tk->callback_data);
|
128
146
|
tk->scan.cursor += length;
|
147
|
+
tk->scan.mb_cursor += mb_length;
|
129
148
|
}
|
130
149
|
|
131
150
|
static VALUE tokenizer_initialize_method(VALUE self)
|
@@ -356,6 +375,7 @@ static int scan_open_tag(struct tokenizer_t *tk)
|
|
356
375
|
else if(is_doctype(&tk->scan)) {
|
357
376
|
tokenizer_callback(tk, TOKEN_TAG_START, 1);
|
358
377
|
tokenizer_callback(tk, TOKEN_TAG_NAME, 8);
|
378
|
+
push_context(tk, TOKENIZER_TAG_NAME);
|
359
379
|
return 1;
|
360
380
|
}
|
361
381
|
else if(is_cdata_start(&tk->scan)) {
|
@@ -642,11 +662,30 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
|
|
642
662
|
return;
|
643
663
|
}
|
644
664
|
|
665
|
+
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
|
666
|
+
{
|
667
|
+
const char *old = tk->scan.string;
|
668
|
+
REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
|
669
|
+
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
670
|
+
tk->scan.string, length + 1);
|
671
|
+
if(string && length > 0) {
|
672
|
+
strncpy(tk->scan.string, string, length);
|
673
|
+
tk->scan.string[length] = 0;
|
674
|
+
}
|
675
|
+
tk->scan.length = length;
|
676
|
+
return;
|
677
|
+
}
|
678
|
+
|
679
|
+
void tokenizer_free_scan_string(struct tokenizer_t *tk)
|
680
|
+
{
|
681
|
+
tokenizer_set_scan_string(tk, NULL, 0);
|
682
|
+
return;
|
683
|
+
}
|
684
|
+
|
645
685
|
static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
|
646
686
|
{
|
647
687
|
struct tokenizer_t *tk = NULL;
|
648
688
|
char *c_source;
|
649
|
-
char *old;
|
650
689
|
|
651
690
|
if(NIL_P(source))
|
652
691
|
return Qnil;
|
@@ -656,19 +695,13 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
|
|
656
695
|
|
657
696
|
c_source = StringValueCStr(source);
|
658
697
|
tk->scan.cursor = 0;
|
659
|
-
tk
|
660
|
-
|
661
|
-
|
662
|
-
REALLOC_N(tk->scan.string, char, tk->scan.length+1);
|
663
|
-
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
664
|
-
tk->scan.string, tk->scan.length+1);
|
665
|
-
strncpy(tk->scan.string, c_source, tk->scan.length);
|
698
|
+
tokenizer_set_scan_string(tk, c_source, strlen(c_source));
|
699
|
+
tk->scan.enc_index = rb_enc_get_index(source);
|
700
|
+
tk->scan.mb_cursor = 0;
|
666
701
|
|
667
702
|
tokenizer_scan_all(tk);
|
668
703
|
|
669
|
-
|
670
|
-
xfree(tk->scan.string);
|
671
|
-
tk->scan.string = NULL;
|
704
|
+
tokenizer_free_scan_string(tk);
|
672
705
|
|
673
706
|
return Qtrue;
|
674
707
|
}
|
@@ -43,6 +43,9 @@ struct scan_t {
|
|
43
43
|
char *string;
|
44
44
|
long unsigned int cursor;
|
45
45
|
long unsigned int length;
|
46
|
+
|
47
|
+
int enc_index;
|
48
|
+
long unsigned int mb_cursor;
|
46
49
|
};
|
47
50
|
|
48
51
|
struct tokenizer_t
|
@@ -67,6 +70,9 @@ struct tokenizer_t
|
|
67
70
|
|
68
71
|
void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
|
69
72
|
void tokenizer_init(struct tokenizer_t *tk);
|
73
|
+
void tokenizer_free_members(struct tokenizer_t *tk);
|
74
|
+
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length);
|
75
|
+
void tokenizer_free_scan_string(struct tokenizer_t *tk);
|
70
76
|
void tokenizer_scan_all(struct tokenizer_t *tk);
|
71
77
|
VALUE token_type_to_symbol(enum token_type type);
|
72
78
|
|
data/html_tokenizer.gemspec
CHANGED
data/lib/html_tokenizer.rb
CHANGED
@@ -2,9 +2,10 @@ require 'html_tokenizer_ext'
|
|
2
2
|
|
3
3
|
module HtmlTokenizer
|
4
4
|
class ParserError < RuntimeError
|
5
|
-
attr_reader :line, :column
|
6
|
-
def initialize(message, line, column)
|
5
|
+
attr_reader :position, :line, :column
|
6
|
+
def initialize(message, position, line, column)
|
7
7
|
super(message)
|
8
|
+
@position = position
|
8
9
|
@line = line
|
9
10
|
@column = column
|
10
11
|
end
|
data/test/unit/parser_test.rb
CHANGED
@@ -431,35 +431,13 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
431
431
|
tokens << token
|
432
432
|
end
|
433
433
|
assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
|
434
|
-
assert_equal "bar\n", @parser.extract(34, 38)
|
435
|
-
end
|
436
|
-
|
437
|
-
def test_extract_method
|
438
|
-
parse("abcdefg")
|
439
|
-
assert_equal "a", @parser.extract(0, 1)
|
440
|
-
assert_equal "cd", @parser.extract(2, 4)
|
441
|
-
end
|
442
|
-
|
443
|
-
def test_extract_method_raises_argument_error_end_past_length
|
444
|
-
parse("abcdefg")
|
445
|
-
e = assert_raises(ArgumentError) do
|
446
|
-
@parser.extract(0, 32)
|
447
|
-
end
|
448
|
-
assert_equal "'end' argument not in range of document", e.message
|
449
|
-
end
|
450
|
-
|
451
|
-
def test_extract_method_raises_argument_error_end_less_than_start
|
452
|
-
parse("abcdefg")
|
453
|
-
e = assert_raises(ArgumentError) do
|
454
|
-
@parser.extract(1, 0)
|
455
|
-
end
|
456
|
-
assert_equal "'end' must be greater or equal than 'start'", e.message
|
457
434
|
end
|
458
435
|
|
459
436
|
def test_solidus_or_tag_name_error
|
460
437
|
parse('<>')
|
461
438
|
assert_equal 1, @parser.errors_count
|
462
439
|
assert_equal "expected '/' or tag name", @parser.errors.first.to_s
|
440
|
+
assert_equal 1, @parser.errors.first.position
|
463
441
|
assert_equal 1, @parser.errors.first.line
|
464
442
|
assert_equal 1, @parser.errors.first.column
|
465
443
|
end
|
@@ -468,6 +446,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
468
446
|
parse('< ')
|
469
447
|
assert_equal 1, @parser.errors_count
|
470
448
|
assert_equal "expected '/' or tag name", @parser.errors.first.to_s
|
449
|
+
assert_equal 1, @parser.errors.first.position
|
471
450
|
assert_equal 1, @parser.errors.first.line
|
472
451
|
assert_equal 1, @parser.errors.first.column
|
473
452
|
end
|
@@ -476,6 +455,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
476
455
|
parse('<foo =')
|
477
456
|
assert_equal 1, @parser.errors_count
|
478
457
|
assert_equal "expected whitespace, '>', attribute name or value", @parser.errors.first.to_s
|
458
|
+
assert_equal 5, @parser.errors.first.position
|
479
459
|
assert_equal 1, @parser.errors.first.line
|
480
460
|
assert_equal 5, @parser.errors.first.column
|
481
461
|
end
|
@@ -484,6 +464,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
484
464
|
parse('<foo /x')
|
485
465
|
assert_equal 1, @parser.errors_count
|
486
466
|
assert_equal "expected '>' after '/'", @parser.errors.first.to_s
|
467
|
+
assert_equal 6, @parser.errors.first.position
|
487
468
|
assert_equal 1, @parser.errors.first.line
|
488
469
|
assert_equal 6, @parser.errors.first.column
|
489
470
|
end
|
@@ -492,6 +473,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
492
473
|
parse('<foo / ')
|
493
474
|
assert_equal 1, @parser.errors_count
|
494
475
|
assert_equal "expected '>' after '/'", @parser.errors.first.to_s
|
476
|
+
assert_equal 6, @parser.errors.first.position
|
495
477
|
assert_equal 1, @parser.errors.first.line
|
496
478
|
assert_equal 6, @parser.errors.first.column
|
497
479
|
end
|
@@ -499,29 +481,33 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
499
481
|
def test_attribute_name_error
|
500
482
|
parse('<foo bar~')
|
501
483
|
assert_equal 2, @parser.errors_count
|
502
|
-
assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors.first.to_s
|
503
|
-
assert_equal 1, @parser.errors.first.line
|
504
|
-
assert_equal 8, @parser.errors.first.column
|
505
484
|
assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors[0].to_s
|
485
|
+
assert_equal 8, @parser.errors.first.position
|
506
486
|
assert_equal 1, @parser.errors[0].line
|
507
487
|
assert_equal 8, @parser.errors[0].column
|
488
|
+
assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
|
489
|
+
assert_equal 8, @parser.errors.first.position
|
490
|
+
assert_equal 1, @parser.errors[1].line
|
491
|
+
assert_equal 8, @parser.errors[1].column
|
508
492
|
end
|
509
493
|
|
510
494
|
def test_attribute_whitespace_or_equal_error
|
511
495
|
parse('<foo bar ~')
|
512
496
|
assert_equal 2, @parser.errors_count
|
513
|
-
assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors.first.to_s
|
514
|
-
assert_equal 1, @parser.errors.first.line
|
515
|
-
assert_equal 9, @parser.errors.first.column
|
516
497
|
assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors[0].to_s
|
517
498
|
assert_equal 1, @parser.errors[0].line
|
518
499
|
assert_equal 9, @parser.errors[0].column
|
500
|
+
assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
|
501
|
+
assert_equal 9, @parser.errors.first.position
|
502
|
+
assert_equal 1, @parser.errors[1].line
|
503
|
+
assert_equal 9, @parser.errors[1].column
|
519
504
|
end
|
520
505
|
|
521
506
|
def test_attribute_whitespace_or_equal_error_2
|
522
507
|
parse('<foo bar = >')
|
523
508
|
assert_equal 1, @parser.errors_count
|
524
509
|
assert_equal "expected attribute value after '='", @parser.errors.first.to_s
|
510
|
+
assert_equal 11, @parser.errors.first.position
|
525
511
|
assert_equal 1, @parser.errors.first.line
|
526
512
|
assert_equal 11, @parser.errors.first.column
|
527
513
|
end
|
@@ -530,10 +516,34 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
530
516
|
parse('<foo bar=""x')
|
531
517
|
assert_equal 1, @parser.errors_count
|
532
518
|
assert_equal "expected space after attribute value", @parser.errors.first.to_s
|
519
|
+
assert_equal 11, @parser.errors.first.position
|
533
520
|
assert_equal 1, @parser.errors.first.line
|
534
521
|
assert_equal 11, @parser.errors.first.column
|
535
522
|
end
|
536
523
|
|
524
|
+
def test_attribute_with_mutlibyte_characters
|
525
|
+
data = ["<div title", "='your store’s'>"]
|
526
|
+
tokens = []
|
527
|
+
parse(*data) { |name, start, stop| tokens << [name, start, stop, data.join[start...stop]] }
|
528
|
+
assert_equal "div", @parser.tag_name
|
529
|
+
assert_equal "title", @parser.attribute_name
|
530
|
+
assert_equal "your store’s", @parser.attribute_value
|
531
|
+
assert_equal data.join, @parser.document
|
532
|
+
assert_equal data.join.size, @parser.document_length
|
533
|
+
assert_equal data.join.size, @parser.column_number
|
534
|
+
assert_equal [
|
535
|
+
[:tag_start, 0, 1, "<"],
|
536
|
+
[:tag_name, 1, 4, "div"],
|
537
|
+
[:whitespace, 4, 5, " "],
|
538
|
+
[:attribute_name, 5, 10, "title"],
|
539
|
+
[:equal, 10, 11, "="],
|
540
|
+
[:attribute_quoted_value_start, 11, 12, "'"],
|
541
|
+
[:attribute_quoted_value, 12, 24, "your store’s"],
|
542
|
+
[:attribute_quoted_value_end, 24, 25, "'"],
|
543
|
+
[:tag_end, 25, 26, ">"],
|
544
|
+
], tokens
|
545
|
+
end
|
546
|
+
|
537
547
|
def test_valid_syntaxes
|
538
548
|
parse(
|
539
549
|
'<div>',
|
@@ -564,6 +574,15 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
564
574
|
assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
|
565
575
|
end
|
566
576
|
|
577
|
+
def test_doctype_without_space
|
578
|
+
parse('<!DOCTYPE')
|
579
|
+
assert_equal "!DOCTYPE", @parser.tag_name
|
580
|
+
parse('foo')
|
581
|
+
assert_equal "!DOCTYPEfoo", @parser.tag_name
|
582
|
+
|
583
|
+
assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
|
584
|
+
end
|
585
|
+
|
567
586
|
private
|
568
587
|
|
569
588
|
def parse(*parts, &block)
|
data/test/unit/tokenizer_test.rb
CHANGED
@@ -324,13 +324,34 @@ class HtmlTokenizer::TokenizerTest < Minitest::Test
|
|
324
324
|
], result
|
325
325
|
end
|
326
326
|
|
327
|
+
def test_html_with_mutlibyte_characters
|
328
|
+
data = "<div title='your store’s'>foo</div>"
|
329
|
+
result = tokenize(data)
|
330
|
+
assert_equal [
|
331
|
+
[:tag_start, "<"],
|
332
|
+
[:tag_name, "div"],
|
333
|
+
[:whitespace, " "],
|
334
|
+
[:attribute_name, "title"],
|
335
|
+
[:equal, "="],
|
336
|
+
[:attribute_quoted_value_start, "'"],
|
337
|
+
[:attribute_quoted_value, "your store’s"],
|
338
|
+
[:attribute_quoted_value_end, "'"],
|
339
|
+
[:tag_end, ">"],
|
340
|
+
[:text, "foo"],
|
341
|
+
[:tag_start, "<"],
|
342
|
+
[:solidus, "/"],
|
343
|
+
[:tag_name, "div"],
|
344
|
+
[:tag_end, ">"],
|
345
|
+
], result
|
346
|
+
end
|
347
|
+
|
327
348
|
private
|
328
349
|
|
329
350
|
def tokenize(*parts)
|
330
351
|
tokens = []
|
331
352
|
@tokenizer = HtmlTokenizer::Tokenizer.new
|
332
353
|
parts.each do |part|
|
333
|
-
@tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start
|
354
|
+
@tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start...stop]] }
|
334
355
|
end
|
335
356
|
tokens
|
336
357
|
end
|
metadata
CHANGED
@@ -1,55 +1,55 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francois Chagnon
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-05-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ~>
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ~>
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rake-compiler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - ~>
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: '0'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - ~>
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: minitest
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - ~>
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
47
|
version: '0'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - ~>
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
54
|
version: '0'
|
55
55
|
description:
|
@@ -60,8 +60,8 @@ extensions:
|
|
60
60
|
- ext/html_tokenizer_ext/extconf.rb
|
61
61
|
extra_rdoc_files: []
|
62
62
|
files:
|
63
|
-
- .autotest
|
64
|
-
- .gitignore
|
63
|
+
- ".autotest"
|
64
|
+
- ".gitignore"
|
65
65
|
- Gemfile
|
66
66
|
- Gemfile.lock
|
67
67
|
- LICENSE
|
@@ -90,17 +90,17 @@ require_paths:
|
|
90
90
|
- ext
|
91
91
|
required_ruby_version: !ruby/object:Gem::Requirement
|
92
92
|
requirements:
|
93
|
-
- -
|
93
|
+
- - ">="
|
94
94
|
- !ruby/object:Gem::Version
|
95
95
|
version: '0'
|
96
96
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
97
97
|
requirements:
|
98
|
-
- -
|
98
|
+
- - ">="
|
99
99
|
- !ruby/object:Gem::Version
|
100
100
|
version: '0'
|
101
101
|
requirements: []
|
102
102
|
rubyforge_project:
|
103
|
-
rubygems_version: 2.
|
103
|
+
rubygems_version: 2.7.6
|
104
104
|
signing_key:
|
105
105
|
specification_version: 4
|
106
106
|
summary: HTML Tokenizer
|