html_tokenizer 0.0.1 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/Gemfile.lock +3 -3
- data/ext/html_tokenizer_ext/extconf.rb +7 -1
- data/ext/html_tokenizer_ext/parser.c +64 -38
- data/ext/html_tokenizer_ext/parser.h +6 -0
- data/ext/html_tokenizer_ext/tokenizer.c +55 -22
- data/ext/html_tokenizer_ext/tokenizer.h +6 -0
- data/html_tokenizer.gemspec +1 -1
- data/lib/html_tokenizer.rb +3 -2
- data/test/unit/parser_test.rb +48 -29
- data/test/unit/tokenizer_test.rb +22 -1
- metadata +13 -13
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 7f63f5699a8e9dc129392fa0d554196d9c2322c43f19cd21b353194b90d1c0f1
|
|
4
|
+
data.tar.gz: db308d4bb26d5181da91f9353e29d12d7aa822f02d1f0214959663516891781b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7f70e313d9206393e094b38569584f2f5e95bf3cb8abbd840fd063652da7d66b4c4c3a29e55b900f8da4f3c672d312dedb3895cebc5727292287e60b41b3049d
|
|
7
|
+
data.tar.gz: 9999d3a9e5c51ac426cb33551f14dcba980b7ee45543c000422ea7c80fe6884b34297790f977f53b95d6d128a35d49bcfa052b8b005f52d33d1f1a675d599ec8
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
html_tokenizer (0.0.
|
|
4
|
+
html_tokenizer (0.0.7)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
8
8
|
specs:
|
|
9
9
|
minitest (5.9.0)
|
|
10
|
-
rake (
|
|
10
|
+
rake (12.3.0)
|
|
11
11
|
rake-compiler (0.9.9)
|
|
12
12
|
rake
|
|
13
13
|
|
|
@@ -21,4 +21,4 @@ DEPENDENCIES
|
|
|
21
21
|
rake-compiler
|
|
22
22
|
|
|
23
23
|
BUNDLED WITH
|
|
24
|
-
1.
|
|
24
|
+
1.16.1
|
|
@@ -1,6 +1,12 @@
|
|
|
1
1
|
require 'mkmf'
|
|
2
2
|
|
|
3
3
|
$CXXFLAGS += " -std=c++11 "
|
|
4
|
-
$CXXFLAGS += " -g -
|
|
4
|
+
$CXXFLAGS += " -g -O1 -ggdb "
|
|
5
|
+
$CFLAGS += " -g -O1 -ggdb "
|
|
6
|
+
|
|
7
|
+
if ENV['DEBUG']
|
|
8
|
+
$CXXFLAGS += " -DDEBUG "
|
|
9
|
+
$CFLAGS += " -DDEBUG "
|
|
10
|
+
end
|
|
5
11
|
|
|
6
12
|
create_makefile('html_tokenizer_ext')
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#include <ruby.h>
|
|
2
|
+
#include <ruby/encoding.h>
|
|
2
3
|
#include "html_tokenizer.h"
|
|
3
4
|
#include "parser.h"
|
|
4
5
|
|
|
@@ -13,6 +14,7 @@ static void parser_free(void *ptr)
|
|
|
13
14
|
size_t i;
|
|
14
15
|
|
|
15
16
|
if(parser) {
|
|
17
|
+
tokenizer_free_members(&parser->tk);
|
|
16
18
|
if(parser->doc.data) {
|
|
17
19
|
DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
|
|
18
20
|
xfree(parser->doc.data);
|
|
@@ -65,6 +67,7 @@ static inline void parser_append_ref(struct token_reference_t *dest, struct toke
|
|
|
65
67
|
if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
|
|
66
68
|
dest->type = src->type;
|
|
67
69
|
dest->start = src->start;
|
|
70
|
+
dest->mb_start = src->mb_start;
|
|
68
71
|
dest->length = src->length;
|
|
69
72
|
dest->line_number = src->line_number;
|
|
70
73
|
dest->column_number = src->column_number;
|
|
@@ -79,6 +82,8 @@ static void parser_add_error(struct parser_t *parser, const char *message)
|
|
|
79
82
|
{
|
|
80
83
|
REALLOC_N(parser->errors, struct parser_document_error_t, parser->errors_count + 1);
|
|
81
84
|
parser->errors[parser->errors_count].message = strdup(message);
|
|
85
|
+
parser->errors[parser->errors_count].pos = parser->tk.scan.cursor;
|
|
86
|
+
parser->errors[parser->errors_count].mb_pos = parser->tk.scan.mb_cursor;
|
|
82
87
|
parser->errors[parser->errors_count].line_number = parser->doc.line_number;
|
|
83
88
|
parser->errors[parser->errors_count].column_number = parser->doc.column_number;
|
|
84
89
|
parser->errors_count += 1;
|
|
@@ -362,15 +367,21 @@ static inline int rawtext_context(struct parser_t *parser)
|
|
|
362
367
|
|
|
363
368
|
static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
|
|
364
369
|
{
|
|
370
|
+
rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
|
|
365
371
|
long unsigned int i;
|
|
372
|
+
const char *buf, *nextlf;
|
|
366
373
|
|
|
367
|
-
for(i =
|
|
368
|
-
|
|
374
|
+
for(i = 0; i < length;) {
|
|
375
|
+
buf = &parser->doc.data[start + i];
|
|
376
|
+
nextlf = memchr(buf, '\n', length - i);
|
|
377
|
+
if(nextlf) {
|
|
369
378
|
parser->doc.column_number = 0;
|
|
370
379
|
parser->doc.line_number += 1;
|
|
380
|
+
i += (nextlf - buf) + 1;
|
|
371
381
|
}
|
|
372
382
|
else {
|
|
373
|
-
parser->doc.column_number +=
|
|
383
|
+
parser->doc.column_number += rb_enc_strlen(buf, buf + length - i, enc);
|
|
384
|
+
break;
|
|
374
385
|
}
|
|
375
386
|
}
|
|
376
387
|
|
|
@@ -383,11 +394,14 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
|
|
|
383
394
|
struct token_reference_t ref = {
|
|
384
395
|
.type = type,
|
|
385
396
|
.start = tk->scan.cursor,
|
|
397
|
+
.mb_start = tk->scan.mb_cursor,
|
|
386
398
|
.length = length,
|
|
387
399
|
.line_number = parser->doc.line_number,
|
|
388
400
|
.column_number = parser->doc.column_number,
|
|
389
401
|
};
|
|
390
402
|
int parse_again = 1;
|
|
403
|
+
long unsigned int mb_strlen;
|
|
404
|
+
rb_encoding *enc;
|
|
391
405
|
|
|
392
406
|
while(parse_again) {
|
|
393
407
|
switch(parser->context)
|
|
@@ -438,8 +452,10 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
|
|
|
438
452
|
}
|
|
439
453
|
|
|
440
454
|
if(rb_block_given_p()) {
|
|
455
|
+
enc = rb_enc_from_index(parser->doc.enc_index);
|
|
456
|
+
mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
|
|
441
457
|
rb_yield_values(5, token_type_to_symbol(type),
|
|
442
|
-
INT2NUM(ref.
|
|
458
|
+
INT2NUM(ref.mb_start), INT2NUM(ref.mb_start + mb_strlen),
|
|
443
459
|
INT2NUM(ref.line_number), INT2NUM(ref.column_number));
|
|
444
460
|
}
|
|
445
461
|
|
|
@@ -465,6 +481,8 @@ static VALUE parser_initialize_method(VALUE self)
|
|
|
465
481
|
|
|
466
482
|
parser->doc.length = 0;
|
|
467
483
|
parser->doc.data = NULL;
|
|
484
|
+
parser->doc.enc_index = 0;
|
|
485
|
+
parser->doc.mb_length = 0;
|
|
468
486
|
|
|
469
487
|
parser->doc.line_number = 1;
|
|
470
488
|
parser->doc.column_number = 0;
|
|
@@ -478,11 +496,17 @@ static VALUE parser_initialize_method(VALUE self)
|
|
|
478
496
|
static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
|
|
479
497
|
{
|
|
480
498
|
void *old = parser->doc.data;
|
|
499
|
+
unsigned long int mb_length;
|
|
500
|
+
char *buf;
|
|
501
|
+
rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
|
|
481
502
|
REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
|
|
482
503
|
DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
|
|
483
|
-
parser->doc.data,
|
|
484
|
-
|
|
504
|
+
parser->doc.data, parser->doc.length + length + 1);
|
|
505
|
+
buf = parser->doc.data + parser->doc.length;
|
|
506
|
+
strcpy(buf, string);
|
|
507
|
+
mb_length = rb_enc_strlen(buf, buf + length, enc);
|
|
485
508
|
parser->doc.length += length;
|
|
509
|
+
parser->doc.mb_length += mb_length;
|
|
486
510
|
return 1;
|
|
487
511
|
}
|
|
488
512
|
|
|
@@ -490,7 +514,7 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
|
|
|
490
514
|
{
|
|
491
515
|
struct parser_t *parser = NULL;
|
|
492
516
|
char *string = NULL;
|
|
493
|
-
long unsigned int length = 0, cursor = 0;
|
|
517
|
+
long unsigned int length = 0, cursor = 0, mb_cursor = 0;
|
|
494
518
|
|
|
495
519
|
if(NIL_P(source))
|
|
496
520
|
return Qnil;
|
|
@@ -502,6 +526,15 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
|
|
|
502
526
|
length = strlen(string);
|
|
503
527
|
|
|
504
528
|
cursor = parser->doc.length;
|
|
529
|
+
mb_cursor = parser->doc.mb_length;
|
|
530
|
+
|
|
531
|
+
if(parser->doc.data == NULL) {
|
|
532
|
+
parser->doc.enc_index = rb_enc_get_index(source);
|
|
533
|
+
}
|
|
534
|
+
else if(parser->doc.enc_index != rb_enc_get_index(source)) {
|
|
535
|
+
rb_raise(rb_eArgError, "cannot append %s string to %s document",
|
|
536
|
+
rb_enc_name(rb_enc_get(source)), rb_enc_name(rb_enc_from_index(parser->doc.enc_index)));
|
|
537
|
+
}
|
|
505
538
|
|
|
506
539
|
if(!parser_document_append(parser, string, length)) {
|
|
507
540
|
// error
|
|
@@ -513,10 +546,12 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
|
|
|
513
546
|
}
|
|
514
547
|
else {
|
|
515
548
|
parser->tk.scan.cursor = cursor;
|
|
516
|
-
parser->tk.
|
|
517
|
-
parser->tk.scan.
|
|
549
|
+
tokenizer_set_scan_string(&parser->tk, parser->doc.data, parser->doc.length);
|
|
550
|
+
parser->tk.scan.enc_index = parser->doc.enc_index;
|
|
551
|
+
parser->tk.scan.mb_cursor = mb_cursor;
|
|
518
552
|
|
|
519
553
|
tokenizer_scan_all(&parser->tk);
|
|
554
|
+
tokenizer_free_scan_string(&parser->tk);
|
|
520
555
|
}
|
|
521
556
|
|
|
522
557
|
return Qtrue;
|
|
@@ -535,17 +570,30 @@ static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
|
|
|
535
570
|
static VALUE parser_document_method(VALUE self)
|
|
536
571
|
{
|
|
537
572
|
struct parser_t *parser = NULL;
|
|
573
|
+
rb_encoding *enc;
|
|
538
574
|
Parser_Get_Struct(self, parser);
|
|
539
575
|
if(!parser->doc.data)
|
|
540
576
|
return Qnil;
|
|
541
|
-
|
|
577
|
+
enc = rb_enc_from_index(parser->doc.enc_index);
|
|
578
|
+
return rb_enc_str_new(parser->doc.data, parser->doc.length, enc);
|
|
542
579
|
}
|
|
543
580
|
|
|
544
581
|
static VALUE parser_document_length_method(VALUE self)
|
|
545
582
|
{
|
|
546
583
|
struct parser_t *parser = NULL;
|
|
584
|
+
rb_encoding *enc;
|
|
585
|
+
const char *buf;
|
|
586
|
+
|
|
547
587
|
Parser_Get_Struct(self, parser);
|
|
548
|
-
|
|
588
|
+
|
|
589
|
+
if(parser->doc.data == NULL) {
|
|
590
|
+
return ULONG2NUM(0);
|
|
591
|
+
}
|
|
592
|
+
else {
|
|
593
|
+
buf = parser->doc.data;
|
|
594
|
+
enc = rb_enc_from_index(parser->doc.enc_index);
|
|
595
|
+
return ULONG2NUM(rb_enc_strlen(buf, buf + parser->doc.length, enc));
|
|
596
|
+
}
|
|
549
597
|
}
|
|
550
598
|
|
|
551
599
|
static VALUE parser_context_method(VALUE self)
|
|
@@ -588,9 +636,10 @@ static VALUE parser_context_method(VALUE self)
|
|
|
588
636
|
|
|
589
637
|
static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
|
|
590
638
|
{
|
|
639
|
+
rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
|
|
591
640
|
if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
|
|
592
641
|
return Qnil;
|
|
593
|
-
return
|
|
642
|
+
return rb_enc_str_new(parser->doc.data+ref->start, ref->length, enc);
|
|
594
643
|
}
|
|
595
644
|
|
|
596
645
|
static VALUE parser_tag_name_method(VALUE self)
|
|
@@ -665,29 +714,6 @@ static VALUE parser_rawtext_text_method(VALUE self)
|
|
|
665
714
|
return ref_to_str(parser, &parser->rawtext.text);
|
|
666
715
|
}
|
|
667
716
|
|
|
668
|
-
static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
|
|
669
|
-
{
|
|
670
|
-
struct parser_t *parser = NULL;
|
|
671
|
-
unsigned long int start, end;
|
|
672
|
-
struct token_reference_t ref;
|
|
673
|
-
|
|
674
|
-
Parser_Get_Struct(self, parser);
|
|
675
|
-
|
|
676
|
-
start = NUM2ULONG(start_p);
|
|
677
|
-
end = NUM2ULONG(end_p);
|
|
678
|
-
if(end < start) {
|
|
679
|
-
rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
|
|
680
|
-
}
|
|
681
|
-
if(end > parser->doc.length) {
|
|
682
|
-
rb_raise(rb_eArgError, "'end' argument not in range of document");
|
|
683
|
-
}
|
|
684
|
-
|
|
685
|
-
ref.type = TOKEN_TEXT; // anything not NONE
|
|
686
|
-
ref.start = start;
|
|
687
|
-
ref.length = end - start;
|
|
688
|
-
return ref_to_str(parser, &ref);
|
|
689
|
-
}
|
|
690
|
-
|
|
691
717
|
static VALUE parser_errors_count_method(VALUE self)
|
|
692
718
|
{
|
|
693
719
|
struct parser_t *parser = NULL;
|
|
@@ -699,12 +725,13 @@ static VALUE create_parser_error(struct parser_document_error_t *error)
|
|
|
699
725
|
{
|
|
700
726
|
VALUE module = rb_const_get(rb_cObject, rb_intern("HtmlTokenizer"));
|
|
701
727
|
VALUE klass = rb_const_get(module, rb_intern("ParserError"));
|
|
702
|
-
VALUE args[
|
|
728
|
+
VALUE args[4] = {
|
|
703
729
|
rb_str_new2(error->message),
|
|
730
|
+
ULONG2NUM(error->mb_pos),
|
|
704
731
|
ULONG2NUM(error->line_number),
|
|
705
732
|
ULONG2NUM(error->column_number),
|
|
706
733
|
};
|
|
707
|
-
return rb_class_new_instance(
|
|
734
|
+
return rb_class_new_instance(4, args, klass);
|
|
708
735
|
}
|
|
709
736
|
|
|
710
737
|
static VALUE parser_errors_method(VALUE self, VALUE error_p)
|
|
@@ -749,7 +776,6 @@ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
|
|
|
749
776
|
rb_define_method(cParser, "column_number", parser_column_number_method, 0);
|
|
750
777
|
rb_define_method(cParser, "parse", parser_parse_method, 1);
|
|
751
778
|
rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
|
|
752
|
-
rb_define_method(cParser, "extract", parser_extract_method, 2);
|
|
753
779
|
rb_define_method(cParser, "context", parser_context_method, 0);
|
|
754
780
|
rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
|
|
755
781
|
rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
|
|
@@ -19,6 +19,8 @@ enum parser_context {
|
|
|
19
19
|
|
|
20
20
|
struct parser_document_error_t {
|
|
21
21
|
char *message;
|
|
22
|
+
long unsigned int pos;
|
|
23
|
+
long unsigned int mb_pos;
|
|
22
24
|
long unsigned int line_number;
|
|
23
25
|
long unsigned int column_number;
|
|
24
26
|
};
|
|
@@ -28,11 +30,15 @@ struct parser_document_t {
|
|
|
28
30
|
char *data;
|
|
29
31
|
long unsigned int line_number;
|
|
30
32
|
long unsigned int column_number;
|
|
33
|
+
|
|
34
|
+
int enc_index;
|
|
35
|
+
long unsigned int mb_length;
|
|
31
36
|
};
|
|
32
37
|
|
|
33
38
|
struct token_reference_t {
|
|
34
39
|
enum token_type type;
|
|
35
40
|
long unsigned int start;
|
|
41
|
+
long unsigned int mb_start;
|
|
36
42
|
long unsigned int length;
|
|
37
43
|
long unsigned int line_number;
|
|
38
44
|
long unsigned int column_number;
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
#include <ruby.h>
|
|
2
|
+
#include <ruby/encoding.h>
|
|
2
3
|
#include "html_tokenizer.h"
|
|
3
4
|
#include "tokenizer.h"
|
|
4
5
|
|
|
@@ -11,16 +12,7 @@ static void tokenizer_free(void *ptr)
|
|
|
11
12
|
{
|
|
12
13
|
struct tokenizer_t *tk = ptr;
|
|
13
14
|
if(tk) {
|
|
14
|
-
|
|
15
|
-
DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
|
|
16
|
-
xfree(tk->current_tag);
|
|
17
|
-
tk->current_tag = NULL;
|
|
18
|
-
}
|
|
19
|
-
if(tk->scan.string) {
|
|
20
|
-
DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
|
|
21
|
-
xfree(tk->scan.string);
|
|
22
|
-
tk->scan.string = NULL;
|
|
23
|
-
}
|
|
15
|
+
tokenizer_free_members(tk);
|
|
24
16
|
DBG_PRINT("tk=%p xfree(tk)", tk);
|
|
25
17
|
xfree(tk);
|
|
26
18
|
}
|
|
@@ -60,6 +52,8 @@ void tokenizer_init(struct tokenizer_t *tk)
|
|
|
60
52
|
tk->scan.string = NULL;
|
|
61
53
|
tk->scan.cursor = 0;
|
|
62
54
|
tk->scan.length = 0;
|
|
55
|
+
tk->scan.mb_cursor = 0;
|
|
56
|
+
tk->scan.enc_index = 0;
|
|
63
57
|
|
|
64
58
|
tk->attribute_value_start = 0;
|
|
65
59
|
tk->found_attribute = 0;
|
|
@@ -72,6 +66,21 @@ void tokenizer_init(struct tokenizer_t *tk)
|
|
|
72
66
|
return;
|
|
73
67
|
}
|
|
74
68
|
|
|
69
|
+
void tokenizer_free_members(struct tokenizer_t *tk)
|
|
70
|
+
{
|
|
71
|
+
if(tk->current_tag) {
|
|
72
|
+
DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
|
|
73
|
+
xfree(tk->current_tag);
|
|
74
|
+
tk->current_tag = NULL;
|
|
75
|
+
}
|
|
76
|
+
if(tk->scan.string) {
|
|
77
|
+
DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
|
|
78
|
+
xfree(tk->scan.string);
|
|
79
|
+
tk->scan.string = NULL;
|
|
80
|
+
}
|
|
81
|
+
return;
|
|
82
|
+
}
|
|
83
|
+
|
|
75
84
|
VALUE token_type_to_symbol(enum token_type type)
|
|
76
85
|
{
|
|
77
86
|
switch(type) {
|
|
@@ -115,17 +124,27 @@ VALUE token_type_to_symbol(enum token_type type)
|
|
|
115
124
|
return Qnil;
|
|
116
125
|
}
|
|
117
126
|
|
|
127
|
+
static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
|
|
128
|
+
{
|
|
129
|
+
rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
|
|
130
|
+
const char *buf = tk->scan.string + tk->scan.cursor;
|
|
131
|
+
return rb_enc_strlen(buf, buf + length, enc);
|
|
132
|
+
}
|
|
133
|
+
|
|
118
134
|
static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
|
|
119
135
|
{
|
|
136
|
+
long unsigned int mb_length = tokenizer_mblength(tk, length);
|
|
120
137
|
tk->last_token = type;
|
|
121
|
-
rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.
|
|
138
|
+
rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
|
|
122
139
|
}
|
|
123
140
|
|
|
124
141
|
static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
|
|
125
142
|
{
|
|
143
|
+
long unsigned int mb_length = tokenizer_mblength(tk, length);
|
|
126
144
|
if(tk->f_callback)
|
|
127
145
|
tk->f_callback(tk, type, length, tk->callback_data);
|
|
128
146
|
tk->scan.cursor += length;
|
|
147
|
+
tk->scan.mb_cursor += mb_length;
|
|
129
148
|
}
|
|
130
149
|
|
|
131
150
|
static VALUE tokenizer_initialize_method(VALUE self)
|
|
@@ -356,6 +375,7 @@ static int scan_open_tag(struct tokenizer_t *tk)
|
|
|
356
375
|
else if(is_doctype(&tk->scan)) {
|
|
357
376
|
tokenizer_callback(tk, TOKEN_TAG_START, 1);
|
|
358
377
|
tokenizer_callback(tk, TOKEN_TAG_NAME, 8);
|
|
378
|
+
push_context(tk, TOKENIZER_TAG_NAME);
|
|
359
379
|
return 1;
|
|
360
380
|
}
|
|
361
381
|
else if(is_cdata_start(&tk->scan)) {
|
|
@@ -642,11 +662,30 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
|
|
|
642
662
|
return;
|
|
643
663
|
}
|
|
644
664
|
|
|
665
|
+
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
|
|
666
|
+
{
|
|
667
|
+
const char *old = tk->scan.string;
|
|
668
|
+
REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
|
|
669
|
+
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
|
670
|
+
tk->scan.string, length + 1);
|
|
671
|
+
if(string && length > 0) {
|
|
672
|
+
strncpy(tk->scan.string, string, length);
|
|
673
|
+
tk->scan.string[length] = 0;
|
|
674
|
+
}
|
|
675
|
+
tk->scan.length = length;
|
|
676
|
+
return;
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
void tokenizer_free_scan_string(struct tokenizer_t *tk)
|
|
680
|
+
{
|
|
681
|
+
tokenizer_set_scan_string(tk, NULL, 0);
|
|
682
|
+
return;
|
|
683
|
+
}
|
|
684
|
+
|
|
645
685
|
static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
|
|
646
686
|
{
|
|
647
687
|
struct tokenizer_t *tk = NULL;
|
|
648
688
|
char *c_source;
|
|
649
|
-
char *old;
|
|
650
689
|
|
|
651
690
|
if(NIL_P(source))
|
|
652
691
|
return Qnil;
|
|
@@ -656,19 +695,13 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
|
|
|
656
695
|
|
|
657
696
|
c_source = StringValueCStr(source);
|
|
658
697
|
tk->scan.cursor = 0;
|
|
659
|
-
tk
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
REALLOC_N(tk->scan.string, char, tk->scan.length+1);
|
|
663
|
-
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
|
664
|
-
tk->scan.string, tk->scan.length+1);
|
|
665
|
-
strncpy(tk->scan.string, c_source, tk->scan.length);
|
|
698
|
+
tokenizer_set_scan_string(tk, c_source, strlen(c_source));
|
|
699
|
+
tk->scan.enc_index = rb_enc_get_index(source);
|
|
700
|
+
tk->scan.mb_cursor = 0;
|
|
666
701
|
|
|
667
702
|
tokenizer_scan_all(tk);
|
|
668
703
|
|
|
669
|
-
|
|
670
|
-
xfree(tk->scan.string);
|
|
671
|
-
tk->scan.string = NULL;
|
|
704
|
+
tokenizer_free_scan_string(tk);
|
|
672
705
|
|
|
673
706
|
return Qtrue;
|
|
674
707
|
}
|
|
@@ -43,6 +43,9 @@ struct scan_t {
|
|
|
43
43
|
char *string;
|
|
44
44
|
long unsigned int cursor;
|
|
45
45
|
long unsigned int length;
|
|
46
|
+
|
|
47
|
+
int enc_index;
|
|
48
|
+
long unsigned int mb_cursor;
|
|
46
49
|
};
|
|
47
50
|
|
|
48
51
|
struct tokenizer_t
|
|
@@ -67,6 +70,9 @@ struct tokenizer_t
|
|
|
67
70
|
|
|
68
71
|
void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
|
|
69
72
|
void tokenizer_init(struct tokenizer_t *tk);
|
|
73
|
+
void tokenizer_free_members(struct tokenizer_t *tk);
|
|
74
|
+
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length);
|
|
75
|
+
void tokenizer_free_scan_string(struct tokenizer_t *tk);
|
|
70
76
|
void tokenizer_scan_all(struct tokenizer_t *tk);
|
|
71
77
|
VALUE token_type_to_symbol(enum token_type type);
|
|
72
78
|
|
data/html_tokenizer.gemspec
CHANGED
data/lib/html_tokenizer.rb
CHANGED
|
@@ -2,9 +2,10 @@ require 'html_tokenizer_ext'
|
|
|
2
2
|
|
|
3
3
|
module HtmlTokenizer
|
|
4
4
|
class ParserError < RuntimeError
|
|
5
|
-
attr_reader :line, :column
|
|
6
|
-
def initialize(message, line, column)
|
|
5
|
+
attr_reader :position, :line, :column
|
|
6
|
+
def initialize(message, position, line, column)
|
|
7
7
|
super(message)
|
|
8
|
+
@position = position
|
|
8
9
|
@line = line
|
|
9
10
|
@column = column
|
|
10
11
|
end
|
data/test/unit/parser_test.rb
CHANGED
|
@@ -431,35 +431,13 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
|
431
431
|
tokens << token
|
|
432
432
|
end
|
|
433
433
|
assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
|
|
434
|
-
assert_equal "bar\n", @parser.extract(34, 38)
|
|
435
|
-
end
|
|
436
|
-
|
|
437
|
-
def test_extract_method
|
|
438
|
-
parse("abcdefg")
|
|
439
|
-
assert_equal "a", @parser.extract(0, 1)
|
|
440
|
-
assert_equal "cd", @parser.extract(2, 4)
|
|
441
|
-
end
|
|
442
|
-
|
|
443
|
-
def test_extract_method_raises_argument_error_end_past_length
|
|
444
|
-
parse("abcdefg")
|
|
445
|
-
e = assert_raises(ArgumentError) do
|
|
446
|
-
@parser.extract(0, 32)
|
|
447
|
-
end
|
|
448
|
-
assert_equal "'end' argument not in range of document", e.message
|
|
449
|
-
end
|
|
450
|
-
|
|
451
|
-
def test_extract_method_raises_argument_error_end_less_than_start
|
|
452
|
-
parse("abcdefg")
|
|
453
|
-
e = assert_raises(ArgumentError) do
|
|
454
|
-
@parser.extract(1, 0)
|
|
455
|
-
end
|
|
456
|
-
assert_equal "'end' must be greater or equal than 'start'", e.message
|
|
457
434
|
end
|
|
458
435
|
|
|
459
436
|
def test_solidus_or_tag_name_error
|
|
460
437
|
parse('<>')
|
|
461
438
|
assert_equal 1, @parser.errors_count
|
|
462
439
|
assert_equal "expected '/' or tag name", @parser.errors.first.to_s
|
|
440
|
+
assert_equal 1, @parser.errors.first.position
|
|
463
441
|
assert_equal 1, @parser.errors.first.line
|
|
464
442
|
assert_equal 1, @parser.errors.first.column
|
|
465
443
|
end
|
|
@@ -468,6 +446,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
|
468
446
|
parse('< ')
|
|
469
447
|
assert_equal 1, @parser.errors_count
|
|
470
448
|
assert_equal "expected '/' or tag name", @parser.errors.first.to_s
|
|
449
|
+
assert_equal 1, @parser.errors.first.position
|
|
471
450
|
assert_equal 1, @parser.errors.first.line
|
|
472
451
|
assert_equal 1, @parser.errors.first.column
|
|
473
452
|
end
|
|
@@ -476,6 +455,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
|
476
455
|
parse('<foo =')
|
|
477
456
|
assert_equal 1, @parser.errors_count
|
|
478
457
|
assert_equal "expected whitespace, '>', attribute name or value", @parser.errors.first.to_s
|
|
458
|
+
assert_equal 5, @parser.errors.first.position
|
|
479
459
|
assert_equal 1, @parser.errors.first.line
|
|
480
460
|
assert_equal 5, @parser.errors.first.column
|
|
481
461
|
end
|
|
@@ -484,6 +464,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
|
484
464
|
parse('<foo /x')
|
|
485
465
|
assert_equal 1, @parser.errors_count
|
|
486
466
|
assert_equal "expected '>' after '/'", @parser.errors.first.to_s
|
|
467
|
+
assert_equal 6, @parser.errors.first.position
|
|
487
468
|
assert_equal 1, @parser.errors.first.line
|
|
488
469
|
assert_equal 6, @parser.errors.first.column
|
|
489
470
|
end
|
|
@@ -492,6 +473,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
|
492
473
|
parse('<foo / ')
|
|
493
474
|
assert_equal 1, @parser.errors_count
|
|
494
475
|
assert_equal "expected '>' after '/'", @parser.errors.first.to_s
|
|
476
|
+
assert_equal 6, @parser.errors.first.position
|
|
495
477
|
assert_equal 1, @parser.errors.first.line
|
|
496
478
|
assert_equal 6, @parser.errors.first.column
|
|
497
479
|
end
|
|
@@ -499,29 +481,33 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
|
499
481
|
def test_attribute_name_error
|
|
500
482
|
parse('<foo bar~')
|
|
501
483
|
assert_equal 2, @parser.errors_count
|
|
502
|
-
assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors.first.to_s
|
|
503
|
-
assert_equal 1, @parser.errors.first.line
|
|
504
|
-
assert_equal 8, @parser.errors.first.column
|
|
505
484
|
assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors[0].to_s
|
|
485
|
+
assert_equal 8, @parser.errors.first.position
|
|
506
486
|
assert_equal 1, @parser.errors[0].line
|
|
507
487
|
assert_equal 8, @parser.errors[0].column
|
|
488
|
+
assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
|
|
489
|
+
assert_equal 8, @parser.errors.first.position
|
|
490
|
+
assert_equal 1, @parser.errors[1].line
|
|
491
|
+
assert_equal 8, @parser.errors[1].column
|
|
508
492
|
end
|
|
509
493
|
|
|
510
494
|
def test_attribute_whitespace_or_equal_error
|
|
511
495
|
parse('<foo bar ~')
|
|
512
496
|
assert_equal 2, @parser.errors_count
|
|
513
|
-
assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors.first.to_s
|
|
514
|
-
assert_equal 1, @parser.errors.first.line
|
|
515
|
-
assert_equal 9, @parser.errors.first.column
|
|
516
497
|
assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors[0].to_s
|
|
517
498
|
assert_equal 1, @parser.errors[0].line
|
|
518
499
|
assert_equal 9, @parser.errors[0].column
|
|
500
|
+
assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
|
|
501
|
+
assert_equal 9, @parser.errors.first.position
|
|
502
|
+
assert_equal 1, @parser.errors[1].line
|
|
503
|
+
assert_equal 9, @parser.errors[1].column
|
|
519
504
|
end
|
|
520
505
|
|
|
521
506
|
def test_attribute_whitespace_or_equal_error_2
|
|
522
507
|
parse('<foo bar = >')
|
|
523
508
|
assert_equal 1, @parser.errors_count
|
|
524
509
|
assert_equal "expected attribute value after '='", @parser.errors.first.to_s
|
|
510
|
+
assert_equal 11, @parser.errors.first.position
|
|
525
511
|
assert_equal 1, @parser.errors.first.line
|
|
526
512
|
assert_equal 11, @parser.errors.first.column
|
|
527
513
|
end
|
|
@@ -530,10 +516,34 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
|
530
516
|
parse('<foo bar=""x')
|
|
531
517
|
assert_equal 1, @parser.errors_count
|
|
532
518
|
assert_equal "expected space after attribute value", @parser.errors.first.to_s
|
|
519
|
+
assert_equal 11, @parser.errors.first.position
|
|
533
520
|
assert_equal 1, @parser.errors.first.line
|
|
534
521
|
assert_equal 11, @parser.errors.first.column
|
|
535
522
|
end
|
|
536
523
|
|
|
524
|
+
def test_attribute_with_mutlibyte_characters
|
|
525
|
+
data = ["<div title", "='your store’s'>"]
|
|
526
|
+
tokens = []
|
|
527
|
+
parse(*data) { |name, start, stop| tokens << [name, start, stop, data.join[start...stop]] }
|
|
528
|
+
assert_equal "div", @parser.tag_name
|
|
529
|
+
assert_equal "title", @parser.attribute_name
|
|
530
|
+
assert_equal "your store’s", @parser.attribute_value
|
|
531
|
+
assert_equal data.join, @parser.document
|
|
532
|
+
assert_equal data.join.size, @parser.document_length
|
|
533
|
+
assert_equal data.join.size, @parser.column_number
|
|
534
|
+
assert_equal [
|
|
535
|
+
[:tag_start, 0, 1, "<"],
|
|
536
|
+
[:tag_name, 1, 4, "div"],
|
|
537
|
+
[:whitespace, 4, 5, " "],
|
|
538
|
+
[:attribute_name, 5, 10, "title"],
|
|
539
|
+
[:equal, 10, 11, "="],
|
|
540
|
+
[:attribute_quoted_value_start, 11, 12, "'"],
|
|
541
|
+
[:attribute_quoted_value, 12, 24, "your store’s"],
|
|
542
|
+
[:attribute_quoted_value_end, 24, 25, "'"],
|
|
543
|
+
[:tag_end, 25, 26, ">"],
|
|
544
|
+
], tokens
|
|
545
|
+
end
|
|
546
|
+
|
|
537
547
|
def test_valid_syntaxes
|
|
538
548
|
parse(
|
|
539
549
|
'<div>',
|
|
@@ -564,6 +574,15 @@ class HtmlTokenizer::ParserTest < Minitest::Test
|
|
|
564
574
|
assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
|
|
565
575
|
end
|
|
566
576
|
|
|
577
|
+
def test_doctype_without_space
|
|
578
|
+
parse('<!DOCTYPE')
|
|
579
|
+
assert_equal "!DOCTYPE", @parser.tag_name
|
|
580
|
+
parse('foo')
|
|
581
|
+
assert_equal "!DOCTYPEfoo", @parser.tag_name
|
|
582
|
+
|
|
583
|
+
assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
|
|
584
|
+
end
|
|
585
|
+
|
|
567
586
|
private
|
|
568
587
|
|
|
569
588
|
def parse(*parts, &block)
|
data/test/unit/tokenizer_test.rb
CHANGED
|
@@ -324,13 +324,34 @@ class HtmlTokenizer::TokenizerTest < Minitest::Test
|
|
|
324
324
|
], result
|
|
325
325
|
end
|
|
326
326
|
|
|
327
|
+
def test_html_with_mutlibyte_characters
|
|
328
|
+
data = "<div title='your store’s'>foo</div>"
|
|
329
|
+
result = tokenize(data)
|
|
330
|
+
assert_equal [
|
|
331
|
+
[:tag_start, "<"],
|
|
332
|
+
[:tag_name, "div"],
|
|
333
|
+
[:whitespace, " "],
|
|
334
|
+
[:attribute_name, "title"],
|
|
335
|
+
[:equal, "="],
|
|
336
|
+
[:attribute_quoted_value_start, "'"],
|
|
337
|
+
[:attribute_quoted_value, "your store’s"],
|
|
338
|
+
[:attribute_quoted_value_end, "'"],
|
|
339
|
+
[:tag_end, ">"],
|
|
340
|
+
[:text, "foo"],
|
|
341
|
+
[:tag_start, "<"],
|
|
342
|
+
[:solidus, "/"],
|
|
343
|
+
[:tag_name, "div"],
|
|
344
|
+
[:tag_end, ">"],
|
|
345
|
+
], result
|
|
346
|
+
end
|
|
347
|
+
|
|
327
348
|
private
|
|
328
349
|
|
|
329
350
|
def tokenize(*parts)
|
|
330
351
|
tokens = []
|
|
331
352
|
@tokenizer = HtmlTokenizer::Tokenizer.new
|
|
332
353
|
parts.each do |part|
|
|
333
|
-
@tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start
|
|
354
|
+
@tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start...stop]] }
|
|
334
355
|
end
|
|
335
356
|
tokens
|
|
336
357
|
end
|
metadata
CHANGED
|
@@ -1,55 +1,55 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: html_tokenizer
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 0.0.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Francois Chagnon
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2018-05-25 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rake
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
|
-
- - ~>
|
|
17
|
+
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
19
|
version: '0'
|
|
20
20
|
type: :development
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
|
-
- - ~>
|
|
24
|
+
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
26
|
version: '0'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
28
|
name: rake-compiler
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
|
-
- - ~>
|
|
31
|
+
- - "~>"
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
33
|
version: '0'
|
|
34
34
|
type: :development
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
|
-
- - ~>
|
|
38
|
+
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
40
|
version: '0'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
42
|
name: minitest
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
44
44
|
requirements:
|
|
45
|
-
- - ~>
|
|
45
|
+
- - "~>"
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
47
|
version: '0'
|
|
48
48
|
type: :development
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
|
-
- - ~>
|
|
52
|
+
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
54
|
version: '0'
|
|
55
55
|
description:
|
|
@@ -60,8 +60,8 @@ extensions:
|
|
|
60
60
|
- ext/html_tokenizer_ext/extconf.rb
|
|
61
61
|
extra_rdoc_files: []
|
|
62
62
|
files:
|
|
63
|
-
- .autotest
|
|
64
|
-
- .gitignore
|
|
63
|
+
- ".autotest"
|
|
64
|
+
- ".gitignore"
|
|
65
65
|
- Gemfile
|
|
66
66
|
- Gemfile.lock
|
|
67
67
|
- LICENSE
|
|
@@ -90,17 +90,17 @@ require_paths:
|
|
|
90
90
|
- ext
|
|
91
91
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
92
92
|
requirements:
|
|
93
|
-
- -
|
|
93
|
+
- - ">="
|
|
94
94
|
- !ruby/object:Gem::Version
|
|
95
95
|
version: '0'
|
|
96
96
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
97
|
requirements:
|
|
98
|
-
- -
|
|
98
|
+
- - ">="
|
|
99
99
|
- !ruby/object:Gem::Version
|
|
100
100
|
version: '0'
|
|
101
101
|
requirements: []
|
|
102
102
|
rubyforge_project:
|
|
103
|
-
rubygems_version: 2.
|
|
103
|
+
rubygems_version: 2.7.6
|
|
104
104
|
signing_key:
|
|
105
105
|
specification_version: 4
|
|
106
106
|
summary: HTML Tokenizer
|