html_tokenizer 0.0.1 → 0.0.2
This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/ext/html_tokenizer_ext/parser.c +56 -34
- data/ext/html_tokenizer_ext/parser.h +4 -0
- data/ext/html_tokenizer_ext/tokenizer.c +16 -1
- data/ext/html_tokenizer_ext/tokenizer.h +3 -0
- data/html_tokenizer.gemspec +1 -1
- data/test/unit/parser_test.rb +23 -23
- data/test/unit/tokenizer_test.rb +22 -1
- metadata +13 -13
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 2e620e43f97a82c4cb3aae2067a2666325b453a5
         | 
| 4 | 
            +
              data.tar.gz: 57784c1b53c4faefe2ab3b6e222836bfad852d8c
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 3b7a26cc219ea9f5885999146015e8137264ee51b43e5e6929d377cc2447ca6d4c1d5804e32954c24b2b807a19edcaf1f9cc708e7b5a0f83b086e16a21c8fa3e
         | 
| 7 | 
            +
              data.tar.gz: 46beb0ed1994fe7468ab89451de1af830251da73d15c5d0943d30b70080806a458a7531a9fb7df2d7640e5ba1a195706eb84bf8896ef9184a207a1477747c200
         | 
    
        data/Gemfile.lock
    CHANGED
    
    
        data/ext/html_tokenizer_ext/parser.c
    CHANGED
    
    
| @@ -1,4 +1,5 @@ | |
| 1 1 | 
             
            #include <ruby.h>
         | 
| 2 | 
            +
            #include <ruby/encoding.h>
         | 
| 2 3 | 
             
            #include "html_tokenizer.h"
         | 
| 3 4 | 
             
            #include "parser.h"
         | 
| 4 5 |  | 
| @@ -65,6 +66,7 @@ static inline void parser_append_ref(struct token_reference_t *dest, struct toke | |
| 65 66 | 
             
              if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
         | 
| 66 67 | 
             
                dest->type = src->type;
         | 
| 67 68 | 
             
                dest->start = src->start;
         | 
| 69 | 
            +
                dest->mb_start = src->mb_start;
         | 
| 68 70 | 
             
                dest->length = src->length;
         | 
| 69 71 | 
             
                dest->line_number = src->line_number;
         | 
| 70 72 | 
             
                dest->column_number = src->column_number;
         | 
| @@ -362,15 +364,21 @@ static inline int rawtext_context(struct parser_t *parser) | |
| 362 364 |  | 
| 363 365 | 
             
            static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
         | 
| 364 366 | 
             
            {
         | 
| 367 | 
            +
              rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
         | 
| 365 368 | 
             
              long unsigned int i;
         | 
| 369 | 
            +
              const char *buf, *nextlf;
         | 
| 366 370 |  | 
| 367 | 
            -
              for(i =  | 
| 368 | 
            -
                 | 
| 371 | 
            +
              for(i = 0; i < length;) {
         | 
| 372 | 
            +
                buf = &parser->doc.data[start + i];
         | 
| 373 | 
            +
                nextlf = memchr(buf, '\n', length - i);
         | 
| 374 | 
            +
                if(nextlf) {
         | 
| 369 375 | 
             
                  parser->doc.column_number = 0;
         | 
| 370 376 | 
             
                  parser->doc.line_number += 1;
         | 
| 377 | 
            +
                  i += (nextlf - buf) + 1;
         | 
| 371 378 | 
             
                }
         | 
| 372 379 | 
             
                else {
         | 
| 373 | 
            -
                  parser->doc.column_number +=  | 
| 380 | 
            +
                  parser->doc.column_number += rb_enc_strlen(buf, buf + length - i, enc);
         | 
| 381 | 
            +
                  break;
         | 
| 374 382 | 
             
                }
         | 
| 375 383 | 
             
              }
         | 
| 376 384 |  | 
| @@ -383,11 +391,14 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ | |
| 383 391 | 
             
              struct token_reference_t ref = {
         | 
| 384 392 | 
             
                .type = type,
         | 
| 385 393 | 
             
                .start = tk->scan.cursor,
         | 
| 394 | 
            +
                .mb_start = tk->scan.mb_cursor,
         | 
| 386 395 | 
             
                .length = length,
         | 
| 387 396 | 
             
                .line_number = parser->doc.line_number,
         | 
| 388 397 | 
             
                .column_number = parser->doc.column_number,
         | 
| 389 398 | 
             
              };
         | 
| 390 399 | 
             
              int parse_again = 1;
         | 
| 400 | 
            +
              long unsigned int mb_strlen;
         | 
| 401 | 
            +
              rb_encoding *enc;
         | 
| 391 402 |  | 
| 392 403 | 
             
              while(parse_again) {
         | 
| 393 404 | 
             
                switch(parser->context)
         | 
| @@ -438,8 +449,10 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ | |
| 438 449 | 
             
              }
         | 
| 439 450 |  | 
| 440 451 | 
             
              if(rb_block_given_p()) {
         | 
| 452 | 
            +
                enc = rb_enc_from_index(parser->doc.enc_index);
         | 
| 453 | 
            +
                mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
         | 
| 441 454 | 
             
                rb_yield_values(5, token_type_to_symbol(type),
         | 
| 442 | 
            -
                  INT2NUM(ref. | 
| 455 | 
            +
                  INT2NUM(ref.mb_start), INT2NUM(ref.mb_start + mb_strlen),
         | 
| 443 456 | 
             
                  INT2NUM(ref.line_number), INT2NUM(ref.column_number));
         | 
| 444 457 | 
             
              }
         | 
| 445 458 |  | 
| @@ -465,6 +478,8 @@ static VALUE parser_initialize_method(VALUE self) | |
| 465 478 |  | 
| 466 479 | 
             
              parser->doc.length = 0;
         | 
| 467 480 | 
             
              parser->doc.data = NULL;
         | 
| 481 | 
            +
              parser->doc.enc_index = 0;
         | 
| 482 | 
            +
              parser->doc.mb_length = 0;
         | 
| 468 483 |  | 
| 469 484 | 
             
              parser->doc.line_number = 1;
         | 
| 470 485 | 
             
              parser->doc.column_number = 0;
         | 
| @@ -478,11 +493,17 @@ static VALUE parser_initialize_method(VALUE self) | |
| 478 493 | 
             
            static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
         | 
| 479 494 | 
             
            {
         | 
| 480 495 | 
             
              void *old = parser->doc.data;
         | 
| 496 | 
            +
              unsigned long int mb_length;
         | 
| 497 | 
            +
              char *buf;
         | 
| 498 | 
            +
              rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
         | 
| 481 499 | 
             
              REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
         | 
| 482 500 | 
             
              DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
         | 
| 483 | 
            -
                parser->doc.data, | 
| 484 | 
            -
               | 
| 501 | 
            +
                parser->doc.data, parser->doc.length + length + 1);
         | 
| 502 | 
            +
              buf = parser->doc.data + parser->doc.length;
         | 
| 503 | 
            +
              strcpy(buf, string);
         | 
| 504 | 
            +
              mb_length = rb_enc_strlen(buf, buf + length, enc);
         | 
| 485 505 | 
             
              parser->doc.length += length;
         | 
| 506 | 
            +
              parser->doc.mb_length += mb_length;
         | 
| 486 507 | 
             
              return 1;
         | 
| 487 508 | 
             
            }
         | 
| 488 509 |  | 
| @@ -490,7 +511,7 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder) | |
| 490 511 | 
             
            {
         | 
| 491 512 | 
             
              struct parser_t *parser = NULL;
         | 
| 492 513 | 
             
              char *string = NULL;
         | 
| 493 | 
            -
              long unsigned int length = 0, cursor = 0;
         | 
| 514 | 
            +
              long unsigned int length = 0, cursor = 0, mb_cursor = 0;
         | 
| 494 515 |  | 
| 495 516 | 
             
              if(NIL_P(source))
         | 
| 496 517 | 
             
                return Qnil;
         | 
| @@ -502,6 +523,15 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder) | |
| 502 523 | 
             
              length = strlen(string);
         | 
| 503 524 |  | 
| 504 525 | 
             
              cursor = parser->doc.length;
         | 
| 526 | 
            +
              mb_cursor = parser->doc.mb_length;
         | 
| 527 | 
            +
             | 
| 528 | 
            +
              if(parser->doc.data == NULL) {
         | 
| 529 | 
            +
                parser->doc.enc_index = rb_enc_get_index(source);
         | 
| 530 | 
            +
              }
         | 
| 531 | 
            +
              else if(parser->doc.enc_index != rb_enc_get_index(source)) {
         | 
| 532 | 
            +
                rb_raise(rb_eArgError, "cannot append %s string to %s document",
         | 
| 533 | 
            +
                  rb_enc_name(rb_enc_get(source)), rb_enc_name(rb_enc_from_index(parser->doc.enc_index)));
         | 
| 534 | 
            +
              }
         | 
| 505 535 |  | 
| 506 536 | 
             
              if(!parser_document_append(parser, string, length)) {
         | 
| 507 537 | 
             
                // error
         | 
| @@ -515,6 +545,8 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder) | |
| 515 545 | 
             
                parser->tk.scan.cursor = cursor;
         | 
| 516 546 | 
             
                parser->tk.scan.string = parser->doc.data;
         | 
| 517 547 | 
             
                parser->tk.scan.length = parser->doc.length;
         | 
| 548 | 
            +
                parser->tk.scan.enc_index = parser->doc.enc_index;
         | 
| 549 | 
            +
                parser->tk.scan.mb_cursor = mb_cursor;
         | 
| 518 550 |  | 
| 519 551 | 
             
                tokenizer_scan_all(&parser->tk);
         | 
| 520 552 | 
             
              }
         | 
| @@ -535,17 +567,30 @@ static VALUE parser_append_placeholder_method(VALUE self, VALUE source) | |
| 535 567 | 
             
            static VALUE parser_document_method(VALUE self)
         | 
| 536 568 | 
             
            {
         | 
| 537 569 | 
             
              struct parser_t *parser = NULL;
         | 
| 570 | 
            +
              rb_encoding *enc;
         | 
| 538 571 | 
             
              Parser_Get_Struct(self, parser);
         | 
| 539 572 | 
             
              if(!parser->doc.data)
         | 
| 540 573 | 
             
                return Qnil;
         | 
| 541 | 
            -
               | 
| 574 | 
            +
              enc = rb_enc_from_index(parser->doc.enc_index);
         | 
| 575 | 
            +
              return rb_enc_str_new(parser->doc.data, parser->doc.length, enc);
         | 
| 542 576 | 
             
            }
         | 
| 543 577 |  | 
| 544 578 | 
             
            static VALUE parser_document_length_method(VALUE self)
         | 
| 545 579 | 
             
            {
         | 
| 546 580 | 
             
              struct parser_t *parser = NULL;
         | 
| 581 | 
            +
              rb_encoding *enc;
         | 
| 582 | 
            +
              const char *buf;
         | 
| 583 | 
            +
             | 
| 547 584 | 
             
              Parser_Get_Struct(self, parser);
         | 
| 548 | 
            -
             | 
| 585 | 
            +
             | 
| 586 | 
            +
              if(parser->doc.data == NULL) {
         | 
| 587 | 
            +
                return ULONG2NUM(0);
         | 
| 588 | 
            +
              }
         | 
| 589 | 
            +
              else {
         | 
| 590 | 
            +
                buf = parser->doc.data;
         | 
| 591 | 
            +
                enc = rb_enc_from_index(parser->doc.enc_index);
         | 
| 592 | 
            +
                return ULONG2NUM(rb_enc_strlen(buf, buf + parser->doc.length, enc));
         | 
| 593 | 
            +
              }
         | 
| 549 594 | 
             
            }
         | 
| 550 595 |  | 
| 551 596 | 
             
            static VALUE parser_context_method(VALUE self)
         | 
| @@ -588,9 +633,10 @@ static VALUE parser_context_method(VALUE self) | |
| 588 633 |  | 
| 589 634 | 
             
            static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
         | 
| 590 635 | 
             
            {
         | 
| 636 | 
            +
              rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
         | 
| 591 637 | 
             
              if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
         | 
| 592 638 | 
             
                return Qnil;
         | 
| 593 | 
            -
              return  | 
| 639 | 
            +
              return rb_enc_str_new(parser->doc.data+ref->start, ref->length, enc);
         | 
| 594 640 | 
             
            }
         | 
| 595 641 |  | 
| 596 642 | 
             
            static VALUE parser_tag_name_method(VALUE self)
         | 
| @@ -665,29 +711,6 @@ static VALUE parser_rawtext_text_method(VALUE self) | |
| 665 711 | 
             
              return ref_to_str(parser, &parser->rawtext.text);
         | 
| 666 712 | 
             
            }
         | 
| 667 713 |  | 
| 668 | 
            -
            static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
         | 
| 669 | 
            -
            {
         | 
| 670 | 
            -
              struct parser_t *parser = NULL;
         | 
| 671 | 
            -
              unsigned long int start, end;
         | 
| 672 | 
            -
              struct token_reference_t ref;
         | 
| 673 | 
            -
             | 
| 674 | 
            -
              Parser_Get_Struct(self, parser);
         | 
| 675 | 
            -
             | 
| 676 | 
            -
              start = NUM2ULONG(start_p);
         | 
| 677 | 
            -
              end = NUM2ULONG(end_p);
         | 
| 678 | 
            -
              if(end < start) {
         | 
| 679 | 
            -
                rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
         | 
| 680 | 
            -
              }
         | 
| 681 | 
            -
              if(end > parser->doc.length) {
         | 
| 682 | 
            -
                rb_raise(rb_eArgError, "'end' argument not in range of document");
         | 
| 683 | 
            -
              }
         | 
| 684 | 
            -
             | 
| 685 | 
            -
              ref.type = TOKEN_TEXT; // anything not NONE
         | 
| 686 | 
            -
              ref.start = start;
         | 
| 687 | 
            -
              ref.length = end - start;
         | 
| 688 | 
            -
              return ref_to_str(parser, &ref);
         | 
| 689 | 
            -
            }
         | 
| 690 | 
            -
             | 
| 691 714 | 
             
            static VALUE parser_errors_count_method(VALUE self)
         | 
| 692 715 | 
             
            {
         | 
| 693 716 | 
             
              struct parser_t *parser = NULL;
         | 
| @@ -749,7 +772,6 @@ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer) | |
| 749 772 | 
             
              rb_define_method(cParser, "column_number", parser_column_number_method, 0);
         | 
| 750 773 | 
             
              rb_define_method(cParser, "parse", parser_parse_method, 1);
         | 
| 751 774 | 
             
              rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
         | 
| 752 | 
            -
              rb_define_method(cParser, "extract", parser_extract_method, 2);
         | 
| 753 775 | 
             
              rb_define_method(cParser, "context", parser_context_method, 0);
         | 
| 754 776 | 
             
              rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
         | 
| 755 777 | 
             
              rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
         | 
    
        data/ext/html_tokenizer_ext/parser.h
    CHANGED
    
    | @@ -28,11 +28,15 @@ struct parser_document_t { | |
| 28 28 | 
             
              char *data;
         | 
| 29 29 | 
             
              long unsigned int line_number;
         | 
| 30 30 | 
             
              long unsigned int column_number;
         | 
| 31 | 
            +
             | 
| 32 | 
            +
              int enc_index;
         | 
| 33 | 
            +
              long unsigned int mb_length;
         | 
| 31 34 | 
             
            };
         | 
| 32 35 |  | 
| 33 36 | 
             
            struct token_reference_t {
         | 
| 34 37 | 
             
              enum token_type type;
         | 
| 35 38 | 
             
              long unsigned int start;
         | 
| 39 | 
            +
              long unsigned int mb_start;
         | 
| 36 40 | 
             
              long unsigned int length;
         | 
| 37 41 | 
             
              long unsigned int line_number;
         | 
| 38 42 | 
             
              long unsigned int column_number;
         | 
    
        data/ext/html_tokenizer_ext/tokenizer.c
    CHANGED
    
    | @@ -1,4 +1,5 @@ | |
| 1 1 | 
             
            #include <ruby.h>
         | 
| 2 | 
            +
            #include <ruby/encoding.h>
         | 
| 2 3 | 
             
            #include "html_tokenizer.h"
         | 
| 3 4 | 
             
            #include "tokenizer.h"
         | 
| 4 5 |  | 
| @@ -60,6 +61,8 @@ void tokenizer_init(struct tokenizer_t *tk) | |
| 60 61 | 
             
              tk->scan.string = NULL;
         | 
| 61 62 | 
             
              tk->scan.cursor = 0;
         | 
| 62 63 | 
             
              tk->scan.length = 0;
         | 
| 64 | 
            +
              tk->scan.mb_cursor = 0;
         | 
| 65 | 
            +
              tk->scan.enc_index = 0;
         | 
| 63 66 |  | 
| 64 67 | 
             
              tk->attribute_value_start = 0;
         | 
| 65 68 | 
             
              tk->found_attribute = 0;
         | 
| @@ -115,17 +118,27 @@ VALUE token_type_to_symbol(enum token_type type) | |
| 115 118 | 
             
              return Qnil;
         | 
| 116 119 | 
             
            }
         | 
| 117 120 |  | 
| 121 | 
            +
            static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
         | 
| 122 | 
            +
            {
         | 
| 123 | 
            +
              rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
         | 
| 124 | 
            +
              const char *buf = tk->scan.string + tk->scan.cursor;
         | 
| 125 | 
            +
              return rb_enc_strlen(buf, buf + length, enc);
         | 
| 126 | 
            +
            }
         | 
| 127 | 
            +
             | 
| 118 128 | 
             
            static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
         | 
| 119 129 | 
             
            {
         | 
| 130 | 
            +
              long unsigned int mb_length = tokenizer_mblength(tk, length);
         | 
| 120 131 | 
             
              tk->last_token = type;
         | 
| 121 | 
            -
              rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan. | 
| 132 | 
            +
              rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
         | 
| 122 133 | 
             
            }
         | 
| 123 134 |  | 
| 124 135 | 
             
            static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
         | 
| 125 136 | 
             
            {
         | 
| 137 | 
            +
              long unsigned int mb_length = tokenizer_mblength(tk, length);
         | 
| 126 138 | 
             
              if(tk->f_callback)
         | 
| 127 139 | 
             
                tk->f_callback(tk, type, length, tk->callback_data);
         | 
| 128 140 | 
             
              tk->scan.cursor += length;
         | 
| 141 | 
            +
              tk->scan.mb_cursor += mb_length;
         | 
| 129 142 | 
             
            }
         | 
| 130 143 |  | 
| 131 144 | 
             
            static VALUE tokenizer_initialize_method(VALUE self)
         | 
| @@ -657,6 +670,8 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source) | |
| 657 670 | 
             
              c_source = StringValueCStr(source);
         | 
| 658 671 | 
             
              tk->scan.cursor = 0;
         | 
| 659 672 | 
             
              tk->scan.length = strlen(c_source);
         | 
| 673 | 
            +
              tk->scan.enc_index = rb_enc_get_index(source);
         | 
| 674 | 
            +
              tk->scan.mb_cursor = 0;
         | 
| 660 675 |  | 
| 661 676 | 
             
              old = tk->scan.string;
         | 
| 662 677 | 
             
              REALLOC_N(tk->scan.string, char, tk->scan.length+1);
         | 
    
        data/html_tokenizer.gemspec
    CHANGED
    
    
    
        data/test/unit/parser_test.rb
    CHANGED
    
    | @@ -431,29 +431,6 @@ class HtmlTokenizer::ParserTest < Minitest::Test | |
| 431 431 | 
             
                  tokens << token
         | 
| 432 432 | 
             
                end
         | 
| 433 433 | 
             
                assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
         | 
| 434 | 
            -
                assert_equal "bar\n", @parser.extract(34, 38)
         | 
| 435 | 
            -
              end
         | 
| 436 | 
            -
             | 
| 437 | 
            -
              def test_extract_method
         | 
| 438 | 
            -
                parse("abcdefg")
         | 
| 439 | 
            -
                assert_equal "a", @parser.extract(0, 1)
         | 
| 440 | 
            -
                assert_equal "cd", @parser.extract(2, 4)
         | 
| 441 | 
            -
              end
         | 
| 442 | 
            -
             | 
| 443 | 
            -
              def test_extract_method_raises_argument_error_end_past_length
         | 
| 444 | 
            -
                parse("abcdefg")
         | 
| 445 | 
            -
                e = assert_raises(ArgumentError) do
         | 
| 446 | 
            -
                  @parser.extract(0, 32)
         | 
| 447 | 
            -
                end
         | 
| 448 | 
            -
                assert_equal "'end' argument not in range of document", e.message
         | 
| 449 | 
            -
              end
         | 
| 450 | 
            -
             | 
| 451 | 
            -
              def test_extract_method_raises_argument_error_end_less_than_start
         | 
| 452 | 
            -
                parse("abcdefg")
         | 
| 453 | 
            -
                e = assert_raises(ArgumentError) do
         | 
| 454 | 
            -
                  @parser.extract(1, 0)
         | 
| 455 | 
            -
                end
         | 
| 456 | 
            -
                assert_equal "'end' must be greater or equal than 'start'", e.message
         | 
| 457 434 | 
             
              end
         | 
| 458 435 |  | 
| 459 436 | 
             
              def test_solidus_or_tag_name_error
         | 
| @@ -534,6 +511,29 @@ class HtmlTokenizer::ParserTest < Minitest::Test | |
| 534 511 | 
             
                assert_equal 11, @parser.errors.first.column
         | 
| 535 512 | 
             
              end
         | 
| 536 513 |  | 
| 514 | 
            +
              def test_attribute_with_mutlibyte_characters
         | 
| 515 | 
            +
                data = ["<div title", "='your store’s'>"]
         | 
| 516 | 
            +
                tokens = []
         | 
| 517 | 
            +
                parse(*data) { |name, start, stop| tokens << [name, start, stop, data.join[start...stop]] }
         | 
| 518 | 
            +
                assert_equal "div", @parser.tag_name
         | 
| 519 | 
            +
                assert_equal "title", @parser.attribute_name
         | 
| 520 | 
            +
                assert_equal "your store’s", @parser.attribute_value
         | 
| 521 | 
            +
                assert_equal data.join, @parser.document
         | 
| 522 | 
            +
                assert_equal data.join.size, @parser.document_length
         | 
| 523 | 
            +
                assert_equal data.join.size, @parser.column_number
         | 
| 524 | 
            +
                assert_equal [
         | 
| 525 | 
            +
                  [:tag_start, 0, 1, "<"],
         | 
| 526 | 
            +
                  [:tag_name, 1, 4, "div"],
         | 
| 527 | 
            +
                  [:whitespace, 4, 5, " "],
         | 
| 528 | 
            +
                  [:attribute_name, 5, 10, "title"],
         | 
| 529 | 
            +
                  [:equal, 10, 11, "="],
         | 
| 530 | 
            +
                  [:attribute_quoted_value_start, 11, 12, "'"],
         | 
| 531 | 
            +
                  [:attribute_quoted_value, 12, 24, "your store’s"],
         | 
| 532 | 
            +
                  [:attribute_quoted_value_end, 24, 25, "'"],
         | 
| 533 | 
            +
                  [:tag_end, 25, 26, ">"],
         | 
| 534 | 
            +
                ], tokens
         | 
| 535 | 
            +
              end
         | 
| 536 | 
            +
             | 
| 537 537 | 
             
              def test_valid_syntaxes
         | 
| 538 538 | 
             
                parse(
         | 
| 539 539 | 
             
                  '<div>',
         | 
    
        data/test/unit/tokenizer_test.rb
    CHANGED
    
    | @@ -324,13 +324,34 @@ class HtmlTokenizer::TokenizerTest < Minitest::Test | |
| 324 324 | 
             
                ], result
         | 
| 325 325 | 
             
              end
         | 
| 326 326 |  | 
| 327 | 
            +
              def test_html_with_mutlibyte_characters
         | 
| 328 | 
            +
                data = "<div title='your store’s'>foo</div>"
         | 
| 329 | 
            +
                result = tokenize(data)
         | 
| 330 | 
            +
                assert_equal [
         | 
| 331 | 
            +
                  [:tag_start, "<"],
         | 
| 332 | 
            +
                  [:tag_name, "div"],
         | 
| 333 | 
            +
                  [:whitespace, " "],
         | 
| 334 | 
            +
                  [:attribute_name, "title"],
         | 
| 335 | 
            +
                  [:equal, "="],
         | 
| 336 | 
            +
                  [:attribute_quoted_value_start, "'"],
         | 
| 337 | 
            +
                  [:attribute_quoted_value, "your store’s"],
         | 
| 338 | 
            +
                  [:attribute_quoted_value_end, "'"],
         | 
| 339 | 
            +
                  [:tag_end, ">"],
         | 
| 340 | 
            +
                  [:text, "foo"],
         | 
| 341 | 
            +
                  [:tag_start, "<"],
         | 
| 342 | 
            +
                  [:solidus, "/"],
         | 
| 343 | 
            +
                  [:tag_name, "div"],
         | 
| 344 | 
            +
                  [:tag_end, ">"],
         | 
| 345 | 
            +
                ], result
         | 
| 346 | 
            +
              end
         | 
| 347 | 
            +
             | 
| 327 348 | 
             
              private
         | 
| 328 349 |  | 
| 329 350 | 
             
              def tokenize(*parts)
         | 
| 330 351 | 
             
                tokens = []
         | 
| 331 352 | 
             
                @tokenizer = HtmlTokenizer::Tokenizer.new
         | 
| 332 353 | 
             
                parts.each do |part|
         | 
| 333 | 
            -
                  @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start | 
| 354 | 
            +
                  @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start...stop]] }
         | 
| 334 355 | 
             
                end
         | 
| 335 356 | 
             
                tokens
         | 
| 336 357 | 
             
              end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,55 +1,55 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: html_tokenizer
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.0. | 
| 4 | 
            +
              version: 0.0.2
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Francois Chagnon
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2017- | 
| 11 | 
            +
            date: 2017-11-21 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: rake
         | 
| 15 15 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 16 16 | 
             
                requirements:
         | 
| 17 | 
            -
                - - ~>
         | 
| 17 | 
            +
                - - "~>"
         | 
| 18 18 | 
             
                  - !ruby/object:Gem::Version
         | 
| 19 19 | 
             
                    version: '0'
         | 
| 20 20 | 
             
              type: :development
         | 
| 21 21 | 
             
              prerelease: false
         | 
| 22 22 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 23 23 | 
             
                requirements:
         | 
| 24 | 
            -
                - - ~>
         | 
| 24 | 
            +
                - - "~>"
         | 
| 25 25 | 
             
                  - !ruby/object:Gem::Version
         | 
| 26 26 | 
             
                    version: '0'
         | 
| 27 27 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 28 28 | 
             
              name: rake-compiler
         | 
| 29 29 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 30 30 | 
             
                requirements:
         | 
| 31 | 
            -
                - - ~>
         | 
| 31 | 
            +
                - - "~>"
         | 
| 32 32 | 
             
                  - !ruby/object:Gem::Version
         | 
| 33 33 | 
             
                    version: '0'
         | 
| 34 34 | 
             
              type: :development
         | 
| 35 35 | 
             
              prerelease: false
         | 
| 36 36 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 37 37 | 
             
                requirements:
         | 
| 38 | 
            -
                - - ~>
         | 
| 38 | 
            +
                - - "~>"
         | 
| 39 39 | 
             
                  - !ruby/object:Gem::Version
         | 
| 40 40 | 
             
                    version: '0'
         | 
| 41 41 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 42 42 | 
             
              name: minitest
         | 
| 43 43 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 44 44 | 
             
                requirements:
         | 
| 45 | 
            -
                - - ~>
         | 
| 45 | 
            +
                - - "~>"
         | 
| 46 46 | 
             
                  - !ruby/object:Gem::Version
         | 
| 47 47 | 
             
                    version: '0'
         | 
| 48 48 | 
             
              type: :development
         | 
| 49 49 | 
             
              prerelease: false
         | 
| 50 50 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 51 51 | 
             
                requirements:
         | 
| 52 | 
            -
                - - ~>
         | 
| 52 | 
            +
                - - "~>"
         | 
| 53 53 | 
             
                  - !ruby/object:Gem::Version
         | 
| 54 54 | 
             
                    version: '0'
         | 
| 55 55 | 
             
            description: 
         | 
| @@ -60,8 +60,8 @@ extensions: | |
| 60 60 | 
             
            - ext/html_tokenizer_ext/extconf.rb
         | 
| 61 61 | 
             
            extra_rdoc_files: []
         | 
| 62 62 | 
             
            files:
         | 
| 63 | 
            -
            - .autotest
         | 
| 64 | 
            -
            - .gitignore
         | 
| 63 | 
            +
            - ".autotest"
         | 
| 64 | 
            +
            - ".gitignore"
         | 
| 65 65 | 
             
            - Gemfile
         | 
| 66 66 | 
             
            - Gemfile.lock
         | 
| 67 67 | 
             
            - LICENSE
         | 
| @@ -90,17 +90,17 @@ require_paths: | |
| 90 90 | 
             
            - ext
         | 
| 91 91 | 
             
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 92 92 | 
             
              requirements:
         | 
| 93 | 
            -
              - -  | 
| 93 | 
            +
              - - ">="
         | 
| 94 94 | 
             
                - !ruby/object:Gem::Version
         | 
| 95 95 | 
             
                  version: '0'
         | 
| 96 96 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 97 97 | 
             
              requirements:
         | 
| 98 | 
            -
              - -  | 
| 98 | 
            +
              - - ">="
         | 
| 99 99 | 
             
                - !ruby/object:Gem::Version
         | 
| 100 100 | 
             
                  version: '0'
         | 
| 101 101 | 
             
            requirements: []
         | 
| 102 102 | 
             
            rubyforge_project: 
         | 
| 103 | 
            -
            rubygems_version: 2. | 
| 103 | 
            +
            rubygems_version: 2.6.8
         | 
| 104 104 | 
             
            signing_key: 
         | 
| 105 105 | 
             
            specification_version: 4
         | 
| 106 106 | 
             
            summary: HTML Tokenizer
         |