html_tokenizer 0.0.1 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a3d58539284af566692b81cc4633af1137baabea
4
- data.tar.gz: 1877010598cbadadb27212eae39346769fa2afde
2
+ SHA256:
3
+ metadata.gz: 7f63f5699a8e9dc129392fa0d554196d9c2322c43f19cd21b353194b90d1c0f1
4
+ data.tar.gz: db308d4bb26d5181da91f9353e29d12d7aa822f02d1f0214959663516891781b
5
5
  SHA512:
6
- metadata.gz: 9d6e46dfd48e7bb4967cfa13e90bdd213331b6dd38f9a57739ac1317f4b8147a9f11c5a39001ebf1c36554350ec444eaccc19dfe04040dadf2fc71d92435d5d5
7
- data.tar.gz: f8fad88c25ff9404d710d4609ffc89fe80b08a71ba864cdca91a50cb9c5c98844befb75c43f5d4c6e0c56641fba57b4fda35ddf1c485abdcaf6bcd121bb4b0be
6
+ metadata.gz: 7f70e313d9206393e094b38569584f2f5e95bf3cb8abbd840fd063652da7d66b4c4c3a29e55b900f8da4f3c672d312dedb3895cebc5727292287e60b41b3049d
7
+ data.tar.gz: 9999d3a9e5c51ac426cb33551f14dcba980b7ee45543c000422ea7c80fe6884b34297790f977f53b95d6d128a35d49bcfa052b8b005f52d33d1f1a675d599ec8
data/.gitignore CHANGED
@@ -30,6 +30,7 @@ tmp/
30
30
  *.i*86
31
31
  *.x86_64
32
32
  *.hex
33
+ *.gem
33
34
 
34
35
  # Debug files
35
36
  *.dSYM/
data/Gemfile.lock CHANGED
@@ -1,13 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html_tokenizer (0.0.1)
4
+ html_tokenizer (0.0.7)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  minitest (5.9.0)
10
- rake (11.1.2)
10
+ rake (12.3.0)
11
11
  rake-compiler (0.9.9)
12
12
  rake
13
13
 
@@ -21,4 +21,4 @@ DEPENDENCIES
21
21
  rake-compiler
22
22
 
23
23
  BUNDLED WITH
24
- 1.12.3
24
+ 1.16.1
@@ -1,6 +1,12 @@
1
1
  require 'mkmf'
2
2
 
3
3
  $CXXFLAGS += " -std=c++11 "
4
- $CXXFLAGS += " -g -Og -ggdb "
4
+ $CXXFLAGS += " -g -O1 -ggdb "
5
+ $CFLAGS += " -g -O1 -ggdb "
6
+
7
+ if ENV['DEBUG']
8
+ $CXXFLAGS += " -DDEBUG "
9
+ $CFLAGS += " -DDEBUG "
10
+ end
5
11
 
6
12
  create_makefile('html_tokenizer_ext')
@@ -1,4 +1,5 @@
1
1
  #include <ruby.h>
2
+ #include <ruby/encoding.h>
2
3
  #include "html_tokenizer.h"
3
4
  #include "parser.h"
4
5
 
@@ -13,6 +14,7 @@ static void parser_free(void *ptr)
13
14
  size_t i;
14
15
 
15
16
  if(parser) {
17
+ tokenizer_free_members(&parser->tk);
16
18
  if(parser->doc.data) {
17
19
  DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
18
20
  xfree(parser->doc.data);
@@ -65,6 +67,7 @@ static inline void parser_append_ref(struct token_reference_t *dest, struct toke
65
67
  if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
66
68
  dest->type = src->type;
67
69
  dest->start = src->start;
70
+ dest->mb_start = src->mb_start;
68
71
  dest->length = src->length;
69
72
  dest->line_number = src->line_number;
70
73
  dest->column_number = src->column_number;
@@ -79,6 +82,8 @@ static void parser_add_error(struct parser_t *parser, const char *message)
79
82
  {
80
83
  REALLOC_N(parser->errors, struct parser_document_error_t, parser->errors_count + 1);
81
84
  parser->errors[parser->errors_count].message = strdup(message);
85
+ parser->errors[parser->errors_count].pos = parser->tk.scan.cursor;
86
+ parser->errors[parser->errors_count].mb_pos = parser->tk.scan.mb_cursor;
82
87
  parser->errors[parser->errors_count].line_number = parser->doc.line_number;
83
88
  parser->errors[parser->errors_count].column_number = parser->doc.column_number;
84
89
  parser->errors_count += 1;
@@ -362,15 +367,21 @@ static inline int rawtext_context(struct parser_t *parser)
362
367
 
363
368
  static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
364
369
  {
370
+ rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
365
371
  long unsigned int i;
372
+ const char *buf, *nextlf;
366
373
 
367
- for(i = start;i < (start + length); i++) {
368
- if(parser->doc.data[i] == '\n') {
374
+ for(i = 0; i < length;) {
375
+ buf = &parser->doc.data[start + i];
376
+ nextlf = memchr(buf, '\n', length - i);
377
+ if(nextlf) {
369
378
  parser->doc.column_number = 0;
370
379
  parser->doc.line_number += 1;
380
+ i += (nextlf - buf) + 1;
371
381
  }
372
382
  else {
373
- parser->doc.column_number += 1;
383
+ parser->doc.column_number += rb_enc_strlen(buf, buf + length - i, enc);
384
+ break;
374
385
  }
375
386
  }
376
387
 
@@ -383,11 +394,14 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
383
394
  struct token_reference_t ref = {
384
395
  .type = type,
385
396
  .start = tk->scan.cursor,
397
+ .mb_start = tk->scan.mb_cursor,
386
398
  .length = length,
387
399
  .line_number = parser->doc.line_number,
388
400
  .column_number = parser->doc.column_number,
389
401
  };
390
402
  int parse_again = 1;
403
+ long unsigned int mb_strlen;
404
+ rb_encoding *enc;
391
405
 
392
406
  while(parse_again) {
393
407
  switch(parser->context)
@@ -438,8 +452,10 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
438
452
  }
439
453
 
440
454
  if(rb_block_given_p()) {
455
+ enc = rb_enc_from_index(parser->doc.enc_index);
456
+ mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
441
457
  rb_yield_values(5, token_type_to_symbol(type),
442
- INT2NUM(ref.start), INT2NUM(ref.start + ref.length),
458
+ INT2NUM(ref.mb_start), INT2NUM(ref.mb_start + mb_strlen),
443
459
  INT2NUM(ref.line_number), INT2NUM(ref.column_number));
444
460
  }
445
461
 
@@ -465,6 +481,8 @@ static VALUE parser_initialize_method(VALUE self)
465
481
 
466
482
  parser->doc.length = 0;
467
483
  parser->doc.data = NULL;
484
+ parser->doc.enc_index = 0;
485
+ parser->doc.mb_length = 0;
468
486
 
469
487
  parser->doc.line_number = 1;
470
488
  parser->doc.column_number = 0;
@@ -478,11 +496,17 @@ static VALUE parser_initialize_method(VALUE self)
478
496
  static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
479
497
  {
480
498
  void *old = parser->doc.data;
499
+ unsigned long int mb_length;
500
+ char *buf;
501
+ rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
481
502
  REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
482
503
  DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
483
- parser->doc.data, parser->doc.length + length + 1);
484
- strcpy(parser->doc.data+parser->doc.length, string);
504
+ parser->doc.data, parser->doc.length + length + 1);
505
+ buf = parser->doc.data + parser->doc.length;
506
+ strcpy(buf, string);
507
+ mb_length = rb_enc_strlen(buf, buf + length, enc);
485
508
  parser->doc.length += length;
509
+ parser->doc.mb_length += mb_length;
486
510
  return 1;
487
511
  }
488
512
 
@@ -490,7 +514,7 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
490
514
  {
491
515
  struct parser_t *parser = NULL;
492
516
  char *string = NULL;
493
- long unsigned int length = 0, cursor = 0;
517
+ long unsigned int length = 0, cursor = 0, mb_cursor = 0;
494
518
 
495
519
  if(NIL_P(source))
496
520
  return Qnil;
@@ -502,6 +526,15 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
502
526
  length = strlen(string);
503
527
 
504
528
  cursor = parser->doc.length;
529
+ mb_cursor = parser->doc.mb_length;
530
+
531
+ if(parser->doc.data == NULL) {
532
+ parser->doc.enc_index = rb_enc_get_index(source);
533
+ }
534
+ else if(parser->doc.enc_index != rb_enc_get_index(source)) {
535
+ rb_raise(rb_eArgError, "cannot append %s string to %s document",
536
+ rb_enc_name(rb_enc_get(source)), rb_enc_name(rb_enc_from_index(parser->doc.enc_index)));
537
+ }
505
538
 
506
539
  if(!parser_document_append(parser, string, length)) {
507
540
  // error
@@ -513,10 +546,12 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
513
546
  }
514
547
  else {
515
548
  parser->tk.scan.cursor = cursor;
516
- parser->tk.scan.string = parser->doc.data;
517
- parser->tk.scan.length = parser->doc.length;
549
+ tokenizer_set_scan_string(&parser->tk, parser->doc.data, parser->doc.length);
550
+ parser->tk.scan.enc_index = parser->doc.enc_index;
551
+ parser->tk.scan.mb_cursor = mb_cursor;
518
552
 
519
553
  tokenizer_scan_all(&parser->tk);
554
+ tokenizer_free_scan_string(&parser->tk);
520
555
  }
521
556
 
522
557
  return Qtrue;
@@ -535,17 +570,30 @@ static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
535
570
  static VALUE parser_document_method(VALUE self)
536
571
  {
537
572
  struct parser_t *parser = NULL;
573
+ rb_encoding *enc;
538
574
  Parser_Get_Struct(self, parser);
539
575
  if(!parser->doc.data)
540
576
  return Qnil;
541
- return rb_str_new(parser->doc.data, parser->doc.length);
577
+ enc = rb_enc_from_index(parser->doc.enc_index);
578
+ return rb_enc_str_new(parser->doc.data, parser->doc.length, enc);
542
579
  }
543
580
 
544
581
  static VALUE parser_document_length_method(VALUE self)
545
582
  {
546
583
  struct parser_t *parser = NULL;
584
+ rb_encoding *enc;
585
+ const char *buf;
586
+
547
587
  Parser_Get_Struct(self, parser);
548
- return ULONG2NUM(parser->doc.length);
588
+
589
+ if(parser->doc.data == NULL) {
590
+ return ULONG2NUM(0);
591
+ }
592
+ else {
593
+ buf = parser->doc.data;
594
+ enc = rb_enc_from_index(parser->doc.enc_index);
595
+ return ULONG2NUM(rb_enc_strlen(buf, buf + parser->doc.length, enc));
596
+ }
549
597
  }
550
598
 
551
599
  static VALUE parser_context_method(VALUE self)
@@ -588,9 +636,10 @@ static VALUE parser_context_method(VALUE self)
588
636
 
589
637
  static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
590
638
  {
639
+ rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
591
640
  if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
592
641
  return Qnil;
593
- return rb_str_new(parser->doc.data+ref->start, ref->length);
642
+ return rb_enc_str_new(parser->doc.data+ref->start, ref->length, enc);
594
643
  }
595
644
 
596
645
  static VALUE parser_tag_name_method(VALUE self)
@@ -665,29 +714,6 @@ static VALUE parser_rawtext_text_method(VALUE self)
665
714
  return ref_to_str(parser, &parser->rawtext.text);
666
715
  }
667
716
 
668
- static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
669
- {
670
- struct parser_t *parser = NULL;
671
- unsigned long int start, end;
672
- struct token_reference_t ref;
673
-
674
- Parser_Get_Struct(self, parser);
675
-
676
- start = NUM2ULONG(start_p);
677
- end = NUM2ULONG(end_p);
678
- if(end < start) {
679
- rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
680
- }
681
- if(end > parser->doc.length) {
682
- rb_raise(rb_eArgError, "'end' argument not in range of document");
683
- }
684
-
685
- ref.type = TOKEN_TEXT; // anything not NONE
686
- ref.start = start;
687
- ref.length = end - start;
688
- return ref_to_str(parser, &ref);
689
- }
690
-
691
717
  static VALUE parser_errors_count_method(VALUE self)
692
718
  {
693
719
  struct parser_t *parser = NULL;
@@ -699,12 +725,13 @@ static VALUE create_parser_error(struct parser_document_error_t *error)
699
725
  {
700
726
  VALUE module = rb_const_get(rb_cObject, rb_intern("HtmlTokenizer"));
701
727
  VALUE klass = rb_const_get(module, rb_intern("ParserError"));
702
- VALUE args[3] = {
728
+ VALUE args[4] = {
703
729
  rb_str_new2(error->message),
730
+ ULONG2NUM(error->mb_pos),
704
731
  ULONG2NUM(error->line_number),
705
732
  ULONG2NUM(error->column_number),
706
733
  };
707
- return rb_class_new_instance(3, args, klass);
734
+ return rb_class_new_instance(4, args, klass);
708
735
  }
709
736
 
710
737
  static VALUE parser_errors_method(VALUE self, VALUE error_p)
@@ -749,7 +776,6 @@ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
749
776
  rb_define_method(cParser, "column_number", parser_column_number_method, 0);
750
777
  rb_define_method(cParser, "parse", parser_parse_method, 1);
751
778
  rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
752
- rb_define_method(cParser, "extract", parser_extract_method, 2);
753
779
  rb_define_method(cParser, "context", parser_context_method, 0);
754
780
  rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
755
781
  rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
@@ -19,6 +19,8 @@ enum parser_context {
19
19
 
20
20
  struct parser_document_error_t {
21
21
  char *message;
22
+ long unsigned int pos;
23
+ long unsigned int mb_pos;
22
24
  long unsigned int line_number;
23
25
  long unsigned int column_number;
24
26
  };
@@ -28,11 +30,15 @@ struct parser_document_t {
28
30
  char *data;
29
31
  long unsigned int line_number;
30
32
  long unsigned int column_number;
33
+
34
+ int enc_index;
35
+ long unsigned int mb_length;
31
36
  };
32
37
 
33
38
  struct token_reference_t {
34
39
  enum token_type type;
35
40
  long unsigned int start;
41
+ long unsigned int mb_start;
36
42
  long unsigned int length;
37
43
  long unsigned int line_number;
38
44
  long unsigned int column_number;
@@ -1,4 +1,5 @@
1
1
  #include <ruby.h>
2
+ #include <ruby/encoding.h>
2
3
  #include "html_tokenizer.h"
3
4
  #include "tokenizer.h"
4
5
 
@@ -11,16 +12,7 @@ static void tokenizer_free(void *ptr)
11
12
  {
12
13
  struct tokenizer_t *tk = ptr;
13
14
  if(tk) {
14
- if(tk->current_tag) {
15
- DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
16
- xfree(tk->current_tag);
17
- tk->current_tag = NULL;
18
- }
19
- if(tk->scan.string) {
20
- DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
21
- xfree(tk->scan.string);
22
- tk->scan.string = NULL;
23
- }
15
+ tokenizer_free_members(tk);
24
16
  DBG_PRINT("tk=%p xfree(tk)", tk);
25
17
  xfree(tk);
26
18
  }
@@ -60,6 +52,8 @@ void tokenizer_init(struct tokenizer_t *tk)
60
52
  tk->scan.string = NULL;
61
53
  tk->scan.cursor = 0;
62
54
  tk->scan.length = 0;
55
+ tk->scan.mb_cursor = 0;
56
+ tk->scan.enc_index = 0;
63
57
 
64
58
  tk->attribute_value_start = 0;
65
59
  tk->found_attribute = 0;
@@ -72,6 +66,21 @@ void tokenizer_init(struct tokenizer_t *tk)
72
66
  return;
73
67
  }
74
68
 
69
+ void tokenizer_free_members(struct tokenizer_t *tk)
70
+ {
71
+ if(tk->current_tag) {
72
+ DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
73
+ xfree(tk->current_tag);
74
+ tk->current_tag = NULL;
75
+ }
76
+ if(tk->scan.string) {
77
+ DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
78
+ xfree(tk->scan.string);
79
+ tk->scan.string = NULL;
80
+ }
81
+ return;
82
+ }
83
+
75
84
  VALUE token_type_to_symbol(enum token_type type)
76
85
  {
77
86
  switch(type) {
@@ -115,17 +124,27 @@ VALUE token_type_to_symbol(enum token_type type)
115
124
  return Qnil;
116
125
  }
117
126
 
127
+ static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
128
+ {
129
+ rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
130
+ const char *buf = tk->scan.string + tk->scan.cursor;
131
+ return rb_enc_strlen(buf, buf + length, enc);
132
+ }
133
+
118
134
  static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
119
135
  {
136
+ long unsigned int mb_length = tokenizer_mblength(tk, length);
120
137
  tk->last_token = type;
121
- rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.cursor), INT2NUM(tk->scan.cursor + length));
138
+ rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
122
139
  }
123
140
 
124
141
  static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
125
142
  {
143
+ long unsigned int mb_length = tokenizer_mblength(tk, length);
126
144
  if(tk->f_callback)
127
145
  tk->f_callback(tk, type, length, tk->callback_data);
128
146
  tk->scan.cursor += length;
147
+ tk->scan.mb_cursor += mb_length;
129
148
  }
130
149
 
131
150
  static VALUE tokenizer_initialize_method(VALUE self)
@@ -356,6 +375,7 @@ static int scan_open_tag(struct tokenizer_t *tk)
356
375
  else if(is_doctype(&tk->scan)) {
357
376
  tokenizer_callback(tk, TOKEN_TAG_START, 1);
358
377
  tokenizer_callback(tk, TOKEN_TAG_NAME, 8);
378
+ push_context(tk, TOKENIZER_TAG_NAME);
359
379
  return 1;
360
380
  }
361
381
  else if(is_cdata_start(&tk->scan)) {
@@ -642,11 +662,30 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
642
662
  return;
643
663
  }
644
664
 
665
+ void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
666
+ {
667
+ const char *old = tk->scan.string;
668
+ REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
669
+ DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
670
+ tk->scan.string, length + 1);
671
+ if(string && length > 0) {
672
+ strncpy(tk->scan.string, string, length);
673
+ tk->scan.string[length] = 0;
674
+ }
675
+ tk->scan.length = length;
676
+ return;
677
+ }
678
+
679
+ void tokenizer_free_scan_string(struct tokenizer_t *tk)
680
+ {
681
+ tokenizer_set_scan_string(tk, NULL, 0);
682
+ return;
683
+ }
684
+
645
685
  static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
646
686
  {
647
687
  struct tokenizer_t *tk = NULL;
648
688
  char *c_source;
649
- char *old;
650
689
 
651
690
  if(NIL_P(source))
652
691
  return Qnil;
@@ -656,19 +695,13 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
656
695
 
657
696
  c_source = StringValueCStr(source);
658
697
  tk->scan.cursor = 0;
659
- tk->scan.length = strlen(c_source);
660
-
661
- old = tk->scan.string;
662
- REALLOC_N(tk->scan.string, char, tk->scan.length+1);
663
- DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
664
- tk->scan.string, tk->scan.length+1);
665
- strncpy(tk->scan.string, c_source, tk->scan.length);
698
+ tokenizer_set_scan_string(tk, c_source, strlen(c_source));
699
+ tk->scan.enc_index = rb_enc_get_index(source);
700
+ tk->scan.mb_cursor = 0;
666
701
 
667
702
  tokenizer_scan_all(tk);
668
703
 
669
- DBG_PRINT("tk=%p xfree(tk->scan.string) 0x%p", tk, tk->scan.string);
670
- xfree(tk->scan.string);
671
- tk->scan.string = NULL;
704
+ tokenizer_free_scan_string(tk);
672
705
 
673
706
  return Qtrue;
674
707
  }
@@ -43,6 +43,9 @@ struct scan_t {
43
43
  char *string;
44
44
  long unsigned int cursor;
45
45
  long unsigned int length;
46
+
47
+ int enc_index;
48
+ long unsigned int mb_cursor;
46
49
  };
47
50
 
48
51
  struct tokenizer_t
@@ -67,6 +70,9 @@ struct tokenizer_t
67
70
 
68
71
  void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
69
72
  void tokenizer_init(struct tokenizer_t *tk);
73
+ void tokenizer_free_members(struct tokenizer_t *tk);
74
+ void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length);
75
+ void tokenizer_free_scan_string(struct tokenizer_t *tk);
70
76
  void tokenizer_scan_all(struct tokenizer_t *tk);
71
77
  VALUE token_type_to_symbol(enum token_type type);
72
78
 
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "html_tokenizer"
3
- spec.version = "0.0.1"
3
+ spec.version = "0.0.7"
4
4
  spec.summary = "HTML Tokenizer"
5
5
  spec.author = "Francois Chagnon"
6
6
 
@@ -2,9 +2,10 @@ require 'html_tokenizer_ext'
2
2
 
3
3
  module HtmlTokenizer
4
4
  class ParserError < RuntimeError
5
- attr_reader :line, :column
6
- def initialize(message, line, column)
5
+ attr_reader :position, :line, :column
6
+ def initialize(message, position, line, column)
7
7
  super(message)
8
+ @position = position
8
9
  @line = line
9
10
  @column = column
10
11
  end
@@ -431,35 +431,13 @@ class HtmlTokenizer::ParserTest < Minitest::Test
431
431
  tokens << token
432
432
  end
433
433
  assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
434
- assert_equal "bar\n", @parser.extract(34, 38)
435
- end
436
-
437
- def test_extract_method
438
- parse("abcdefg")
439
- assert_equal "a", @parser.extract(0, 1)
440
- assert_equal "cd", @parser.extract(2, 4)
441
- end
442
-
443
- def test_extract_method_raises_argument_error_end_past_length
444
- parse("abcdefg")
445
- e = assert_raises(ArgumentError) do
446
- @parser.extract(0, 32)
447
- end
448
- assert_equal "'end' argument not in range of document", e.message
449
- end
450
-
451
- def test_extract_method_raises_argument_error_end_less_than_start
452
- parse("abcdefg")
453
- e = assert_raises(ArgumentError) do
454
- @parser.extract(1, 0)
455
- end
456
- assert_equal "'end' must be greater or equal than 'start'", e.message
457
434
  end
458
435
 
459
436
  def test_solidus_or_tag_name_error
460
437
  parse('<>')
461
438
  assert_equal 1, @parser.errors_count
462
439
  assert_equal "expected '/' or tag name", @parser.errors.first.to_s
440
+ assert_equal 1, @parser.errors.first.position
463
441
  assert_equal 1, @parser.errors.first.line
464
442
  assert_equal 1, @parser.errors.first.column
465
443
  end
@@ -468,6 +446,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
468
446
  parse('< ')
469
447
  assert_equal 1, @parser.errors_count
470
448
  assert_equal "expected '/' or tag name", @parser.errors.first.to_s
449
+ assert_equal 1, @parser.errors.first.position
471
450
  assert_equal 1, @parser.errors.first.line
472
451
  assert_equal 1, @parser.errors.first.column
473
452
  end
@@ -476,6 +455,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
476
455
  parse('<foo =')
477
456
  assert_equal 1, @parser.errors_count
478
457
  assert_equal "expected whitespace, '>', attribute name or value", @parser.errors.first.to_s
458
+ assert_equal 5, @parser.errors.first.position
479
459
  assert_equal 1, @parser.errors.first.line
480
460
  assert_equal 5, @parser.errors.first.column
481
461
  end
@@ -484,6 +464,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
484
464
  parse('<foo /x')
485
465
  assert_equal 1, @parser.errors_count
486
466
  assert_equal "expected '>' after '/'", @parser.errors.first.to_s
467
+ assert_equal 6, @parser.errors.first.position
487
468
  assert_equal 1, @parser.errors.first.line
488
469
  assert_equal 6, @parser.errors.first.column
489
470
  end
@@ -492,6 +473,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
492
473
  parse('<foo / ')
493
474
  assert_equal 1, @parser.errors_count
494
475
  assert_equal "expected '>' after '/'", @parser.errors.first.to_s
476
+ assert_equal 6, @parser.errors.first.position
495
477
  assert_equal 1, @parser.errors.first.line
496
478
  assert_equal 6, @parser.errors.first.column
497
479
  end
@@ -499,29 +481,33 @@ class HtmlTokenizer::ParserTest < Minitest::Test
499
481
  def test_attribute_name_error
500
482
  parse('<foo bar~')
501
483
  assert_equal 2, @parser.errors_count
502
- assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors.first.to_s
503
- assert_equal 1, @parser.errors.first.line
504
- assert_equal 8, @parser.errors.first.column
505
484
  assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors[0].to_s
485
+ assert_equal 8, @parser.errors.first.position
506
486
  assert_equal 1, @parser.errors[0].line
507
487
  assert_equal 8, @parser.errors[0].column
488
+ assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
489
+ assert_equal 8, @parser.errors.first.position
490
+ assert_equal 1, @parser.errors[1].line
491
+ assert_equal 8, @parser.errors[1].column
508
492
  end
509
493
 
510
494
  def test_attribute_whitespace_or_equal_error
511
495
  parse('<foo bar ~')
512
496
  assert_equal 2, @parser.errors_count
513
- assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors.first.to_s
514
- assert_equal 1, @parser.errors.first.line
515
- assert_equal 9, @parser.errors.first.column
516
497
  assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors[0].to_s
517
498
  assert_equal 1, @parser.errors[0].line
518
499
  assert_equal 9, @parser.errors[0].column
500
+ assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
501
+ assert_equal 9, @parser.errors.first.position
502
+ assert_equal 1, @parser.errors[1].line
503
+ assert_equal 9, @parser.errors[1].column
519
504
  end
520
505
 
521
506
  def test_attribute_whitespace_or_equal_error_2
522
507
  parse('<foo bar = >')
523
508
  assert_equal 1, @parser.errors_count
524
509
  assert_equal "expected attribute value after '='", @parser.errors.first.to_s
510
+ assert_equal 11, @parser.errors.first.position
525
511
  assert_equal 1, @parser.errors.first.line
526
512
  assert_equal 11, @parser.errors.first.column
527
513
  end
@@ -530,10 +516,34 @@ class HtmlTokenizer::ParserTest < Minitest::Test
530
516
  parse('<foo bar=""x')
531
517
  assert_equal 1, @parser.errors_count
532
518
  assert_equal "expected space after attribute value", @parser.errors.first.to_s
519
+ assert_equal 11, @parser.errors.first.position
533
520
  assert_equal 1, @parser.errors.first.line
534
521
  assert_equal 11, @parser.errors.first.column
535
522
  end
536
523
 
524
+ def test_attribute_with_mutlibyte_characters
525
+ data = ["<div title", "='your store’s'>"]
526
+ tokens = []
527
+ parse(*data) { |name, start, stop| tokens << [name, start, stop, data.join[start...stop]] }
528
+ assert_equal "div", @parser.tag_name
529
+ assert_equal "title", @parser.attribute_name
530
+ assert_equal "your store’s", @parser.attribute_value
531
+ assert_equal data.join, @parser.document
532
+ assert_equal data.join.size, @parser.document_length
533
+ assert_equal data.join.size, @parser.column_number
534
+ assert_equal [
535
+ [:tag_start, 0, 1, "<"],
536
+ [:tag_name, 1, 4, "div"],
537
+ [:whitespace, 4, 5, " "],
538
+ [:attribute_name, 5, 10, "title"],
539
+ [:equal, 10, 11, "="],
540
+ [:attribute_quoted_value_start, 11, 12, "'"],
541
+ [:attribute_quoted_value, 12, 24, "your store’s"],
542
+ [:attribute_quoted_value_end, 24, 25, "'"],
543
+ [:tag_end, 25, 26, ">"],
544
+ ], tokens
545
+ end
546
+
537
547
  def test_valid_syntaxes
538
548
  parse(
539
549
  '<div>',
@@ -564,6 +574,15 @@ class HtmlTokenizer::ParserTest < Minitest::Test
564
574
  assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
565
575
  end
566
576
 
577
+ def test_doctype_without_space
578
+ parse('<!DOCTYPE')
579
+ assert_equal "!DOCTYPE", @parser.tag_name
580
+ parse('foo')
581
+ assert_equal "!DOCTYPEfoo", @parser.tag_name
582
+
583
+ assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
584
+ end
585
+
567
586
  private
568
587
 
569
588
  def parse(*parts, &block)
@@ -324,13 +324,34 @@ class HtmlTokenizer::TokenizerTest < Minitest::Test
324
324
  ], result
325
325
  end
326
326
 
327
+ def test_html_with_mutlibyte_characters
328
+ data = "<div title='your store’s'>foo</div>"
329
+ result = tokenize(data)
330
+ assert_equal [
331
+ [:tag_start, "<"],
332
+ [:tag_name, "div"],
333
+ [:whitespace, " "],
334
+ [:attribute_name, "title"],
335
+ [:equal, "="],
336
+ [:attribute_quoted_value_start, "'"],
337
+ [:attribute_quoted_value, "your store’s"],
338
+ [:attribute_quoted_value_end, "'"],
339
+ [:tag_end, ">"],
340
+ [:text, "foo"],
341
+ [:tag_start, "<"],
342
+ [:solidus, "/"],
343
+ [:tag_name, "div"],
344
+ [:tag_end, ">"],
345
+ ], result
346
+ end
347
+
327
348
  private
328
349
 
329
350
  def tokenize(*parts)
330
351
  tokens = []
331
352
  @tokenizer = HtmlTokenizer::Tokenizer.new
332
353
  parts.each do |part|
333
- @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start..(stop-1)]] }
354
+ @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start...stop]] }
334
355
  end
335
356
  tokens
336
357
  end
metadata CHANGED
@@ -1,55 +1,55 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francois Chagnon
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-10-26 00:00:00.000000000 Z
11
+ date: 2018-05-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake-compiler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: minitest
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ~>
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ~>
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  description:
@@ -60,8 +60,8 @@ extensions:
60
60
  - ext/html_tokenizer_ext/extconf.rb
61
61
  extra_rdoc_files: []
62
62
  files:
63
- - .autotest
64
- - .gitignore
63
+ - ".autotest"
64
+ - ".gitignore"
65
65
  - Gemfile
66
66
  - Gemfile.lock
67
67
  - LICENSE
@@ -90,17 +90,17 @@ require_paths:
90
90
  - ext
91
91
  required_ruby_version: !ruby/object:Gem::Requirement
92
92
  requirements:
93
- - - '>='
93
+ - - ">="
94
94
  - !ruby/object:Gem::Version
95
95
  version: '0'
96
96
  required_rubygems_version: !ruby/object:Gem::Requirement
97
97
  requirements:
98
- - - '>='
98
+ - - ">="
99
99
  - !ruby/object:Gem::Version
100
100
  version: '0'
101
101
  requirements: []
102
102
  rubyforge_project:
103
- rubygems_version: 2.0.14.1
103
+ rubygems_version: 2.7.6
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: HTML Tokenizer