html_tokenizer 0.0.1 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: a3d58539284af566692b81cc4633af1137baabea
4
- data.tar.gz: 1877010598cbadadb27212eae39346769fa2afde
2
+ SHA256:
3
+ metadata.gz: 7f63f5699a8e9dc129392fa0d554196d9c2322c43f19cd21b353194b90d1c0f1
4
+ data.tar.gz: db308d4bb26d5181da91f9353e29d12d7aa822f02d1f0214959663516891781b
5
5
  SHA512:
6
- metadata.gz: 9d6e46dfd48e7bb4967cfa13e90bdd213331b6dd38f9a57739ac1317f4b8147a9f11c5a39001ebf1c36554350ec444eaccc19dfe04040dadf2fc71d92435d5d5
7
- data.tar.gz: f8fad88c25ff9404d710d4609ffc89fe80b08a71ba864cdca91a50cb9c5c98844befb75c43f5d4c6e0c56641fba57b4fda35ddf1c485abdcaf6bcd121bb4b0be
6
+ metadata.gz: 7f70e313d9206393e094b38569584f2f5e95bf3cb8abbd840fd063652da7d66b4c4c3a29e55b900f8da4f3c672d312dedb3895cebc5727292287e60b41b3049d
7
+ data.tar.gz: 9999d3a9e5c51ac426cb33551f14dcba980b7ee45543c000422ea7c80fe6884b34297790f977f53b95d6d128a35d49bcfa052b8b005f52d33d1f1a675d599ec8
data/.gitignore CHANGED
@@ -30,6 +30,7 @@ tmp/
30
30
  *.i*86
31
31
  *.x86_64
32
32
  *.hex
33
+ *.gem
33
34
 
34
35
  # Debug files
35
36
  *.dSYM/
data/Gemfile.lock CHANGED
@@ -1,13 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html_tokenizer (0.0.1)
4
+ html_tokenizer (0.0.7)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  minitest (5.9.0)
10
- rake (11.1.2)
10
+ rake (12.3.0)
11
11
  rake-compiler (0.9.9)
12
12
  rake
13
13
 
@@ -21,4 +21,4 @@ DEPENDENCIES
21
21
  rake-compiler
22
22
 
23
23
  BUNDLED WITH
24
- 1.12.3
24
+ 1.16.1
@@ -1,6 +1,12 @@
1
1
  require 'mkmf'
2
2
 
3
3
  $CXXFLAGS += " -std=c++11 "
4
- $CXXFLAGS += " -g -Og -ggdb "
4
+ $CXXFLAGS += " -g -O1 -ggdb "
5
+ $CFLAGS += " -g -O1 -ggdb "
6
+
7
+ if ENV['DEBUG']
8
+ $CXXFLAGS += " -DDEBUG "
9
+ $CFLAGS += " -DDEBUG "
10
+ end
5
11
 
6
12
  create_makefile('html_tokenizer_ext')
@@ -1,4 +1,5 @@
1
1
  #include <ruby.h>
2
+ #include <ruby/encoding.h>
2
3
  #include "html_tokenizer.h"
3
4
  #include "parser.h"
4
5
 
@@ -13,6 +14,7 @@ static void parser_free(void *ptr)
13
14
  size_t i;
14
15
 
15
16
  if(parser) {
17
+ tokenizer_free_members(&parser->tk);
16
18
  if(parser->doc.data) {
17
19
  DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
18
20
  xfree(parser->doc.data);
@@ -65,6 +67,7 @@ static inline void parser_append_ref(struct token_reference_t *dest, struct toke
65
67
  if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
66
68
  dest->type = src->type;
67
69
  dest->start = src->start;
70
+ dest->mb_start = src->mb_start;
68
71
  dest->length = src->length;
69
72
  dest->line_number = src->line_number;
70
73
  dest->column_number = src->column_number;
@@ -79,6 +82,8 @@ static void parser_add_error(struct parser_t *parser, const char *message)
79
82
  {
80
83
  REALLOC_N(parser->errors, struct parser_document_error_t, parser->errors_count + 1);
81
84
  parser->errors[parser->errors_count].message = strdup(message);
85
+ parser->errors[parser->errors_count].pos = parser->tk.scan.cursor;
86
+ parser->errors[parser->errors_count].mb_pos = parser->tk.scan.mb_cursor;
82
87
  parser->errors[parser->errors_count].line_number = parser->doc.line_number;
83
88
  parser->errors[parser->errors_count].column_number = parser->doc.column_number;
84
89
  parser->errors_count += 1;
@@ -362,15 +367,21 @@ static inline int rawtext_context(struct parser_t *parser)
362
367
 
363
368
  static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
364
369
  {
370
+ rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
365
371
  long unsigned int i;
372
+ const char *buf, *nextlf;
366
373
 
367
- for(i = start;i < (start + length); i++) {
368
- if(parser->doc.data[i] == '\n') {
374
+ for(i = 0; i < length;) {
375
+ buf = &parser->doc.data[start + i];
376
+ nextlf = memchr(buf, '\n', length - i);
377
+ if(nextlf) {
369
378
  parser->doc.column_number = 0;
370
379
  parser->doc.line_number += 1;
380
+ i += (nextlf - buf) + 1;
371
381
  }
372
382
  else {
373
- parser->doc.column_number += 1;
383
+ parser->doc.column_number += rb_enc_strlen(buf, buf + length - i, enc);
384
+ break;
374
385
  }
375
386
  }
376
387
 
@@ -383,11 +394,14 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
383
394
  struct token_reference_t ref = {
384
395
  .type = type,
385
396
  .start = tk->scan.cursor,
397
+ .mb_start = tk->scan.mb_cursor,
386
398
  .length = length,
387
399
  .line_number = parser->doc.line_number,
388
400
  .column_number = parser->doc.column_number,
389
401
  };
390
402
  int parse_again = 1;
403
+ long unsigned int mb_strlen;
404
+ rb_encoding *enc;
391
405
 
392
406
  while(parse_again) {
393
407
  switch(parser->context)
@@ -438,8 +452,10 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
438
452
  }
439
453
 
440
454
  if(rb_block_given_p()) {
455
+ enc = rb_enc_from_index(parser->doc.enc_index);
456
+ mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
441
457
  rb_yield_values(5, token_type_to_symbol(type),
442
- INT2NUM(ref.start), INT2NUM(ref.start + ref.length),
458
+ INT2NUM(ref.mb_start), INT2NUM(ref.mb_start + mb_strlen),
443
459
  INT2NUM(ref.line_number), INT2NUM(ref.column_number));
444
460
  }
445
461
 
@@ -465,6 +481,8 @@ static VALUE parser_initialize_method(VALUE self)
465
481
 
466
482
  parser->doc.length = 0;
467
483
  parser->doc.data = NULL;
484
+ parser->doc.enc_index = 0;
485
+ parser->doc.mb_length = 0;
468
486
 
469
487
  parser->doc.line_number = 1;
470
488
  parser->doc.column_number = 0;
@@ -478,11 +496,17 @@ static VALUE parser_initialize_method(VALUE self)
478
496
  static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
479
497
  {
480
498
  void *old = parser->doc.data;
499
+ unsigned long int mb_length;
500
+ char *buf;
501
+ rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
481
502
  REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
482
503
  DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
483
- parser->doc.data, parser->doc.length + length + 1);
484
- strcpy(parser->doc.data+parser->doc.length, string);
504
+ parser->doc.data, parser->doc.length + length + 1);
505
+ buf = parser->doc.data + parser->doc.length;
506
+ strcpy(buf, string);
507
+ mb_length = rb_enc_strlen(buf, buf + length, enc);
485
508
  parser->doc.length += length;
509
+ parser->doc.mb_length += mb_length;
486
510
  return 1;
487
511
  }
488
512
 
@@ -490,7 +514,7 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
490
514
  {
491
515
  struct parser_t *parser = NULL;
492
516
  char *string = NULL;
493
- long unsigned int length = 0, cursor = 0;
517
+ long unsigned int length = 0, cursor = 0, mb_cursor = 0;
494
518
 
495
519
  if(NIL_P(source))
496
520
  return Qnil;
@@ -502,6 +526,15 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
502
526
  length = strlen(string);
503
527
 
504
528
  cursor = parser->doc.length;
529
+ mb_cursor = parser->doc.mb_length;
530
+
531
+ if(parser->doc.data == NULL) {
532
+ parser->doc.enc_index = rb_enc_get_index(source);
533
+ }
534
+ else if(parser->doc.enc_index != rb_enc_get_index(source)) {
535
+ rb_raise(rb_eArgError, "cannot append %s string to %s document",
536
+ rb_enc_name(rb_enc_get(source)), rb_enc_name(rb_enc_from_index(parser->doc.enc_index)));
537
+ }
505
538
 
506
539
  if(!parser_document_append(parser, string, length)) {
507
540
  // error
@@ -513,10 +546,12 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
513
546
  }
514
547
  else {
515
548
  parser->tk.scan.cursor = cursor;
516
- parser->tk.scan.string = parser->doc.data;
517
- parser->tk.scan.length = parser->doc.length;
549
+ tokenizer_set_scan_string(&parser->tk, parser->doc.data, parser->doc.length);
550
+ parser->tk.scan.enc_index = parser->doc.enc_index;
551
+ parser->tk.scan.mb_cursor = mb_cursor;
518
552
 
519
553
  tokenizer_scan_all(&parser->tk);
554
+ tokenizer_free_scan_string(&parser->tk);
520
555
  }
521
556
 
522
557
  return Qtrue;
@@ -535,17 +570,30 @@ static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
535
570
  static VALUE parser_document_method(VALUE self)
536
571
  {
537
572
  struct parser_t *parser = NULL;
573
+ rb_encoding *enc;
538
574
  Parser_Get_Struct(self, parser);
539
575
  if(!parser->doc.data)
540
576
  return Qnil;
541
- return rb_str_new(parser->doc.data, parser->doc.length);
577
+ enc = rb_enc_from_index(parser->doc.enc_index);
578
+ return rb_enc_str_new(parser->doc.data, parser->doc.length, enc);
542
579
  }
543
580
 
544
581
  static VALUE parser_document_length_method(VALUE self)
545
582
  {
546
583
  struct parser_t *parser = NULL;
584
+ rb_encoding *enc;
585
+ const char *buf;
586
+
547
587
  Parser_Get_Struct(self, parser);
548
- return ULONG2NUM(parser->doc.length);
588
+
589
+ if(parser->doc.data == NULL) {
590
+ return ULONG2NUM(0);
591
+ }
592
+ else {
593
+ buf = parser->doc.data;
594
+ enc = rb_enc_from_index(parser->doc.enc_index);
595
+ return ULONG2NUM(rb_enc_strlen(buf, buf + parser->doc.length, enc));
596
+ }
549
597
  }
550
598
 
551
599
  static VALUE parser_context_method(VALUE self)
@@ -588,9 +636,10 @@ static VALUE parser_context_method(VALUE self)
588
636
 
589
637
  static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
590
638
  {
639
+ rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
591
640
  if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
592
641
  return Qnil;
593
- return rb_str_new(parser->doc.data+ref->start, ref->length);
642
+ return rb_enc_str_new(parser->doc.data+ref->start, ref->length, enc);
594
643
  }
595
644
 
596
645
  static VALUE parser_tag_name_method(VALUE self)
@@ -665,29 +714,6 @@ static VALUE parser_rawtext_text_method(VALUE self)
665
714
  return ref_to_str(parser, &parser->rawtext.text);
666
715
  }
667
716
 
668
- static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
669
- {
670
- struct parser_t *parser = NULL;
671
- unsigned long int start, end;
672
- struct token_reference_t ref;
673
-
674
- Parser_Get_Struct(self, parser);
675
-
676
- start = NUM2ULONG(start_p);
677
- end = NUM2ULONG(end_p);
678
- if(end < start) {
679
- rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
680
- }
681
- if(end > parser->doc.length) {
682
- rb_raise(rb_eArgError, "'end' argument not in range of document");
683
- }
684
-
685
- ref.type = TOKEN_TEXT; // anything not NONE
686
- ref.start = start;
687
- ref.length = end - start;
688
- return ref_to_str(parser, &ref);
689
- }
690
-
691
717
  static VALUE parser_errors_count_method(VALUE self)
692
718
  {
693
719
  struct parser_t *parser = NULL;
@@ -699,12 +725,13 @@ static VALUE create_parser_error(struct parser_document_error_t *error)
699
725
  {
700
726
  VALUE module = rb_const_get(rb_cObject, rb_intern("HtmlTokenizer"));
701
727
  VALUE klass = rb_const_get(module, rb_intern("ParserError"));
702
- VALUE args[3] = {
728
+ VALUE args[4] = {
703
729
  rb_str_new2(error->message),
730
+ ULONG2NUM(error->mb_pos),
704
731
  ULONG2NUM(error->line_number),
705
732
  ULONG2NUM(error->column_number),
706
733
  };
707
- return rb_class_new_instance(3, args, klass);
734
+ return rb_class_new_instance(4, args, klass);
708
735
  }
709
736
 
710
737
  static VALUE parser_errors_method(VALUE self, VALUE error_p)
@@ -749,7 +776,6 @@ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
749
776
  rb_define_method(cParser, "column_number", parser_column_number_method, 0);
750
777
  rb_define_method(cParser, "parse", parser_parse_method, 1);
751
778
  rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
752
- rb_define_method(cParser, "extract", parser_extract_method, 2);
753
779
  rb_define_method(cParser, "context", parser_context_method, 0);
754
780
  rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
755
781
  rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
@@ -19,6 +19,8 @@ enum parser_context {
19
19
 
20
20
  struct parser_document_error_t {
21
21
  char *message;
22
+ long unsigned int pos;
23
+ long unsigned int mb_pos;
22
24
  long unsigned int line_number;
23
25
  long unsigned int column_number;
24
26
  };
@@ -28,11 +30,15 @@ struct parser_document_t {
28
30
  char *data;
29
31
  long unsigned int line_number;
30
32
  long unsigned int column_number;
33
+
34
+ int enc_index;
35
+ long unsigned int mb_length;
31
36
  };
32
37
 
33
38
  struct token_reference_t {
34
39
  enum token_type type;
35
40
  long unsigned int start;
41
+ long unsigned int mb_start;
36
42
  long unsigned int length;
37
43
  long unsigned int line_number;
38
44
  long unsigned int column_number;
@@ -1,4 +1,5 @@
1
1
  #include <ruby.h>
2
+ #include <ruby/encoding.h>
2
3
  #include "html_tokenizer.h"
3
4
  #include "tokenizer.h"
4
5
 
@@ -11,16 +12,7 @@ static void tokenizer_free(void *ptr)
11
12
  {
12
13
  struct tokenizer_t *tk = ptr;
13
14
  if(tk) {
14
- if(tk->current_tag) {
15
- DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
16
- xfree(tk->current_tag);
17
- tk->current_tag = NULL;
18
- }
19
- if(tk->scan.string) {
20
- DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
21
- xfree(tk->scan.string);
22
- tk->scan.string = NULL;
23
- }
15
+ tokenizer_free_members(tk);
24
16
  DBG_PRINT("tk=%p xfree(tk)", tk);
25
17
  xfree(tk);
26
18
  }
@@ -60,6 +52,8 @@ void tokenizer_init(struct tokenizer_t *tk)
60
52
  tk->scan.string = NULL;
61
53
  tk->scan.cursor = 0;
62
54
  tk->scan.length = 0;
55
+ tk->scan.mb_cursor = 0;
56
+ tk->scan.enc_index = 0;
63
57
 
64
58
  tk->attribute_value_start = 0;
65
59
  tk->found_attribute = 0;
@@ -72,6 +66,21 @@ void tokenizer_init(struct tokenizer_t *tk)
72
66
  return;
73
67
  }
74
68
 
69
+ void tokenizer_free_members(struct tokenizer_t *tk)
70
+ {
71
+ if(tk->current_tag) {
72
+ DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
73
+ xfree(tk->current_tag);
74
+ tk->current_tag = NULL;
75
+ }
76
+ if(tk->scan.string) {
77
+ DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
78
+ xfree(tk->scan.string);
79
+ tk->scan.string = NULL;
80
+ }
81
+ return;
82
+ }
83
+
75
84
  VALUE token_type_to_symbol(enum token_type type)
76
85
  {
77
86
  switch(type) {
@@ -115,17 +124,27 @@ VALUE token_type_to_symbol(enum token_type type)
115
124
  return Qnil;
116
125
  }
117
126
 
127
+ static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
128
+ {
129
+ rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
130
+ const char *buf = tk->scan.string + tk->scan.cursor;
131
+ return rb_enc_strlen(buf, buf + length, enc);
132
+ }
133
+
118
134
  static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
119
135
  {
136
+ long unsigned int mb_length = tokenizer_mblength(tk, length);
120
137
  tk->last_token = type;
121
- rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.cursor), INT2NUM(tk->scan.cursor + length));
138
+ rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
122
139
  }
123
140
 
124
141
  static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
125
142
  {
143
+ long unsigned int mb_length = tokenizer_mblength(tk, length);
126
144
  if(tk->f_callback)
127
145
  tk->f_callback(tk, type, length, tk->callback_data);
128
146
  tk->scan.cursor += length;
147
+ tk->scan.mb_cursor += mb_length;
129
148
  }
130
149
 
131
150
  static VALUE tokenizer_initialize_method(VALUE self)
@@ -356,6 +375,7 @@ static int scan_open_tag(struct tokenizer_t *tk)
356
375
  else if(is_doctype(&tk->scan)) {
357
376
  tokenizer_callback(tk, TOKEN_TAG_START, 1);
358
377
  tokenizer_callback(tk, TOKEN_TAG_NAME, 8);
378
+ push_context(tk, TOKENIZER_TAG_NAME);
359
379
  return 1;
360
380
  }
361
381
  else if(is_cdata_start(&tk->scan)) {
@@ -642,11 +662,30 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
642
662
  return;
643
663
  }
644
664
 
665
+ void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
666
+ {
667
+ const char *old = tk->scan.string;
668
+ REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
669
+ DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
670
+ tk->scan.string, length + 1);
671
+ if(string && length > 0) {
672
+ strncpy(tk->scan.string, string, length);
673
+ tk->scan.string[length] = 0;
674
+ }
675
+ tk->scan.length = length;
676
+ return;
677
+ }
678
+
679
+ void tokenizer_free_scan_string(struct tokenizer_t *tk)
680
+ {
681
+ tokenizer_set_scan_string(tk, NULL, 0);
682
+ return;
683
+ }
684
+
645
685
  static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
646
686
  {
647
687
  struct tokenizer_t *tk = NULL;
648
688
  char *c_source;
649
- char *old;
650
689
 
651
690
  if(NIL_P(source))
652
691
  return Qnil;
@@ -656,19 +695,13 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
656
695
 
657
696
  c_source = StringValueCStr(source);
658
697
  tk->scan.cursor = 0;
659
- tk->scan.length = strlen(c_source);
660
-
661
- old = tk->scan.string;
662
- REALLOC_N(tk->scan.string, char, tk->scan.length+1);
663
- DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
664
- tk->scan.string, tk->scan.length+1);
665
- strncpy(tk->scan.string, c_source, tk->scan.length);
698
+ tokenizer_set_scan_string(tk, c_source, strlen(c_source));
699
+ tk->scan.enc_index = rb_enc_get_index(source);
700
+ tk->scan.mb_cursor = 0;
666
701
 
667
702
  tokenizer_scan_all(tk);
668
703
 
669
- DBG_PRINT("tk=%p xfree(tk->scan.string) 0x%p", tk, tk->scan.string);
670
- xfree(tk->scan.string);
671
- tk->scan.string = NULL;
704
+ tokenizer_free_scan_string(tk);
672
705
 
673
706
  return Qtrue;
674
707
  }
@@ -43,6 +43,9 @@ struct scan_t {
43
43
  char *string;
44
44
  long unsigned int cursor;
45
45
  long unsigned int length;
46
+
47
+ int enc_index;
48
+ long unsigned int mb_cursor;
46
49
  };
47
50
 
48
51
  struct tokenizer_t
@@ -67,6 +70,9 @@ struct tokenizer_t
67
70
 
68
71
  void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
69
72
  void tokenizer_init(struct tokenizer_t *tk);
73
+ void tokenizer_free_members(struct tokenizer_t *tk);
74
+ void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length);
75
+ void tokenizer_free_scan_string(struct tokenizer_t *tk);
70
76
  void tokenizer_scan_all(struct tokenizer_t *tk);
71
77
  VALUE token_type_to_symbol(enum token_type type);
72
78
 
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "html_tokenizer"
3
- spec.version = "0.0.1"
3
+ spec.version = "0.0.7"
4
4
  spec.summary = "HTML Tokenizer"
5
5
  spec.author = "Francois Chagnon"
6
6
 
@@ -2,9 +2,10 @@ require 'html_tokenizer_ext'
2
2
 
3
3
  module HtmlTokenizer
4
4
  class ParserError < RuntimeError
5
- attr_reader :line, :column
6
- def initialize(message, line, column)
5
+ attr_reader :position, :line, :column
6
+ def initialize(message, position, line, column)
7
7
  super(message)
8
+ @position = position
8
9
  @line = line
9
10
  @column = column
10
11
  end
@@ -431,35 +431,13 @@ class HtmlTokenizer::ParserTest < Minitest::Test
431
431
  tokens << token
432
432
  end
433
433
  assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
434
- assert_equal "bar\n", @parser.extract(34, 38)
435
- end
436
-
437
- def test_extract_method
438
- parse("abcdefg")
439
- assert_equal "a", @parser.extract(0, 1)
440
- assert_equal "cd", @parser.extract(2, 4)
441
- end
442
-
443
- def test_extract_method_raises_argument_error_end_past_length
444
- parse("abcdefg")
445
- e = assert_raises(ArgumentError) do
446
- @parser.extract(0, 32)
447
- end
448
- assert_equal "'end' argument not in range of document", e.message
449
- end
450
-
451
- def test_extract_method_raises_argument_error_end_less_than_start
452
- parse("abcdefg")
453
- e = assert_raises(ArgumentError) do
454
- @parser.extract(1, 0)
455
- end
456
- assert_equal "'end' must be greater or equal than 'start'", e.message
457
434
  end
458
435
 
459
436
  def test_solidus_or_tag_name_error
460
437
  parse('<>')
461
438
  assert_equal 1, @parser.errors_count
462
439
  assert_equal "expected '/' or tag name", @parser.errors.first.to_s
440
+ assert_equal 1, @parser.errors.first.position
463
441
  assert_equal 1, @parser.errors.first.line
464
442
  assert_equal 1, @parser.errors.first.column
465
443
  end
@@ -468,6 +446,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
468
446
  parse('< ')
469
447
  assert_equal 1, @parser.errors_count
470
448
  assert_equal "expected '/' or tag name", @parser.errors.first.to_s
449
+ assert_equal 1, @parser.errors.first.position
471
450
  assert_equal 1, @parser.errors.first.line
472
451
  assert_equal 1, @parser.errors.first.column
473
452
  end
@@ -476,6 +455,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
476
455
  parse('<foo =')
477
456
  assert_equal 1, @parser.errors_count
478
457
  assert_equal "expected whitespace, '>', attribute name or value", @parser.errors.first.to_s
458
+ assert_equal 5, @parser.errors.first.position
479
459
  assert_equal 1, @parser.errors.first.line
480
460
  assert_equal 5, @parser.errors.first.column
481
461
  end
@@ -484,6 +464,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
484
464
  parse('<foo /x')
485
465
  assert_equal 1, @parser.errors_count
486
466
  assert_equal "expected '>' after '/'", @parser.errors.first.to_s
467
+ assert_equal 6, @parser.errors.first.position
487
468
  assert_equal 1, @parser.errors.first.line
488
469
  assert_equal 6, @parser.errors.first.column
489
470
  end
@@ -492,6 +473,7 @@ class HtmlTokenizer::ParserTest < Minitest::Test
492
473
  parse('<foo / ')
493
474
  assert_equal 1, @parser.errors_count
494
475
  assert_equal "expected '>' after '/'", @parser.errors.first.to_s
476
+ assert_equal 6, @parser.errors.first.position
495
477
  assert_equal 1, @parser.errors.first.line
496
478
  assert_equal 6, @parser.errors.first.column
497
479
  end
@@ -499,29 +481,33 @@ class HtmlTokenizer::ParserTest < Minitest::Test
499
481
  def test_attribute_name_error
500
482
  parse('<foo bar~')
501
483
  assert_equal 2, @parser.errors_count
502
- assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors.first.to_s
503
- assert_equal 1, @parser.errors.first.line
504
- assert_equal 8, @parser.errors.first.column
505
484
  assert_equal "expected whitespace, '>' or '=' after attribute name", @parser.errors[0].to_s
485
+ assert_equal 8, @parser.errors.first.position
506
486
  assert_equal 1, @parser.errors[0].line
507
487
  assert_equal 8, @parser.errors[0].column
488
+ assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
489
+ assert_equal 8, @parser.errors.first.position
490
+ assert_equal 1, @parser.errors[1].line
491
+ assert_equal 8, @parser.errors[1].column
508
492
  end
509
493
 
510
494
  def test_attribute_whitespace_or_equal_error
511
495
  parse('<foo bar ~')
512
496
  assert_equal 2, @parser.errors_count
513
- assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors.first.to_s
514
- assert_equal 1, @parser.errors.first.line
515
- assert_equal 9, @parser.errors.first.column
516
497
  assert_equal "expected '/', '>', \", ' or '=' after attribute name", @parser.errors[0].to_s
517
498
  assert_equal 1, @parser.errors[0].line
518
499
  assert_equal 9, @parser.errors[0].column
500
+ assert_equal "expected whitespace, '>', attribute name or value", @parser.errors[1].to_s
501
+ assert_equal 9, @parser.errors.first.position
502
+ assert_equal 1, @parser.errors[1].line
503
+ assert_equal 9, @parser.errors[1].column
519
504
  end
520
505
 
521
506
  def test_attribute_whitespace_or_equal_error_2
522
507
  parse('<foo bar = >')
523
508
  assert_equal 1, @parser.errors_count
524
509
  assert_equal "expected attribute value after '='", @parser.errors.first.to_s
510
+ assert_equal 11, @parser.errors.first.position
525
511
  assert_equal 1, @parser.errors.first.line
526
512
  assert_equal 11, @parser.errors.first.column
527
513
  end
@@ -530,10 +516,34 @@ class HtmlTokenizer::ParserTest < Minitest::Test
530
516
  parse('<foo bar=""x')
531
517
  assert_equal 1, @parser.errors_count
532
518
  assert_equal "expected space after attribute value", @parser.errors.first.to_s
519
+ assert_equal 11, @parser.errors.first.position
533
520
  assert_equal 1, @parser.errors.first.line
534
521
  assert_equal 11, @parser.errors.first.column
535
522
  end
536
523
 
524
+ def test_attribute_with_mutlibyte_characters
525
+ data = ["<div title", "='your store’s'>"]
526
+ tokens = []
527
+ parse(*data) { |name, start, stop| tokens << [name, start, stop, data.join[start...stop]] }
528
+ assert_equal "div", @parser.tag_name
529
+ assert_equal "title", @parser.attribute_name
530
+ assert_equal "your store’s", @parser.attribute_value
531
+ assert_equal data.join, @parser.document
532
+ assert_equal data.join.size, @parser.document_length
533
+ assert_equal data.join.size, @parser.column_number
534
+ assert_equal [
535
+ [:tag_start, 0, 1, "<"],
536
+ [:tag_name, 1, 4, "div"],
537
+ [:whitespace, 4, 5, " "],
538
+ [:attribute_name, 5, 10, "title"],
539
+ [:equal, 10, 11, "="],
540
+ [:attribute_quoted_value_start, 11, 12, "'"],
541
+ [:attribute_quoted_value, 12, 24, "your store’s"],
542
+ [:attribute_quoted_value_end, 24, 25, "'"],
543
+ [:tag_end, 25, 26, ">"],
544
+ ], tokens
545
+ end
546
+
537
547
  def test_valid_syntaxes
538
548
  parse(
539
549
  '<div>',
@@ -564,6 +574,15 @@ class HtmlTokenizer::ParserTest < Minitest::Test
564
574
  assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
565
575
  end
566
576
 
577
+ def test_doctype_without_space
578
+ parse('<!DOCTYPE')
579
+ assert_equal "!DOCTYPE", @parser.tag_name
580
+ parse('foo')
581
+ assert_equal "!DOCTYPEfoo", @parser.tag_name
582
+
583
+ assert_equal 0, @parser.errors_count, "Expected no errors: #{@parser.errors}"
584
+ end
585
+
567
586
  private
568
587
 
569
588
  def parse(*parts, &block)
@@ -324,13 +324,34 @@ class HtmlTokenizer::TokenizerTest < Minitest::Test
324
324
  ], result
325
325
  end
326
326
 
327
+ def test_html_with_mutlibyte_characters
328
+ data = "<div title='your store’s'>foo</div>"
329
+ result = tokenize(data)
330
+ assert_equal [
331
+ [:tag_start, "<"],
332
+ [:tag_name, "div"],
333
+ [:whitespace, " "],
334
+ [:attribute_name, "title"],
335
+ [:equal, "="],
336
+ [:attribute_quoted_value_start, "'"],
337
+ [:attribute_quoted_value, "your store’s"],
338
+ [:attribute_quoted_value_end, "'"],
339
+ [:tag_end, ">"],
340
+ [:text, "foo"],
341
+ [:tag_start, "<"],
342
+ [:solidus, "/"],
343
+ [:tag_name, "div"],
344
+ [:tag_end, ">"],
345
+ ], result
346
+ end
347
+
327
348
  private
328
349
 
329
350
  def tokenize(*parts)
330
351
  tokens = []
331
352
  @tokenizer = HtmlTokenizer::Tokenizer.new
332
353
  parts.each do |part|
333
- @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start..(stop-1)]] }
354
+ @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start...stop]] }
334
355
  end
335
356
  tokens
336
357
  end
metadata CHANGED
@@ -1,55 +1,55 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francois Chagnon
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-10-26 00:00:00.000000000 Z
11
+ date: 2018-05-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ~>
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
19
  version: '0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ~>
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
26
  version: '0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake-compiler
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ~>
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
33
  version: '0'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ~>
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: minitest
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ~>
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
47
  version: '0'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ~>
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
55
  description:
@@ -60,8 +60,8 @@ extensions:
60
60
  - ext/html_tokenizer_ext/extconf.rb
61
61
  extra_rdoc_files: []
62
62
  files:
63
- - .autotest
64
- - .gitignore
63
+ - ".autotest"
64
+ - ".gitignore"
65
65
  - Gemfile
66
66
  - Gemfile.lock
67
67
  - LICENSE
@@ -90,17 +90,17 @@ require_paths:
90
90
  - ext
91
91
  required_ruby_version: !ruby/object:Gem::Requirement
92
92
  requirements:
93
- - - '>='
93
+ - - ">="
94
94
  - !ruby/object:Gem::Version
95
95
  version: '0'
96
96
  required_rubygems_version: !ruby/object:Gem::Requirement
97
97
  requirements:
98
- - - '>='
98
+ - - ">="
99
99
  - !ruby/object:Gem::Version
100
100
  version: '0'
101
101
  requirements: []
102
102
  rubyforge_project:
103
- rubygems_version: 2.0.14.1
103
+ rubygems_version: 2.7.6
104
104
  signing_key:
105
105
  specification_version: 4
106
106
  summary: HTML Tokenizer