html_tokenizer 0.0.1 → 0.0.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a3d58539284af566692b81cc4633af1137baabea
-  data.tar.gz: 1877010598cbadadb27212eae39346769fa2afde
+  metadata.gz: 2e620e43f97a82c4cb3aae2067a2666325b453a5
+  data.tar.gz: 57784c1b53c4faefe2ab3b6e222836bfad852d8c
 SHA512:
-  metadata.gz: 9d6e46dfd48e7bb4967cfa13e90bdd213331b6dd38f9a57739ac1317f4b8147a9f11c5a39001ebf1c36554350ec444eaccc19dfe04040dadf2fc71d92435d5d5
-  data.tar.gz: f8fad88c25ff9404d710d4609ffc89fe80b08a71ba864cdca91a50cb9c5c98844befb75c43f5d4c6e0c56641fba57b4fda35ddf1c485abdcaf6bcd121bb4b0be
+  metadata.gz: 3b7a26cc219ea9f5885999146015e8137264ee51b43e5e6929d377cc2447ca6d4c1d5804e32954c24b2b807a19edcaf1f9cc708e7b5a0f83b086e16a21c8fa3e
+  data.tar.gz: 46beb0ed1994fe7468ab89451de1af830251da73d15c5d0943d30b70080806a458a7531a9fb7df2d7640e5ba1a195706eb84bf8896ef9184a207a1477747c200
Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    html_tokenizer (0.0.1)
+    html_tokenizer (0.0.2)
 
 GEM
   remote: https://rubygems.org/
@@ -21,4 +21,4 @@ DEPENDENCIES
   rake-compiler
 
 BUNDLED WITH
-   1.12.3
+   1.16.0
ext/html_tokenizer_ext/parser.c CHANGED
@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <ruby/encoding.h>
 #include "html_tokenizer.h"
 #include "parser.h"
 
@@ -65,6 +66,7 @@ static inline void parser_append_ref(struct token_reference_t *dest, struct toke
   if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
     dest->type = src->type;
     dest->start = src->start;
+    dest->mb_start = src->mb_start;
     dest->length = src->length;
     dest->line_number = src->line_number;
     dest->column_number = src->column_number;
@@ -362,15 +364,21 @@ static inline int rawtext_context(struct parser_t *parser)
 
 static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
 {
+  rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
   long unsigned int i;
+  const char *buf, *nextlf;
 
-  for(i = start;i < (start + length); i++) {
-    if(parser->doc.data[i] == '\n') {
+  for(i = 0; i < length;) {
+    buf = &parser->doc.data[start + i];
+    nextlf = memchr(buf, '\n', length - i);
+    if(nextlf) {
       parser->doc.column_number = 0;
       parser->doc.line_number += 1;
+      i += (nextlf - buf) + 1;
     }
     else {
-      parser->doc.column_number += 1;
+      parser->doc.column_number += rb_enc_strlen(buf, buf + length - i, enc);
+      break;
     }
   }
 
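The rewritten loop jumps between newlines with memchr and, for the trailing partial line, adds the character count from rb_enc_strlen instead of one per byte, so column_number becomes a character column rather than a byte column. A minimal sketch of the observable effect (the input string is a hypothetical example; Parser#parse and #column_number are the methods defined in this file):

```ruby
require "html_tokenizer"

parser = HtmlTokenizer::Parser.new
parser.parse("<p>héllo")  # "é" is 2 bytes in UTF-8

# 0.0.1 counted bytes (9); 0.0.2 counts characters.
parser.column_number # => 8
```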
@@ -383,11 +391,14 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
   struct token_reference_t ref = {
     .type = type,
     .start = tk->scan.cursor,
+    .mb_start = tk->scan.mb_cursor,
     .length = length,
     .line_number = parser->doc.line_number,
     .column_number = parser->doc.column_number,
   };
   int parse_again = 1;
+  long unsigned int mb_strlen;
+  rb_encoding *enc;
 
   while(parse_again) {
     switch(parser->context)
@@ -438,8 +449,10 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
   }
 
   if(rb_block_given_p()) {
+    enc = rb_enc_from_index(parser->doc.enc_index);
+    mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
     rb_yield_values(5, token_type_to_symbol(type),
-      INT2NUM(ref.start), INT2NUM(ref.start + ref.length),
+      INT2NUM(ref.mb_start), INT2NUM(ref.mb_start + mb_strlen),
       INT2NUM(ref.line_number), INT2NUM(ref.column_number));
   }
 
@@ -465,6 +478,8 @@ static VALUE parser_initialize_method(VALUE self)
 
   parser->doc.length = 0;
   parser->doc.data = NULL;
+  parser->doc.enc_index = 0;
+  parser->doc.mb_length = 0;
 
   parser->doc.line_number = 1;
   parser->doc.column_number = 0;
@@ -478,11 +493,17 @@ static VALUE parser_initialize_method(VALUE self)
 static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
 {
   void *old = parser->doc.data;
+  unsigned long int mb_length;
+  char *buf;
+  rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
   REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
   DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
-    parser->doc.data, parser->doc.length + length + 1);
-  strcpy(parser->doc.data+parser->doc.length, string);
+    parser->doc.data, parser->doc.length + length + 1);
+  buf = parser->doc.data + parser->doc.length;
+  strcpy(buf, string);
+  mb_length = rb_enc_strlen(buf, buf + length, enc);
   parser->doc.length += length;
+  parser->doc.mb_length += mb_length;
   return 1;
 }
 
@@ -490,7 +511,7 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
 {
   struct parser_t *parser = NULL;
   char *string = NULL;
-  long unsigned int length = 0, cursor = 0;
+  long unsigned int length = 0, cursor = 0, mb_cursor = 0;
 
   if(NIL_P(source))
     return Qnil;
@@ -502,6 +523,15 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
   length = strlen(string);
 
   cursor = parser->doc.length;
+  mb_cursor = parser->doc.mb_length;
+
+  if(parser->doc.data == NULL) {
+    parser->doc.enc_index = rb_enc_get_index(source);
+  }
+  else if(parser->doc.enc_index != rb_enc_get_index(source)) {
+    rb_raise(rb_eArgError, "cannot append %s string to %s document",
+      rb_enc_name(rb_enc_get(source)), rb_enc_name(rb_enc_from_index(parser->doc.enc_index)));
+  }
 
   if(!parser_document_append(parser, string, length)) {
     // error
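Because the document now carries a single enc_index, appending a chunk in a different encoding raises the new ArgumentError instead of silently mixing byte sequences. A sketch of what a caller would see (hypothetical inputs; the message format comes from the rb_raise above):

```ruby
parser = HtmlTokenizer::Parser.new
parser.parse("héllo".encode("UTF-8"))
parser.parse("wörld".encode("ISO-8859-1"))
# => ArgumentError: cannot append ISO-8859-1 string to UTF-8 document
```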
@@ -515,6 +545,8 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
     parser->tk.scan.cursor = cursor;
     parser->tk.scan.string = parser->doc.data;
     parser->tk.scan.length = parser->doc.length;
+    parser->tk.scan.enc_index = parser->doc.enc_index;
+    parser->tk.scan.mb_cursor = mb_cursor;
 
     tokenizer_scan_all(&parser->tk);
   }
@@ -535,17 +567,30 @@ static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
 static VALUE parser_document_method(VALUE self)
 {
   struct parser_t *parser = NULL;
+  rb_encoding *enc;
   Parser_Get_Struct(self, parser);
   if(!parser->doc.data)
     return Qnil;
-  return rb_str_new(parser->doc.data, parser->doc.length);
+  enc = rb_enc_from_index(parser->doc.enc_index);
+  return rb_enc_str_new(parser->doc.data, parser->doc.length, enc);
 }
 
 static VALUE parser_document_length_method(VALUE self)
 {
   struct parser_t *parser = NULL;
+  rb_encoding *enc;
+  const char *buf;
+
   Parser_Get_Struct(self, parser);
-  return ULONG2NUM(parser->doc.length);
+
+  if(parser->doc.data == NULL) {
+    return ULONG2NUM(0);
+  }
+  else {
+    buf = parser->doc.data;
+    enc = rb_enc_from_index(parser->doc.enc_index);
+    return ULONG2NUM(rb_enc_strlen(buf, buf + parser->doc.length, enc));
+  }
 }
 
 static VALUE parser_context_method(VALUE self)
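Two user-visible consequences of this hunk: Parser#document now returns a string tagged with the original encoding (rb_enc_str_new instead of rb_str_new), and Parser#document_length reports characters rather than bytes. A short sketch with a hypothetical input:

```ruby
parser = HtmlTokenizer::Parser.new
parser.parse("héllo")
parser.document        # => "héllo", in the input string's encoding
parser.document_length # => 5 (characters, not the 6 UTF-8 bytes)
```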
@@ -588,9 +633,10 @@ static VALUE parser_context_method(VALUE self)
 
 static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
 {
+  rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
   if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
     return Qnil;
-  return rb_str_new(parser->doc.data+ref->start, ref->length);
+  return rb_enc_str_new(parser->doc.data+ref->start, ref->length, enc);
 }
 
 static VALUE parser_tag_name_method(VALUE self)
@@ -665,29 +711,6 @@ static VALUE parser_rawtext_text_method(VALUE self)
   return ref_to_str(parser, &parser->rawtext.text);
 }
 
-static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
-{
-  struct parser_t *parser = NULL;
-  unsigned long int start, end;
-  struct token_reference_t ref;
-
-  Parser_Get_Struct(self, parser);
-
-  start = NUM2ULONG(start_p);
-  end = NUM2ULONG(end_p);
-  if(end < start) {
-    rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
-  }
-  if(end > parser->doc.length) {
-    rb_raise(rb_eArgError, "'end' argument not in range of document");
-  }
-
-  ref.type = TOKEN_TEXT; // anything not NONE
-  ref.start = start;
-  ref.length = end - start;
-  return ref_to_str(parser, &ref);
-}
-
 static VALUE parser_errors_count_method(VALUE self)
 {
   struct parser_t *parser = NULL;
@@ -749,7 +772,6 @@ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
   rb_define_method(cParser, "column_number", parser_column_number_method, 0);
   rb_define_method(cParser, "parse", parser_parse_method, 1);
   rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
-  rb_define_method(cParser, "extract", parser_extract_method, 2);
   rb_define_method(cParser, "context", parser_context_method, 0);
   rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
   rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
ext/html_tokenizer_ext/parser.h CHANGED
@@ -28,11 +28,15 @@ struct parser_document_t {
   char *data;
   long unsigned int line_number;
   long unsigned int column_number;
+
+  int enc_index;
+  long unsigned int mb_length;
 };
 
 struct token_reference_t {
   enum token_type type;
   long unsigned int start;
+  long unsigned int mb_start;
   long unsigned int length;
   long unsigned int line_number;
   long unsigned int column_number;
ext/html_tokenizer_ext/tokenizer.c CHANGED
@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <ruby/encoding.h>
 #include "html_tokenizer.h"
 #include "tokenizer.h"
 
@@ -60,6 +61,8 @@ void tokenizer_init(struct tokenizer_t *tk)
   tk->scan.string = NULL;
   tk->scan.cursor = 0;
   tk->scan.length = 0;
+  tk->scan.mb_cursor = 0;
+  tk->scan.enc_index = 0;
 
   tk->attribute_value_start = 0;
   tk->found_attribute = 0;
@@ -115,17 +118,27 @@ VALUE token_type_to_symbol(enum token_type type)
   return Qnil;
 }
 
+static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
+{
+  rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
+  const char *buf = tk->scan.string + tk->scan.cursor;
+  return rb_enc_strlen(buf, buf + length, enc);
+}
+
 static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
 {
+  long unsigned int mb_length = tokenizer_mblength(tk, length);
   tk->last_token = type;
-  rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.cursor), INT2NUM(tk->scan.cursor + length));
+  rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
 }
 
 static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
 {
+  long unsigned int mb_length = tokenizer_mblength(tk, length);
   if(tk->f_callback)
     tk->f_callback(tk, type, length, tk->callback_data);
   tk->scan.cursor += length;
+  tk->scan.mb_cursor += mb_length;
 }
 
 static VALUE tokenizer_initialize_method(VALUE self)
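The standalone Tokenizer gets the same treatment: tokenizer_mblength converts a byte length at the current cursor into a character count, so the three values yielded to a Tokenizer#tokenize block become character offsets. Usage keeps the familiar block form (this mirrors the updated test helper below; the HTML string is a hypothetical example):

```ruby
tokenizer = HtmlTokenizer::Tokenizer.new
html = "<p title='café'>"
tokenizer.tokenize(html) do |name, start, stop|
  # start/stop index characters, so this slice is multibyte-safe
  puts [name, html[start...stop]].inspect
end
```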
@@ -657,6 +670,8 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
   c_source = StringValueCStr(source);
   tk->scan.cursor = 0;
   tk->scan.length = strlen(c_source);
+  tk->scan.enc_index = rb_enc_get_index(source);
+  tk->scan.mb_cursor = 0;
 
   old = tk->scan.string;
   REALLOC_N(tk->scan.string, char, tk->scan.length+1);
ext/html_tokenizer_ext/tokenizer.h CHANGED
@@ -43,6 +43,9 @@ struct scan_t {
   char *string;
   long unsigned int cursor;
   long unsigned int length;
+
+  int enc_index;
+  long unsigned int mb_cursor;
 };
 
 struct tokenizer_t
html_tokenizer.gemspec CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "html_tokenizer"
-  spec.version = "0.0.1"
+  spec.version = "0.0.2"
   spec.summary = "HTML Tokenizer"
   spec.author = "Francois Chagnon"
 
test/unit/parser_test.rb CHANGED
@@ -431,29 +431,6 @@ class HtmlTokenizer::ParserTest < Minitest::Test
       tokens << token
     end
     assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
-    assert_equal "bar\n", @parser.extract(34, 38)
-  end
-
-  def test_extract_method
-    parse("abcdefg")
-    assert_equal "a", @parser.extract(0, 1)
-    assert_equal "cd", @parser.extract(2, 4)
-  end
-
-  def test_extract_method_raises_argument_error_end_past_length
-    parse("abcdefg")
-    e = assert_raises(ArgumentError) do
-      @parser.extract(0, 32)
-    end
-    assert_equal "'end' argument not in range of document", e.message
-  end
-
-  def test_extract_method_raises_argument_error_end_less_than_start
-    parse("abcdefg")
-    e = assert_raises(ArgumentError) do
-      @parser.extract(1, 0)
-    end
-    assert_equal "'end' must be greater or equal than 'start'", e.message
   end
 
   def test_solidus_or_tag_name_error
@@ -534,6 +511,29 @@ class HtmlTokenizer::ParserTest < Minitest::Test
     assert_equal 11, @parser.errors.first.column
   end
 
+  def test_attribute_with_mutlibyte_characters
+    data = ["<div title", "='your store’s'>"]
+    tokens = []
+    parse(*data) { |name, start, stop| tokens << [name, start, stop, data.join[start...stop]] }
+    assert_equal "div", @parser.tag_name
+    assert_equal "title", @parser.attribute_name
+    assert_equal "your store’s", @parser.attribute_value
+    assert_equal data.join, @parser.document
+    assert_equal data.join.size, @parser.document_length
+    assert_equal data.join.size, @parser.column_number
+    assert_equal [
+      [:tag_start, 0, 1, "<"],
+      [:tag_name, 1, 4, "div"],
+      [:whitespace, 4, 5, " "],
+      [:attribute_name, 5, 10, "title"],
+      [:equal, 10, 11, "="],
+      [:attribute_quoted_value_start, 11, 12, "'"],
+      [:attribute_quoted_value, 12, 24, "your store’s"],
+      [:attribute_quoted_value_end, 24, 25, "'"],
+      [:tag_end, 25, 26, ">"],
+    ], tokens
+  end
+
   def test_valid_syntaxes
     parse(
       '<div>',
test/unit/tokenizer_test.rb CHANGED
@@ -324,13 +324,34 @@ class HtmlTokenizer::TokenizerTest < Minitest::Test
     ], result
   end
 
+  def test_html_with_mutlibyte_characters
+    data = "<div title='your store’s'>foo</div>"
+    result = tokenize(data)
+    assert_equal [
+      [:tag_start, "<"],
+      [:tag_name, "div"],
+      [:whitespace, " "],
+      [:attribute_name, "title"],
+      [:equal, "="],
+      [:attribute_quoted_value_start, "'"],
+      [:attribute_quoted_value, "your store’s"],
+      [:attribute_quoted_value_end, "'"],
+      [:tag_end, ">"],
+      [:text, "foo"],
+      [:tag_start, "<"],
+      [:solidus, "/"],
+      [:tag_name, "div"],
+      [:tag_end, ">"],
+    ], result
+  end
+
   private
 
   def tokenize(*parts)
     tokens = []
     @tokenizer = HtmlTokenizer::Tokenizer.new
     parts.each do |part|
-      @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start..(stop-1)]] }
+      @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start...stop]] }
     end
     tokens
   end
metadata CHANGED
@@ -1,55 +1,55 @@
 --- !ruby/object:Gem::Specification
 name: html_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - Francois Chagnon
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-10-26 00:00:00.000000000 Z
+date: 2017-11-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake-compiler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
 description:
@@ -60,8 +60,8 @@ extensions:
 - ext/html_tokenizer_ext/extconf.rb
 extra_rdoc_files: []
 files:
-- .autotest
-- .gitignore
+- ".autotest"
+- ".gitignore"
 - Gemfile
 - Gemfile.lock
 - LICENSE
@@ -90,17 +90,17 @@ require_paths:
 - ext
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.14.1
+rubygems_version: 2.6.8
 signing_key:
 specification_version: 4
 summary: HTML Tokenizer