html_tokenizer 0.0.1 → 0.0.2

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a3d58539284af566692b81cc4633af1137baabea
-  data.tar.gz: 1877010598cbadadb27212eae39346769fa2afde
+  metadata.gz: 2e620e43f97a82c4cb3aae2067a2666325b453a5
+  data.tar.gz: 57784c1b53c4faefe2ab3b6e222836bfad852d8c
 SHA512:
-  metadata.gz: 9d6e46dfd48e7bb4967cfa13e90bdd213331b6dd38f9a57739ac1317f4b8147a9f11c5a39001ebf1c36554350ec444eaccc19dfe04040dadf2fc71d92435d5d5
-  data.tar.gz: f8fad88c25ff9404d710d4609ffc89fe80b08a71ba864cdca91a50cb9c5c98844befb75c43f5d4c6e0c56641fba57b4fda35ddf1c485abdcaf6bcd121bb4b0be
+  metadata.gz: 3b7a26cc219ea9f5885999146015e8137264ee51b43e5e6929d377cc2447ca6d4c1d5804e32954c24b2b807a19edcaf1f9cc708e7b5a0f83b086e16a21c8fa3e
+  data.tar.gz: 46beb0ed1994fe7468ab89451de1af830251da73d15c5d0943d30b70080806a458a7531a9fb7df2d7640e5ba1a195706eb84bf8896ef9184a207a1477747c200
Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    html_tokenizer (0.0.1)
+    html_tokenizer (0.0.2)
 
 GEM
   remote: https://rubygems.org/
@@ -21,4 +21,4 @@ DEPENDENCIES
   rake-compiler
 
 BUNDLED WITH
-   1.12.3
+   1.16.0
ext/html_tokenizer_ext/parser.c CHANGED
@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <ruby/encoding.h>
 #include "html_tokenizer.h"
 #include "parser.h"
 
@@ -65,6 +66,7 @@ static inline void parser_append_ref(struct token_reference_t *dest, struct toke
   if(dest->type == TOKEN_NONE || dest->type != src->type || (dest->start + dest->length) != src->start) {
     dest->type = src->type;
     dest->start = src->start;
+    dest->mb_start = src->mb_start;
     dest->length = src->length;
     dest->line_number = src->line_number;
     dest->column_number = src->column_number;
@@ -362,15 +364,21 @@ static inline int rawtext_context(struct parser_t *parser)
 
 static void parser_adjust_line_number(struct parser_t *parser, long unsigned int start, long unsigned int length)
 {
+  rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
   long unsigned int i;
+  const char *buf, *nextlf;
 
-  for(i = start;i < (start + length); i++) {
-    if(parser->doc.data[i] == '\n') {
+  for(i = 0; i < length;) {
+    buf = &parser->doc.data[start + i];
+    nextlf = memchr(buf, '\n', length - i);
+    if(nextlf) {
       parser->doc.column_number = 0;
       parser->doc.line_number += 1;
+      i += (nextlf - buf) + 1;
    }
    else {
-      parser->doc.column_number += 1;
+      parser->doc.column_number += rb_enc_strlen(buf, buf + length - i, enc);
+      break;
    }
  }
 
@@ -383,11 +391,14 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
   struct token_reference_t ref = {
     .type = type,
     .start = tk->scan.cursor,
+    .mb_start = tk->scan.mb_cursor,
     .length = length,
     .line_number = parser->doc.line_number,
     .column_number = parser->doc.column_number,
   };
   int parse_again = 1;
+  long unsigned int mb_strlen;
+  rb_encoding *enc;
 
   while(parse_again) {
     switch(parser->context)
@@ -438,8 +449,10 @@ static void parser_tokenize_callback(struct tokenizer_t *tk, enum token_type typ
   }
 
   if(rb_block_given_p()) {
+    enc = rb_enc_from_index(parser->doc.enc_index);
+    mb_strlen = rb_enc_strlen(parser->doc.data + ref.start, parser->doc.data + ref.start + ref.length, enc);
     rb_yield_values(5, token_type_to_symbol(type),
-      INT2NUM(ref.start), INT2NUM(ref.start + ref.length),
+      INT2NUM(ref.mb_start), INT2NUM(ref.mb_start + mb_strlen),
       INT2NUM(ref.line_number), INT2NUM(ref.column_number));
   }
 
@@ -465,6 +478,8 @@ static VALUE parser_initialize_method(VALUE self)
 
   parser->doc.length = 0;
   parser->doc.data = NULL;
+  parser->doc.enc_index = 0;
+  parser->doc.mb_length = 0;
 
   parser->doc.line_number = 1;
   parser->doc.column_number = 0;
@@ -478,11 +493,17 @@ static VALUE parser_initialize_method(VALUE self)
 static int parser_document_append(struct parser_t *parser, const char *string, unsigned long int length)
 {
   void *old = parser->doc.data;
+  unsigned long int mb_length;
+  char *buf;
+  rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
   REALLOC_N(parser->doc.data, char, parser->doc.length + length + 1);
   DBG_PRINT("parser=%p realloc(parser->doc.data) %p -> %p length=%lu", parser, old,
-    parser->doc.data, parser->doc.length + length + 1);
-  strcpy(parser->doc.data+parser->doc.length, string);
+      parser->doc.data, parser->doc.length + length + 1);
+  buf = parser->doc.data + parser->doc.length;
+  strcpy(buf, string);
+  mb_length = rb_enc_strlen(buf, buf + length, enc);
   parser->doc.length += length;
+  parser->doc.mb_length += mb_length;
   return 1;
 }
 
@@ -490,7 +511,7 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
 {
   struct parser_t *parser = NULL;
   char *string = NULL;
-  long unsigned int length = 0, cursor = 0;
+  long unsigned int length = 0, cursor = 0, mb_cursor = 0;
 
   if(NIL_P(source))
     return Qnil;
@@ -502,6 +523,15 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
   length = strlen(string);
 
   cursor = parser->doc.length;
+  mb_cursor = parser->doc.mb_length;
+
+  if(parser->doc.data == NULL) {
+    parser->doc.enc_index = rb_enc_get_index(source);
+  }
+  else if(parser->doc.enc_index != rb_enc_get_index(source)) {
+    rb_raise(rb_eArgError, "cannot append %s string to %s document",
+      rb_enc_name(rb_enc_get(source)), rb_enc_name(rb_enc_from_index(parser->doc.enc_index)));
+  }
 
   if(!parser_document_append(parser, string, length)) {
     // error
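The new guard above pins the document to the encoding of the first appended chunk and raises when a later chunk disagrees. A hedged illustration (the error message comes from the rb_raise above; the specific encodings are only an example):

    parser = HtmlTokenizer::Parser.new
    parser.parse("<div>".encode(Encoding::US_ASCII))
    parser.parse("store’s".encode(Encoding::UTF_8))
    # => ArgumentError: cannot append UTF-8 string to US-ASCII document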
@@ -515,6 +545,8 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
     parser->tk.scan.cursor = cursor;
     parser->tk.scan.string = parser->doc.data;
     parser->tk.scan.length = parser->doc.length;
+    parser->tk.scan.enc_index = parser->doc.enc_index;
+    parser->tk.scan.mb_cursor = mb_cursor;
 
     tokenizer_scan_all(&parser->tk);
   }
@@ -535,17 +567,30 @@ static VALUE parser_append_placeholder_method(VALUE self, VALUE source)
 static VALUE parser_document_method(VALUE self)
 {
   struct parser_t *parser = NULL;
+  rb_encoding *enc;
   Parser_Get_Struct(self, parser);
   if(!parser->doc.data)
     return Qnil;
-  return rb_str_new(parser->doc.data, parser->doc.length);
+  enc = rb_enc_from_index(parser->doc.enc_index);
+  return rb_enc_str_new(parser->doc.data, parser->doc.length, enc);
 }
 
 static VALUE parser_document_length_method(VALUE self)
 {
   struct parser_t *parser = NULL;
+  rb_encoding *enc;
+  const char *buf;
+
   Parser_Get_Struct(self, parser);
-  return ULONG2NUM(parser->doc.length);
+
+  if(parser->doc.data == NULL) {
+    return ULONG2NUM(0);
+  }
+  else {
+    buf = parser->doc.data;
+    enc = rb_enc_from_index(parser->doc.enc_index);
+    return ULONG2NUM(rb_enc_strlen(buf, buf + parser->doc.length, enc));
+  }
 }
 
 static VALUE parser_context_method(VALUE self)
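document and document_length are now encoding-aware: the former returns a string tagged with the document's encoding, and the latter counts characters via rb_enc_strlen instead of returning the byte length. A short sketch (the character-count semantics are confirmed by the document_length assertion in the test diff further down):

    parser = HtmlTokenizer::Parser.new
    parser.parse("’")         # one character, three UTF-8 bytes
    parser.document_length    # => 1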
@@ -588,9 +633,10 @@ static VALUE parser_context_method(VALUE self)
 
 static inline VALUE ref_to_str(struct parser_t *parser, struct token_reference_t *ref)
 {
+  rb_encoding *enc = rb_enc_from_index(parser->doc.enc_index);
   if(ref->type == TOKEN_NONE || parser->doc.data == NULL)
     return Qnil;
-  return rb_str_new(parser->doc.data+ref->start, ref->length);
+  return rb_enc_str_new(parser->doc.data+ref->start, ref->length, enc);
 }
 
 static VALUE parser_tag_name_method(VALUE self)
@@ -665,29 +711,6 @@ static VALUE parser_rawtext_text_method(VALUE self)
   return ref_to_str(parser, &parser->rawtext.text);
 }
 
-static VALUE parser_extract_method(VALUE self, VALUE start_p, VALUE end_p)
-{
-  struct parser_t *parser = NULL;
-  unsigned long int start, end;
-  struct token_reference_t ref;
-
-  Parser_Get_Struct(self, parser);
-
-  start = NUM2ULONG(start_p);
-  end = NUM2ULONG(end_p);
-  if(end < start) {
-    rb_raise(rb_eArgError, "'end' must be greater or equal than 'start'");
-  }
-  if(end > parser->doc.length) {
-    rb_raise(rb_eArgError, "'end' argument not in range of document");
-  }
-
-  ref.type = TOKEN_TEXT; // anything not NONE
-  ref.start = start;
-  ref.length = end - start;
-  return ref_to_str(parser, &ref);
-}
-
 static VALUE parser_errors_count_method(VALUE self)
 {
   struct parser_t *parser = NULL;
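With extract removed in 0.0.2 (see the deleted parser_extract_method above and its tests below), a plausible stand-in, assuming the character offsets the parser now yields, is to slice the document string directly; this is an illustration, not an API the gem provides:

    text = parser.document[start...stop]   # hypothetical replacement for parser.extract(start, stop)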
@@ -749,7 +772,6 @@ void Init_html_tokenizer_parser(VALUE mHtmlTokenizer)
   rb_define_method(cParser, "column_number", parser_column_number_method, 0);
   rb_define_method(cParser, "parse", parser_parse_method, 1);
   rb_define_method(cParser, "append_placeholder", parser_append_placeholder_method, 1);
-  rb_define_method(cParser, "extract", parser_extract_method, 2);
   rb_define_method(cParser, "context", parser_context_method, 0);
   rb_define_method(cParser, "tag_name", parser_tag_name_method, 0);
   rb_define_method(cParser, "closing_tag?", parser_closing_tag_method, 0);
ext/html_tokenizer_ext/parser.h CHANGED
@@ -28,11 +28,15 @@ struct parser_document_t {
   char *data;
   long unsigned int line_number;
   long unsigned int column_number;
+
+  int enc_index;
+  long unsigned int mb_length;
 };
 
 struct token_reference_t {
   enum token_type type;
   long unsigned int start;
+  long unsigned int mb_start;
   long unsigned int length;
   long unsigned int line_number;
   long unsigned int column_number;
ext/html_tokenizer_ext/tokenizer.c CHANGED
@@ -1,4 +1,5 @@
 #include <ruby.h>
+#include <ruby/encoding.h>
 #include "html_tokenizer.h"
 #include "tokenizer.h"
 
@@ -60,6 +61,8 @@ void tokenizer_init(struct tokenizer_t *tk)
   tk->scan.string = NULL;
   tk->scan.cursor = 0;
   tk->scan.length = 0;
+  tk->scan.mb_cursor = 0;
+  tk->scan.enc_index = 0;
 
   tk->attribute_value_start = 0;
   tk->found_attribute = 0;
@@ -115,17 +118,27 @@ VALUE token_type_to_symbol(enum token_type type)
   return Qnil;
 }
 
+static long unsigned int tokenizer_mblength(struct tokenizer_t *tk, long unsigned int length)
+{
+  rb_encoding *enc = rb_enc_from_index(tk->scan.enc_index);
+  const char *buf = tk->scan.string + tk->scan.cursor;
+  return rb_enc_strlen(buf, buf + length, enc);
+}
+
 static void tokenizer_yield_tag(struct tokenizer_t *tk, enum token_type type, long unsigned int length, void *data)
 {
+  long unsigned int mb_length = tokenizer_mblength(tk, length);
   tk->last_token = type;
-  rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.cursor), INT2NUM(tk->scan.cursor + length));
+  rb_yield_values(3, token_type_to_symbol(type), INT2NUM(tk->scan.mb_cursor), INT2NUM(tk->scan.mb_cursor + mb_length));
 }
 
 static void tokenizer_callback(struct tokenizer_t *tk, enum token_type type, long unsigned int length)
 {
+  long unsigned int mb_length = tokenizer_mblength(tk, length);
   if(tk->f_callback)
     tk->f_callback(tk, type, length, tk->callback_data);
   tk->scan.cursor += length;
+  tk->scan.mb_cursor += mb_length;
 }
 
 static VALUE tokenizer_initialize_method(VALUE self)
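With the hunks above, the tokenizer keeps a character-based cursor (mb_cursor) alongside the byte cursor and yields character offsets to the block, so ordinary Ruby string slicing lines up with multibyte input. A short sketch (the three-argument block shape matches the tests below):

    tokenizer = HtmlTokenizer::Tokenizer.new
    html = "<p title='store’s'>"
    tokenizer.tokenize(html) do |name, start, stop|
      p [name, html[start...stop]]   # offsets are characters, so slicing is safe
    end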
@@ -657,6 +670,8 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
   c_source = StringValueCStr(source);
   tk->scan.cursor = 0;
   tk->scan.length = strlen(c_source);
+  tk->scan.enc_index = rb_enc_get_index(source);
+  tk->scan.mb_cursor = 0;
 
   old = tk->scan.string;
   REALLOC_N(tk->scan.string, char, tk->scan.length+1);
ext/html_tokenizer_ext/tokenizer.h CHANGED
@@ -43,6 +43,9 @@ struct scan_t {
   char *string;
   long unsigned int cursor;
   long unsigned int length;
+
+  int enc_index;
+  long unsigned int mb_cursor;
 };
 
 struct tokenizer_t
html_tokenizer.gemspec CHANGED
@@ -1,6 +1,6 @@
 Gem::Specification.new do |spec|
   spec.name = "html_tokenizer"
-  spec.version = "0.0.1"
+  spec.version = "0.0.2"
   spec.summary = "HTML Tokenizer"
   spec.author = "Francois Chagnon"
 
test/unit/parser_test.rb CHANGED
@@ -431,29 +431,6 @@ class HtmlTokenizer::ParserTest < Minitest::Test
       tokens << token
     end
     assert_equal [[:text, 0, 4, 1, 0], [:text, 34, 38, 5, 0]], tokens
-    assert_equal "bar\n", @parser.extract(34, 38)
-  end
-
-  def test_extract_method
-    parse("abcdefg")
-    assert_equal "a", @parser.extract(0, 1)
-    assert_equal "cd", @parser.extract(2, 4)
-  end
-
-  def test_extract_method_raises_argument_error_end_past_length
-    parse("abcdefg")
-    e = assert_raises(ArgumentError) do
-      @parser.extract(0, 32)
-    end
-    assert_equal "'end' argument not in range of document", e.message
-  end
-
-  def test_extract_method_raises_argument_error_end_less_than_start
-    parse("abcdefg")
-    e = assert_raises(ArgumentError) do
-      @parser.extract(1, 0)
-    end
-    assert_equal "'end' must be greater or equal than 'start'", e.message
   end
 
   def test_solidus_or_tag_name_error
@@ -534,6 +511,29 @@ class HtmlTokenizer::ParserTest < Minitest::Test
     assert_equal 11, @parser.errors.first.column
   end
 
+  def test_attribute_with_mutlibyte_characters
+    data = ["<div title", "='your store’s'>"]
+    tokens = []
+    parse(*data) { |name, start, stop| tokens << [name, start, stop, data.join[start...stop]] }
+    assert_equal "div", @parser.tag_name
+    assert_equal "title", @parser.attribute_name
+    assert_equal "your store’s", @parser.attribute_value
+    assert_equal data.join, @parser.document
+    assert_equal data.join.size, @parser.document_length
+    assert_equal data.join.size, @parser.column_number
+    assert_equal [
+      [:tag_start, 0, 1, "<"],
+      [:tag_name, 1, 4, "div"],
+      [:whitespace, 4, 5, " "],
+      [:attribute_name, 5, 10, "title"],
+      [:equal, 10, 11, "="],
+      [:attribute_quoted_value_start, 11, 12, "'"],
+      [:attribute_quoted_value, 12, 24, "your store’s"],
+      [:attribute_quoted_value_end, 24, 25, "'"],
+      [:tag_end, 25, 26, ">"],
+    ], tokens
+  end
+
   def test_valid_syntaxes
     parse(
       '<div>',
test/unit/tokenizer_test.rb CHANGED
@@ -324,13 +324,34 @@ class HtmlTokenizer::TokenizerTest < Minitest::Test
     ], result
   end
 
+  def test_html_with_mutlibyte_characters
+    data = "<div title='your store’s'>foo</div>"
+    result = tokenize(data)
+    assert_equal [
+      [:tag_start, "<"],
+      [:tag_name, "div"],
+      [:whitespace, " "],
+      [:attribute_name, "title"],
+      [:equal, "="],
+      [:attribute_quoted_value_start, "'"],
+      [:attribute_quoted_value, "your store’s"],
+      [:attribute_quoted_value_end, "'"],
+      [:tag_end, ">"],
+      [:text, "foo"],
+      [:tag_start, "<"],
+      [:solidus, "/"],
+      [:tag_name, "div"],
+      [:tag_end, ">"],
+    ], result
+  end
+
   private
 
   def tokenize(*parts)
     tokens = []
     @tokenizer = HtmlTokenizer::Tokenizer.new
     parts.each do |part|
-      @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start..(stop-1)]] }
+      @tokenizer.tokenize(part) { |name, start, stop| tokens << [name, part[start...stop]] }
     end
     tokens
   end
metadata CHANGED
@@ -1,55 +1,55 @@
 --- !ruby/object:Gem::Specification
 name: html_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.0.1
+  version: 0.0.2
 platform: ruby
 authors:
 - Francois Chagnon
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-10-26 00:00:00.000000000 Z
+date: 2017-11-21 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: rake-compiler
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
 - !ruby/object:Gem::Dependency
   name: minitest
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ~>
+    - - "~>"
      - !ruby/object:Gem::Version
        version: '0'
 description:
@@ -60,8 +60,8 @@ extensions:
 - ext/html_tokenizer_ext/extconf.rb
 extra_rdoc_files: []
 files:
-- .autotest
-- .gitignore
+- ".autotest"
+- ".gitignore"
 - Gemfile
 - Gemfile.lock
 - LICENSE
@@ -90,17 +90,17 @@ require_paths:
 - ext
 required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - '>='
+  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.0.14.1
+rubygems_version: 2.6.8
 signing_key:
 specification_version: 4
 summary: HTML Tokenizer