html_tokenizer 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 18ea0aa757ca8640a78cb619149b306aba21b8cc
4
- data.tar.gz: 182b3fab9d73373ae04c45beb3d73eb0798bfb3b
3
+ metadata.gz: a45a481e7310c22092c48de49f62315d7ae19700
4
+ data.tar.gz: fda14319ce3a8b6770c074b68384e3c7e16d3fa7
5
5
  SHA512:
6
- metadata.gz: d92c656e1c755d7b6e338732444327481af753145ca981e5e35510c44f77eecc5b06cb9eae16b02874f502037ecbf286f817886cb4fe1a5ea196fe77c760feaf
7
- data.tar.gz: a57323a2228866046de1176f28f70c0309f24cb8a5fec6e0b486a4c64beaab83dcc32ab059b044c85776edcc1833bf59bb068606fcbd32c7c9ead6ecd032edaa
6
+ metadata.gz: b55f2ae076aa4cbf3b55fca42435bcfa63ec4e9e032a91ad55260cf0b6cc74ae75b484b0c7ad70cb08a6bf3fe75d9499feec9df768f85fdac2fdfe29cc8262ca
7
+ data.tar.gz: 8e3726c0471524c66270d8ed187a807b5c6bf906312e754c69e0e4da915db46ee28729f0400347413ddaf147c48a21fd4d628454e01f09eec2a104ace6452d17
data/.gitignore CHANGED
@@ -30,6 +30,7 @@ tmp/
30
30
  *.i*86
31
31
  *.x86_64
32
32
  *.hex
33
+ *.gem
33
34
 
34
35
  # Debug files
35
36
  *.dSYM/
@@ -1,13 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- html_tokenizer (0.0.3)
4
+ html_tokenizer (0.0.5)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  minitest (5.9.0)
10
- rake (11.1.2)
10
+ rake (12.3.0)
11
11
  rake-compiler (0.9.9)
12
12
  rake
13
13
 
@@ -2,5 +2,11 @@ require 'mkmf'
2
2
 
3
3
  $CXXFLAGS += " -std=c++11 "
4
4
  $CXXFLAGS += " -g -Og -ggdb "
5
+ $CFLAGS += " -g -Og -ggdb "
6
+
7
+ if ENV['DEBUG']
8
+ $CXXFLAGS += " -DDEBUG "
9
+ $CFLAGS += " -DDEBUG "
10
+ end
5
11
 
6
12
  create_makefile('html_tokenizer_ext')
@@ -14,6 +14,7 @@ static void parser_free(void *ptr)
14
14
  size_t i;
15
15
 
16
16
  if(parser) {
17
+ tokenizer_free_members(&parser->tk);
17
18
  if(parser->doc.data) {
18
19
  DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
19
20
  xfree(parser->doc.data);
@@ -543,12 +544,12 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
543
544
  }
544
545
  else {
545
546
  parser->tk.scan.cursor = cursor;
546
- parser->tk.scan.string = parser->doc.data;
547
- parser->tk.scan.length = parser->doc.length;
547
+ tokenizer_set_scan_string(&parser->tk, parser->doc.data, parser->doc.length);
548
548
  parser->tk.scan.enc_index = parser->doc.enc_index;
549
549
  parser->tk.scan.mb_cursor = mb_cursor;
550
550
 
551
551
  tokenizer_scan_all(&parser->tk);
552
+ tokenizer_free_scan_string(&parser->tk);
552
553
  }
553
554
 
554
555
  return Qtrue;
@@ -12,16 +12,7 @@ static void tokenizer_free(void *ptr)
12
12
  {
13
13
  struct tokenizer_t *tk = ptr;
14
14
  if(tk) {
15
- if(tk->current_tag) {
16
- DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
17
- xfree(tk->current_tag);
18
- tk->current_tag = NULL;
19
- }
20
- if(tk->scan.string) {
21
- DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
22
- xfree(tk->scan.string);
23
- tk->scan.string = NULL;
24
- }
15
+ tokenizer_free_members(tk);
25
16
  DBG_PRINT("tk=%p xfree(tk)", tk);
26
17
  xfree(tk);
27
18
  }
@@ -75,6 +66,21 @@ void tokenizer_init(struct tokenizer_t *tk)
75
66
  return;
76
67
  }
77
68
 
69
+ void tokenizer_free_members(struct tokenizer_t *tk)
70
+ {
71
+ if(tk->current_tag) {
72
+ DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
73
+ xfree(tk->current_tag);
74
+ tk->current_tag = NULL;
75
+ }
76
+ if(tk->scan.string) {
77
+ DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
78
+ xfree(tk->scan.string);
79
+ tk->scan.string = NULL;
80
+ }
81
+ return;
82
+ }
83
+
78
84
  VALUE token_type_to_symbol(enum token_type type)
79
85
  {
80
86
  switch(type) {
@@ -656,11 +662,30 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
656
662
  return;
657
663
  }
658
664
 
665
+ void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
666
+ {
667
+ const char *old = tk->scan.string;
668
+ REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
669
+ DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
670
+ tk->scan.string, length + 1);
671
+ if(string && length > 0) {
672
+ strncpy(tk->scan.string, string, length);
673
+ tk->scan.string[length] = 0;
674
+ }
675
+ tk->scan.length = length;
676
+ return;
677
+ }
678
+
679
+ void tokenizer_free_scan_string(struct tokenizer_t *tk)
680
+ {
681
+ tokenizer_set_scan_string(tk, NULL, 0);
682
+ return;
683
+ }
684
+
659
685
  static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
660
686
  {
661
687
  struct tokenizer_t *tk = NULL;
662
688
  char *c_source;
663
- char *old;
664
689
 
665
690
  if(NIL_P(source))
666
691
  return Qnil;
@@ -670,21 +695,13 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
670
695
 
671
696
  c_source = StringValueCStr(source);
672
697
  tk->scan.cursor = 0;
673
- tk->scan.length = strlen(c_source);
698
+ tokenizer_set_scan_string(tk, c_source, strlen(c_source));
674
699
  tk->scan.enc_index = rb_enc_get_index(source);
675
700
  tk->scan.mb_cursor = 0;
676
701
 
677
- old = tk->scan.string;
678
- REALLOC_N(tk->scan.string, char, tk->scan.length+1);
679
- DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
680
- tk->scan.string, tk->scan.length+1);
681
- strncpy(tk->scan.string, c_source, tk->scan.length);
682
-
683
702
  tokenizer_scan_all(tk);
684
703
 
685
- DBG_PRINT("tk=%p xfree(tk->scan.string) 0x%p", tk, tk->scan.string);
686
- xfree(tk->scan.string);
687
- tk->scan.string = NULL;
704
+ tokenizer_free_scan_string(tk);
688
705
 
689
706
  return Qtrue;
690
707
  }
@@ -70,6 +70,9 @@ struct tokenizer_t
70
70
 
71
71
  void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
72
72
  void tokenizer_init(struct tokenizer_t *tk);
73
+ void tokenizer_free_members(struct tokenizer_t *tk);
74
+ void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length);
75
+ void tokenizer_free_scan_string(struct tokenizer_t *tk);
73
76
  void tokenizer_scan_all(struct tokenizer_t *tk);
74
77
  VALUE token_type_to_symbol(enum token_type type);
75
78
 
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "html_tokenizer"
3
- spec.version = "0.0.3"
3
+ spec.version = "0.0.5"
4
4
  spec.summary = "HTML Tokenizer"
5
5
  spec.author = "Francois Chagnon"
6
6
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francois Chagnon
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-11-21 00:00:00.000000000 Z
11
+ date: 2017-11-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake