html_tokenizer 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +2 -2
- data/ext/html_tokenizer_ext/extconf.rb +6 -0
- data/ext/html_tokenizer_ext/parser.c +3 -2
- data/ext/html_tokenizer_ext/tokenizer.c +38 -21
- data/ext/html_tokenizer_ext/tokenizer.h +3 -0
- data/html_tokenizer.gemspec +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a45a481e7310c22092c48de49f62315d7ae19700
|
4
|
+
data.tar.gz: fda14319ce3a8b6770c074b68384e3c7e16d3fa7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b55f2ae076aa4cbf3b55fca42435bcfa63ec4e9e032a91ad55260cf0b6cc74ae75b484b0c7ad70cb08a6bf3fe75d9499feec9df768f85fdac2fdfe29cc8262ca
|
7
|
+
data.tar.gz: 8e3726c0471524c66270d8ed187a807b5c6bf906312e754c69e0e4da915db46ee28729f0400347413ddaf147c48a21fd4d628454e01f09eec2a104ace6452d17
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
data/ext/html_tokenizer_ext/parser.c
CHANGED
@@ -14,6 +14,7 @@ static void parser_free(void *ptr)
|
|
14
14
|
size_t i;
|
15
15
|
|
16
16
|
if(parser) {
|
17
|
+
tokenizer_free_members(&parser->tk);
|
17
18
|
if(parser->doc.data) {
|
18
19
|
DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
|
19
20
|
xfree(parser->doc.data);
|
@@ -543,12 +544,12 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
|
|
543
544
|
}
|
544
545
|
else {
|
545
546
|
parser->tk.scan.cursor = cursor;
|
546
|
-
parser->tk.scan.string = parser->doc.data;
|
547
|
-
parser->tk.scan.length = parser->doc.length;
|
547
|
+
tokenizer_set_scan_string(&parser->tk, parser->doc.data, parser->doc.length);
|
548
548
|
parser->tk.scan.enc_index = parser->doc.enc_index;
|
549
549
|
parser->tk.scan.mb_cursor = mb_cursor;
|
550
550
|
|
551
551
|
tokenizer_scan_all(&parser->tk);
|
552
|
+
tokenizer_free_scan_string(&parser->tk);
|
552
553
|
}
|
553
554
|
|
554
555
|
return Qtrue;
|
data/ext/html_tokenizer_ext/tokenizer.c
CHANGED
@@ -12,16 +12,7 @@ static void tokenizer_free(void *ptr)
|
|
12
12
|
{
|
13
13
|
struct tokenizer_t *tk = ptr;
|
14
14
|
if(tk) {
|
15
|
-
|
16
|
-
DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
|
17
|
-
xfree(tk->current_tag);
|
18
|
-
tk->current_tag = NULL;
|
19
|
-
}
|
20
|
-
if(tk->scan.string) {
|
21
|
-
DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
|
22
|
-
xfree(tk->scan.string);
|
23
|
-
tk->scan.string = NULL;
|
24
|
-
}
|
15
|
+
tokenizer_free_members(tk);
|
25
16
|
DBG_PRINT("tk=%p xfree(tk)", tk);
|
26
17
|
xfree(tk);
|
27
18
|
}
|
@@ -75,6 +66,21 @@ void tokenizer_init(struct tokenizer_t *tk)
|
|
75
66
|
return;
|
76
67
|
}
|
77
68
|
|
69
|
+
void tokenizer_free_members(struct tokenizer_t *tk)
|
70
|
+
{
|
71
|
+
if(tk->current_tag) {
|
72
|
+
DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
|
73
|
+
xfree(tk->current_tag);
|
74
|
+
tk->current_tag = NULL;
|
75
|
+
}
|
76
|
+
if(tk->scan.string) {
|
77
|
+
DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
|
78
|
+
xfree(tk->scan.string);
|
79
|
+
tk->scan.string = NULL;
|
80
|
+
}
|
81
|
+
return;
|
82
|
+
}
|
83
|
+
|
78
84
|
VALUE token_type_to_symbol(enum token_type type)
|
79
85
|
{
|
80
86
|
switch(type) {
|
@@ -656,11 +662,30 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
|
|
656
662
|
return;
|
657
663
|
}
|
658
664
|
|
665
|
+
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
|
666
|
+
{
|
667
|
+
const char *old = tk->scan.string;
|
668
|
+
REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
|
669
|
+
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
670
|
+
tk->scan.string, length + 1);
|
671
|
+
if(string && length > 0) {
|
672
|
+
strncpy(tk->scan.string, string, length);
|
673
|
+
tk->scan.string[length] = 0;
|
674
|
+
}
|
675
|
+
tk->scan.length = length;
|
676
|
+
return;
|
677
|
+
}
|
678
|
+
|
679
|
+
void tokenizer_free_scan_string(struct tokenizer_t *tk)
|
680
|
+
{
|
681
|
+
tokenizer_set_scan_string(tk, NULL, 0);
|
682
|
+
return;
|
683
|
+
}
|
684
|
+
|
659
685
|
static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
|
660
686
|
{
|
661
687
|
struct tokenizer_t *tk = NULL;
|
662
688
|
char *c_source;
|
663
|
-
char *old;
|
664
689
|
|
665
690
|
if(NIL_P(source))
|
666
691
|
return Qnil;
|
@@ -670,21 +695,13 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
|
|
670
695
|
|
671
696
|
c_source = StringValueCStr(source);
|
672
697
|
tk->scan.cursor = 0;
|
673
|
-
tk->scan.length = strlen(c_source);
|
698
|
+
tokenizer_set_scan_string(tk, c_source, strlen(c_source));
|
674
699
|
tk->scan.enc_index = rb_enc_get_index(source);
|
675
700
|
tk->scan.mb_cursor = 0;
|
676
701
|
|
677
|
-
old = tk->scan.string;
|
678
|
-
REALLOC_N(tk->scan.string, char, tk->scan.length+1);
|
679
|
-
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
680
|
-
tk->scan.string, tk->scan.length+1);
|
681
|
-
strncpy(tk->scan.string, c_source, tk->scan.length);
|
682
|
-
|
683
702
|
tokenizer_scan_all(tk);
|
684
703
|
|
685
|
-
|
686
|
-
xfree(tk->scan.string);
|
687
|
-
tk->scan.string = NULL;
|
704
|
+
tokenizer_free_scan_string(tk);
|
688
705
|
|
689
706
|
return Qtrue;
|
690
707
|
}
|
data/ext/html_tokenizer_ext/tokenizer.h
CHANGED
@@ -70,6 +70,9 @@ struct tokenizer_t
|
|
70
70
|
|
71
71
|
void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
|
72
72
|
void tokenizer_init(struct tokenizer_t *tk);
|
73
|
+
void tokenizer_free_members(struct tokenizer_t *tk);
|
74
|
+
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length);
|
75
|
+
void tokenizer_free_scan_string(struct tokenizer_t *tk);
|
73
76
|
void tokenizer_scan_all(struct tokenizer_t *tk);
|
74
77
|
VALUE token_type_to_symbol(enum token_type type);
|
75
78
|
|
data/html_tokenizer.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francois Chagnon
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-11-
|
11
|
+
date: 2017-11-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|