html_tokenizer 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/Gemfile.lock +2 -2
- data/ext/html_tokenizer_ext/extconf.rb +6 -0
- data/ext/html_tokenizer_ext/parser.c +3 -2
- data/ext/html_tokenizer_ext/tokenizer.c +38 -21
- data/ext/html_tokenizer_ext/tokenizer.h +3 -0
- data/html_tokenizer.gemspec +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a45a481e7310c22092c48de49f62315d7ae19700
|
4
|
+
data.tar.gz: fda14319ce3a8b6770c074b68384e3c7e16d3fa7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b55f2ae076aa4cbf3b55fca42435bcfa63ec4e9e032a91ad55260cf0b6cc74ae75b484b0c7ad70cb08a6bf3fe75d9499feec9df768f85fdac2fdfe29cc8262ca
|
7
|
+
data.tar.gz: 8e3726c0471524c66270d8ed187a807b5c6bf906312e754c69e0e4da915db46ee28729f0400347413ddaf147c48a21fd4d628454e01f09eec2a104ace6452d17
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -14,6 +14,7 @@ static void parser_free(void *ptr)
|
|
14
14
|
size_t i;
|
15
15
|
|
16
16
|
if(parser) {
|
17
|
+
tokenizer_free_members(&parser->tk);
|
17
18
|
if(parser->doc.data) {
|
18
19
|
DBG_PRINT("parser=%p xfree(parser->doc.data) %p", parser, parser->doc.data);
|
19
20
|
xfree(parser->doc.data);
|
@@ -543,12 +544,12 @@ static VALUE parser_append_data(VALUE self, VALUE source, int is_placeholder)
|
|
543
544
|
}
|
544
545
|
else {
|
545
546
|
parser->tk.scan.cursor = cursor;
|
546
|
-
parser->tk.scan.string = parser->doc.data;
|
547
|
-
parser->tk.scan.length = parser->doc.length;
|
547
|
+
tokenizer_set_scan_string(&parser->tk, parser->doc.data, parser->doc.length);
|
548
548
|
parser->tk.scan.enc_index = parser->doc.enc_index;
|
549
549
|
parser->tk.scan.mb_cursor = mb_cursor;
|
550
550
|
|
551
551
|
tokenizer_scan_all(&parser->tk);
|
552
|
+
tokenizer_free_scan_string(&parser->tk);
|
552
553
|
}
|
553
554
|
|
554
555
|
return Qtrue;
|
@@ -12,16 +12,7 @@ static void tokenizer_free(void *ptr)
|
|
12
12
|
{
|
13
13
|
struct tokenizer_t *tk = ptr;
|
14
14
|
if(tk) {
|
15
|
-
if(tk->current_tag) {
|
16
|
-
DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
|
17
|
-
xfree(tk->current_tag);
|
18
|
-
tk->current_tag = NULL;
|
19
|
-
}
|
20
|
-
if(tk->scan.string) {
|
21
|
-
DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
|
22
|
-
xfree(tk->scan.string);
|
23
|
-
tk->scan.string = NULL;
|
24
|
-
}
|
15
|
+
tokenizer_free_members(tk);
|
25
16
|
DBG_PRINT("tk=%p xfree(tk)", tk);
|
26
17
|
xfree(tk);
|
27
18
|
}
|
@@ -75,6 +66,21 @@ void tokenizer_init(struct tokenizer_t *tk)
|
|
75
66
|
return;
|
76
67
|
}
|
77
68
|
|
69
|
+
void tokenizer_free_members(struct tokenizer_t *tk)
|
70
|
+
{
|
71
|
+
if(tk->current_tag) {
|
72
|
+
DBG_PRINT("tk=%p xfree(tk->current_tag) %p", tk, tk->current_tag);
|
73
|
+
xfree(tk->current_tag);
|
74
|
+
tk->current_tag = NULL;
|
75
|
+
}
|
76
|
+
if(tk->scan.string) {
|
77
|
+
DBG_PRINT("tk=%p xfree(tk->scan.string) %p", tk, tk->scan.string);
|
78
|
+
xfree(tk->scan.string);
|
79
|
+
tk->scan.string = NULL;
|
80
|
+
}
|
81
|
+
return;
|
82
|
+
}
|
83
|
+
|
78
84
|
VALUE token_type_to_symbol(enum token_type type)
|
79
85
|
{
|
80
86
|
switch(type) {
|
@@ -656,11 +662,30 @@ void tokenizer_scan_all(struct tokenizer_t *tk)
|
|
656
662
|
return;
|
657
663
|
}
|
658
664
|
|
665
|
+
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length)
|
666
|
+
{
|
667
|
+
const char *old = tk->scan.string;
|
668
|
+
REALLOC_N(tk->scan.string, char, string ? length + 1 : 0);
|
669
|
+
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
670
|
+
tk->scan.string, length + 1);
|
671
|
+
if(string && length > 0) {
|
672
|
+
strncpy(tk->scan.string, string, length);
|
673
|
+
tk->scan.string[length] = 0;
|
674
|
+
}
|
675
|
+
tk->scan.length = length;
|
676
|
+
return;
|
677
|
+
}
|
678
|
+
|
679
|
+
void tokenizer_free_scan_string(struct tokenizer_t *tk)
|
680
|
+
{
|
681
|
+
tokenizer_set_scan_string(tk, NULL, 0);
|
682
|
+
return;
|
683
|
+
}
|
684
|
+
|
659
685
|
static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
|
660
686
|
{
|
661
687
|
struct tokenizer_t *tk = NULL;
|
662
688
|
char *c_source;
|
663
|
-
char *old;
|
664
689
|
|
665
690
|
if(NIL_P(source))
|
666
691
|
return Qnil;
|
@@ -670,21 +695,13 @@ static VALUE tokenizer_tokenize_method(VALUE self, VALUE source)
|
|
670
695
|
|
671
696
|
c_source = StringValueCStr(source);
|
672
697
|
tk->scan.cursor = 0;
|
673
|
-
tk->scan.length = strlen(c_source);
|
698
|
+
tokenizer_set_scan_string(tk, c_source, strlen(c_source));
|
674
699
|
tk->scan.enc_index = rb_enc_get_index(source);
|
675
700
|
tk->scan.mb_cursor = 0;
|
676
701
|
|
677
|
-
old = tk->scan.string;
|
678
|
-
REALLOC_N(tk->scan.string, char, tk->scan.length+1);
|
679
|
-
DBG_PRINT("tk=%p realloc(tk->scan.string) %p -> %p length=%lu", tk, old,
|
680
|
-
tk->scan.string, tk->scan.length+1);
|
681
|
-
strncpy(tk->scan.string, c_source, tk->scan.length);
|
682
|
-
|
683
702
|
tokenizer_scan_all(tk);
|
684
703
|
|
685
|
-
|
686
|
-
xfree(tk->scan.string);
|
687
|
-
tk->scan.string = NULL;
|
704
|
+
tokenizer_free_scan_string(tk);
|
688
705
|
|
689
706
|
return Qtrue;
|
690
707
|
}
|
@@ -70,6 +70,9 @@ struct tokenizer_t
|
|
70
70
|
|
71
71
|
void Init_html_tokenizer_tokenizer(VALUE mHtmlTokenizer);
|
72
72
|
void tokenizer_init(struct tokenizer_t *tk);
|
73
|
+
void tokenizer_free_members(struct tokenizer_t *tk);
|
74
|
+
void tokenizer_set_scan_string(struct tokenizer_t *tk, const char *string, long unsigned int length);
|
75
|
+
void tokenizer_free_scan_string(struct tokenizer_t *tk);
|
73
76
|
void tokenizer_scan_all(struct tokenizer_t *tk);
|
74
77
|
VALUE token_type_to_symbol(enum token_type type);
|
75
78
|
|
data/html_tokenizer.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francois Chagnon
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-11-
|
11
|
+
date: 2017-11-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|