wordtriez 0.1.3 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/hat-trie/text.c +6 -7
- data/ext/hat-trie/text.h +2 -2
- data/ext/wordtriez.cc +3 -3
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1166b687e599b75c7c6423e177e661e6e01557fc
|
4
|
+
data.tar.gz: 7157ecc077b1be637ceb067bae8e170acef367bf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a674980638b76c5ad7893897d945986b56634b49055b807cded3bc367a05917a207f7ea92a731692a74a4335580b758ac31398466654b01a3a3e291fe13fea1c
|
7
|
+
data.tar.gz: 4c113c405f97f8033a11e0f501c4e3043466ade43717dc762347c5283d5064a00096b1b27f578940ee4a97978a72977a79651755a81c3342c2c5b6059b31cb2c
|
data/ext/hat-trie/text.c
CHANGED
@@ -15,16 +15,16 @@
|
|
15
15
|
*
|
16
16
|
* Spaces indicate word boundaries, while periods indicate sentence boundaries.
|
17
17
|
*/
|
18
|
-
size_t text_clean(char* text)
|
18
|
+
size_t text_clean(char* text, size_t len)
|
19
19
|
{
|
20
|
-
if (
|
20
|
+
if (len == 0) return 0;
|
21
21
|
|
22
22
|
char* read;
|
23
23
|
char* write = text;
|
24
24
|
uint8_t join_lines = false,
|
25
25
|
just_added_space = true, // prevent prefix spaces
|
26
26
|
just_added_period = false;
|
27
|
-
for (read=text;
|
27
|
+
for (read=text; read<text+len; read++) {
|
28
28
|
char c = *read;
|
29
29
|
if (c >= 'A' && c <= 'Z') {
|
30
30
|
// Change upper case to lowercase
|
@@ -70,7 +70,7 @@ size_t text_clean(char* text)
|
|
70
70
|
void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only)
|
71
71
|
{
|
72
72
|
char blank_suffix[] = "\0";
|
73
|
-
add_ngrams_with_suffix(trie, upto_n, text, text_len, blank_suffix, incr_existing_keys_only);
|
73
|
+
add_ngrams_with_suffix(trie, upto_n, text, text_len, blank_suffix, 0, incr_existing_keys_only);
|
74
74
|
}
|
75
75
|
|
76
76
|
static inline void incr_value(
|
@@ -101,7 +101,7 @@ static inline void incr_value(
|
|
101
101
|
|
102
102
|
}
|
103
103
|
|
104
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only)
|
104
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, size_t suffix_len, uint8_t incr_existing_keys_only)
|
105
105
|
{
|
106
106
|
char* head = text;
|
107
107
|
char* tail = text;
|
@@ -112,10 +112,9 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text
|
|
112
112
|
if (text_len == 0) return;
|
113
113
|
|
114
114
|
char buffer[NGRAM_BUFFER_SIZE];
|
115
|
-
size_t suffix_len = strlen(suffix);
|
116
115
|
size_t buffer_offset = NGRAM_BUFFER_SIZE - suffix_len - 1;
|
117
116
|
char* buffer_pre = buffer + buffer_offset;
|
118
|
-
|
117
|
+
memcpy(buffer_pre, suffix, suffix_len);
|
119
118
|
|
120
119
|
do {
|
121
120
|
if (*tail == ' ' || *tail == '.' || tail >= head+text_len) {
|
data/ext/hat-trie/text.h
CHANGED
@@ -11,9 +11,9 @@ extern "C" {
|
|
11
11
|
|
12
12
|
#define NGRAM_BUFFER_SIZE 4096
|
13
13
|
|
14
|
-
size_t text_clean(char* text);
|
14
|
+
size_t text_clean(char* text, size_t len);
|
15
15
|
void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only);
|
16
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only);
|
16
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, size_t suffix_len, uint8_t incr_existing_keys_only);
|
17
17
|
|
18
18
|
#ifdef __cplusplus
|
19
19
|
}
|
data/ext/wordtriez.cc
CHANGED
@@ -276,8 +276,7 @@ static VALUE hat_walk(VALUE self, VALUE key) {
|
|
276
276
|
static VALUE hat_text_clean(VALUE self, VALUE text) {
|
277
277
|
rb_str_modify(text);
|
278
278
|
|
279
|
-
|
280
|
-
size_t new_length = text_clean(ctext);
|
279
|
+
size_t new_length = text_clean(RSTRING_PTR(text), RSTRING_LEN(text));
|
281
280
|
|
282
281
|
rb_str_set_len(text, (long)new_length);
|
283
282
|
|
@@ -296,7 +295,8 @@ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VA
|
|
296
295
|
FIX2INT(ngrams),
|
297
296
|
RSTRING_PTR(text),
|
298
297
|
RSTRING_LEN(text),
|
299
|
-
|
298
|
+
RSTRING_PTR(suffix),
|
299
|
+
RSTRING_LEN(suffix),
|
300
300
|
RTEST(incr_existing_keys_only));
|
301
301
|
|
302
302
|
return self;
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wordtriez
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Zete Lui
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-09-
|
12
|
+
date: 2014-09-28 00:00:00.000000000 Z
|
13
13
|
dependencies: []
|
14
14
|
description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
|
15
15
|
email:
|