wordtriez 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/hat-trie/text.c +7 -12
- data/ext/hat-trie/text.h +2 -2
- data/ext/wordtriez.cc +15 -7
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2cf93aec498957ae648674251b2051ab4a08381c
|
4
|
+
data.tar.gz: 8fe39195a015704d6afee86762d304f84d485456
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 23f5c58246dec87b35d3c11476c4e8d82c6fdc013b1f534eaa8dc47881608c74a33b002dae5a2d744a223b38de59233eddf50fef8ac239a07f9fff64d03559f3
|
7
|
+
data.tar.gz: 1033906d59479f13f251af341905315ae637e34663cee293a785614299baf0e883715d9d61d0bab3b3e2f68e945e298c50b2861f6191c2918fd6cf308605218d
|
data/ext/hat-trie/text.c
CHANGED
@@ -20,7 +20,7 @@ size_t text_clean(char* text)
|
|
20
20
|
char* read;
|
21
21
|
char* write = text;
|
22
22
|
uint8_t join_lines = false,
|
23
|
-
just_added_space =
|
23
|
+
just_added_space = true, // prevent prefix spaces
|
24
24
|
just_added_period = false;
|
25
25
|
for (read=text; *read; read++) {
|
26
26
|
char c = *read;
|
@@ -60,17 +60,15 @@ size_t text_clean(char* text)
|
|
60
60
|
}
|
61
61
|
// erase space at end of text
|
62
62
|
if (just_added_space) write--;
|
63
|
-
// terminate the string at its new length
|
64
|
-
*write = '\0';
|
65
63
|
|
66
64
|
// Return the new length of the string
|
67
65
|
return (size_t)(write - text);
|
68
66
|
}
|
69
67
|
|
70
|
-
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
|
68
|
+
void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only)
|
71
69
|
{
|
72
70
|
char blank_suffix[] = "\0";
|
73
|
-
add_ngrams_with_suffix(trie, upto_n, text, blank_suffix, incr_existing_keys_only);
|
71
|
+
add_ngrams_with_suffix(trie, upto_n, text, text_len, blank_suffix, incr_existing_keys_only);
|
74
72
|
}
|
75
73
|
|
76
74
|
static inline void incr_value(
|
@@ -101,7 +99,7 @@ static inline void incr_value(
|
|
101
99
|
|
102
100
|
}
|
103
101
|
|
104
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only)
|
102
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only)
|
105
103
|
{
|
106
104
|
char* head = text;
|
107
105
|
char* tail = text;
|
@@ -109,7 +107,7 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
109
107
|
char* next_tail = text;
|
110
108
|
int word_count = 0;
|
111
109
|
|
112
|
-
if (
|
110
|
+
if (text_len == 0) return;
|
113
111
|
|
114
112
|
char buffer[NGRAM_BUFFER_SIZE];
|
115
113
|
size_t suffix_len = strlen(suffix);
|
@@ -117,11 +115,8 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
117
115
|
char* buffer_pre = buffer + buffer_offset;
|
118
116
|
strcpy(buffer_pre, suffix);
|
119
117
|
|
120
|
-
// skip any spaces at beginning
|
121
|
-
while(*head == ' ') head++;
|
122
|
-
|
123
118
|
do {
|
124
|
-
if (*tail == ' ' || *tail == '.' ||
|
119
|
+
if (*tail == ' ' || *tail == '.' || tail >= head+text_len) {
|
125
120
|
word_count++;
|
126
121
|
if (word_count == 1 || upto_n == 1) {
|
127
122
|
next_head = next_tail = tail + 1;
|
@@ -152,7 +147,7 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
152
147
|
|
153
148
|
// add the 1..(upto_n-1) sized ngrams at the tail
|
154
149
|
if (upto_n > 1) {
|
155
|
-
while(
|
150
|
+
while(head < text+text_len) {
|
156
151
|
if(*head == ' ' || *head == '.') {
|
157
152
|
incr_value(trie, buffer, buffer_pre,
|
158
153
|
head + 1, tail - head - 1, suffix_len,
|
data/ext/hat-trie/text.h
CHANGED
@@ -12,8 +12,8 @@ extern "C" {
|
|
12
12
|
#define NGRAM_BUFFER_SIZE 4096
|
13
13
|
|
14
14
|
size_t text_clean(char* text);
|
15
|
-
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
|
16
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
|
15
|
+
void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only);
|
16
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only);
|
17
17
|
|
18
18
|
#ifdef __cplusplus
|
19
19
|
}
|
data/ext/wordtriez.cc
CHANGED
@@ -273,26 +273,33 @@ static VALUE hat_walk(VALUE self, VALUE key) {
|
|
273
273
|
return data.arr;
|
274
274
|
}
|
275
275
|
|
276
|
+
static VALUE hat_text_clean(VALUE self, VALUE text) {
|
277
|
+
rb_str_modify(text);
|
278
|
+
|
279
|
+
char* ctext = StringValueCStr(text);
|
280
|
+
size_t new_length = text_clean(ctext);
|
281
|
+
|
282
|
+
rb_str_set_len(text, (long)new_length);
|
283
|
+
|
284
|
+
return text;
|
285
|
+
}
|
286
|
+
|
276
287
|
static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VALUE incr_existing_keys_only) {
|
277
|
-
// rb_str_dup
|
278
288
|
hattrie_t* p;
|
279
289
|
HatTrie* ht;
|
280
290
|
Data_Get_Struct(self, HatTrie, ht);
|
281
291
|
p = ht->p;
|
282
292
|
|
283
|
-
|
284
|
-
size_t new_length = text_clean(ctext);
|
293
|
+
hat_text_clean(self, text);
|
285
294
|
|
286
295
|
add_ngrams_with_suffix(p,
|
287
296
|
FIX2INT(ngrams),
|
288
|
-
|
297
|
+
RSTRING_PTR(text),
|
298
|
+
RSTRING_LEN(text),
|
289
299
|
StringValueCStr(suffix),
|
290
300
|
RTEST(incr_existing_keys_only));
|
291
301
|
|
292
|
-
rb_str_resize(text, (long)new_length);
|
293
|
-
|
294
302
|
return self;
|
295
|
-
// rb_str_substr
|
296
303
|
}
|
297
304
|
|
298
305
|
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
@@ -316,4 +323,5 @@ void Init_wordtriez() {
|
|
316
323
|
DEF(hat_class, "_internal_search", hat_search, 4);
|
317
324
|
DEF(hat_class, "_internal_walk", hat_walk, 1);
|
318
325
|
DEF(hat_class, "_internal_add_text", hat_add_text, 4);
|
326
|
+
DEF(hat_class, "text_clean", hat_text_clean, 1);
|
319
327
|
}
|