wordtriez 0.1.1 → 0.1.2
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/hat-trie/text.c +7 -12
- data/ext/hat-trie/text.h +2 -2
- data/ext/wordtriez.cc +15 -7
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2cf93aec498957ae648674251b2051ab4a08381c
|
4
|
+
data.tar.gz: 8fe39195a015704d6afee86762d304f84d485456
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 23f5c58246dec87b35d3c11476c4e8d82c6fdc013b1f534eaa8dc47881608c74a33b002dae5a2d744a223b38de59233eddf50fef8ac239a07f9fff64d03559f3
|
7
|
+
data.tar.gz: 1033906d59479f13f251af341905315ae637e34663cee293a785614299baf0e883715d9d61d0bab3b3e2f68e945e298c50b2861f6191c2918fd6cf308605218d
|
data/ext/hat-trie/text.c
CHANGED
@@ -20,7 +20,7 @@ size_t text_clean(char* text)
|
|
20
20
|
char* read;
|
21
21
|
char* write = text;
|
22
22
|
uint8_t join_lines = false,
|
23
|
-
just_added_space = false,
|
23
|
+
just_added_space = true, // prevent prefix spaces
|
24
24
|
just_added_period = false;
|
25
25
|
for (read=text; *read; read++) {
|
26
26
|
char c = *read;
|
@@ -60,17 +60,15 @@ size_t text_clean(char* text)
|
|
60
60
|
}
|
61
61
|
// erase space at end of text
|
62
62
|
if (just_added_space) write--;
|
63
|
-
// terminate the string at its new length
|
64
|
-
*write = '\0';
|
65
63
|
|
66
64
|
// Return the new length of the string
|
67
65
|
return (size_t)(write - text);
|
68
66
|
}
|
69
67
|
|
70
|
-
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only)
|
68
|
+
void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only)
|
71
69
|
{
|
72
70
|
char blank_suffix[] = "\0";
|
73
|
-
add_ngrams_with_suffix(trie, upto_n, text, blank_suffix, incr_existing_keys_only);
|
71
|
+
add_ngrams_with_suffix(trie, upto_n, text, text_len, blank_suffix, incr_existing_keys_only);
|
74
72
|
}
|
75
73
|
|
76
74
|
static inline void incr_value(
|
@@ -101,7 +99,7 @@ static inline void incr_value(
|
|
101
99
|
|
102
100
|
}
|
103
101
|
|
104
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only)
|
102
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only)
|
105
103
|
{
|
106
104
|
char* head = text;
|
107
105
|
char* tail = text;
|
@@ -109,7 +107,7 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
109
107
|
char* next_tail = text;
|
110
108
|
int word_count = 0;
|
111
109
|
|
112
|
-
if (
|
110
|
+
if (text_len == 0) return;
|
113
111
|
|
114
112
|
char buffer[NGRAM_BUFFER_SIZE];
|
115
113
|
size_t suffix_len = strlen(suffix);
|
@@ -117,11 +115,8 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
117
115
|
char* buffer_pre = buffer + buffer_offset;
|
118
116
|
strcpy(buffer_pre, suffix);
|
119
117
|
|
120
|
-
// skip any spaces at beginning
|
121
|
-
while(*head == ' ') head++;
|
122
|
-
|
123
118
|
do {
|
124
|
-
if (*tail == ' ' || *tail == '.' ||
|
119
|
+
if (*tail == ' ' || *tail == '.' || tail >= head+text_len) {
|
125
120
|
word_count++;
|
126
121
|
if (word_count == 1 || upto_n == 1) {
|
127
122
|
next_head = next_tail = tail + 1;
|
@@ -152,7 +147,7 @@ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffi
|
|
152
147
|
|
153
148
|
// add the 1..(upto_n-1) sized ngrams at the tail
|
154
149
|
if (upto_n > 1) {
|
155
|
-
while(
|
150
|
+
while(head < text+text_len) {
|
156
151
|
if(*head == ' ' || *head == '.') {
|
157
152
|
incr_value(trie, buffer, buffer_pre,
|
158
153
|
head + 1, tail - head - 1, suffix_len,
|
data/ext/hat-trie/text.h
CHANGED
@@ -12,8 +12,8 @@ extern "C" {
|
|
12
12
|
#define NGRAM_BUFFER_SIZE 4096
|
13
13
|
|
14
14
|
size_t text_clean(char* text);
|
15
|
-
void add_ngrams(hattrie_t* trie, int upto_n, char* text, uint8_t incr_existing_keys_only);
|
16
|
-
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix, uint8_t incr_existing_keys_only);
|
15
|
+
void add_ngrams(hattrie_t* trie, int upto_n, char* text, size_t text_len, uint8_t incr_existing_keys_only);
|
16
|
+
void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, size_t text_len, char* suffix, uint8_t incr_existing_keys_only);
|
17
17
|
|
18
18
|
#ifdef __cplusplus
|
19
19
|
}
|
data/ext/wordtriez.cc
CHANGED
@@ -273,26 +273,33 @@ static VALUE hat_walk(VALUE self, VALUE key) {
|
|
273
273
|
return data.arr;
|
274
274
|
}
|
275
275
|
|
276
|
+
static VALUE hat_text_clean(VALUE self, VALUE text) {
|
277
|
+
rb_str_modify(text);
|
278
|
+
|
279
|
+
char* ctext = StringValueCStr(text);
|
280
|
+
size_t new_length = text_clean(ctext);
|
281
|
+
|
282
|
+
rb_str_set_len(text, (long)new_length);
|
283
|
+
|
284
|
+
return text;
|
285
|
+
}
|
286
|
+
|
276
287
|
static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix, VALUE incr_existing_keys_only) {
|
277
|
-
// rb_str_dup
|
278
288
|
hattrie_t* p;
|
279
289
|
HatTrie* ht;
|
280
290
|
Data_Get_Struct(self, HatTrie, ht);
|
281
291
|
p = ht->p;
|
282
292
|
|
283
|
-
|
284
|
-
size_t new_length = text_clean(ctext);
|
293
|
+
hat_text_clean(self, text);
|
285
294
|
|
286
295
|
add_ngrams_with_suffix(p,
|
287
296
|
FIX2INT(ngrams),
|
288
|
-
|
297
|
+
RSTRING_PTR(text),
|
298
|
+
RSTRING_LEN(text),
|
289
299
|
StringValueCStr(suffix),
|
290
300
|
RTEST(incr_existing_keys_only));
|
291
301
|
|
292
|
-
rb_str_resize(text, (long)new_length);
|
293
|
-
|
294
302
|
return self;
|
295
|
-
// rb_str_substr
|
296
303
|
}
|
297
304
|
|
298
305
|
#define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
|
@@ -316,4 +323,5 @@ void Init_wordtriez() {
|
|
316
323
|
DEF(hat_class, "_internal_search", hat_search, 4);
|
317
324
|
DEF(hat_class, "_internal_walk", hat_walk, 1);
|
318
325
|
DEF(hat_class, "_internal_add_text", hat_add_text, 4);
|
326
|
+
DEF(hat_class, "text_clean", hat_text_clean, 1);
|
319
327
|
}
|