wordtriez 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ #include "text.h"
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+ #include <assert.h>
5
+
6
+ /* Chris' C Code Version of the above (self.clean_text)**
7
+
8
+ * Credit: "most efficient way to remove special characters from string" By Guffa
9
+ * http://stackoverflow.com/questions/1120198/most-efficient-way-to-remove-special-characters-from-string
10
+ *
11
+ * How fast is this code?
12
+ *
13
+ * Regular expression: 294.4 ms.
14
+ * Original function: 54.5 ms.
15
+ * My suggested change: 47.1 ms.
16
+ * Mine with setting StringBuilder capacity: 43.3 ms.
17
+ * I tested the lookup+char[] solution, and it runs in about 13 ms.
18
+ */
19
+
20
+ /*
21
+ private static bool[] _lookup;
22
+ static Program() {
23
+ _lookup = new bool[65535];
24
+ for (char c = '0'; c <= '9'; c++) _lookup[c] = true;
25
+ for (char c = 'A'; c <= 'Z'; c++) _lookup[c] = true;
26
+ for (char c = 'a'; c <= 'z'; c++) _lookup[c] = true;
27
+ _lookup['.'] = true;
28
+ _lookup['_'] = true;
29
+ }
30
+ public static string RemoveSpecialCharacters(string str) {
31
+ char[] buffer = new char[str.Length];
32
+ int index = 0;
33
+ foreach (char c in str) {
34
+ if (_lookup[c]) {
35
+ buffer[index] = c;
36
+ index++;
37
+ }
38
+ }
39
+ return new string(buffer, 0, index);
40
+ }
41
+ */
42
+
43
/** Rewrites `text` in place as a single line of lowercase words.
 *
 * For example, the following input:
 *
 *   And behold, I said, "This is no good!"
 *   What shall ye say unto these people, there-
 *   fore?
 *
 * becomes:
 *
 *   and behold i said this is no good.what shall ye say unto these people therefore.
 *
 * Spaces indicate word boundaries, while periods indicate sentence boundaries.
 *
 * Rules applied to each input byte:
 *  - 'A'..'Z' are lowercased; 'a'..'z' are kept;
 *  - newline, carriage return and tab count as a space;
 *  - '?' and '!' count as a period;
 *  - runs of spaces or periods collapse to one, a space directly before or
 *    after a period is dropped, and the result never starts or ends with a
 *    space;
 *  - '-' is dropped together with any whitespace that follows it, so a word
 *    hyphenated across a line break is joined back together;
 *  - every other byte is dropped.
 *
 * The output is never longer than the input, so the caller's buffer is reused.
 *
 * @param text NUL-terminated, writable string; overwritten with the result.
 */
void text_clean(char* text)
{
  char* write = text;
  bool join_lines = false;
  bool just_added_space = false;
  bool just_added_period = false;

  for (char* read = text; *read; read++) {
    char c = *read;

    // First normalise the character class...
    if (c >= 'A' && c <= 'Z') {
      c += 'a' - 'A';
    } else if (c == '\n' || c == '\r' || c == '\t') {
      // all of these are whitespace, same as a plain space
      c = ' ';
    } else if (c == '?' || c == '!') {
      // sentence boundaries are all written as periods
      c = '.';
    }

    // ...then decide whether it is written out.
    if (c == '-') {
      join_lines = true;
    } else if (join_lines && c == ' ') {
      // ignore whitespace after a dash (most commonly the line break that
      // follows a word split by syllables)
    } else if (c == '.' && !just_added_period) {
      // erase space before period
      if (just_added_space) write--;
      *write++ = '.';
      just_added_period = true;
      just_added_space = false;
      join_lines = false;
    } else if (c == ' ' && !just_added_space && !just_added_period
               && write != text) {
      // `write != text` keeps leading whitespace out of the result
      *write++ = ' ';
      just_added_space = true;
      just_added_period = false;
    } else if (c >= 'a' && c <= 'z') {
      *write++ = c;
      just_added_space = false;
      just_added_period = false;
      join_lines = false;
    }
  }

  // erase space at end of text
  if (just_added_space) write--;
  // terminate the string at its new length
  *write = '\0';
}
103
+
104
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text)
105
+ {
106
+ char blank_suffix[] = "\0";
107
+ add_ngrams_with_suffix(trie, upto_n, text, blank_suffix);
108
+ }
109
+
110
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix)
111
+ {
112
+ char* head = text;
113
+ char* tail = text;
114
+ char* next_head = text;
115
+ char* next_tail = text;
116
+ int word_count = 0;
117
+ value_t* value = NULL;
118
+ size_t len = 0;
119
+
120
+ if (*text == '\0') return;
121
+
122
+ char buffer[NGRAM_BUFFER_SIZE];
123
+ size_t suffix_len = strlen(suffix);
124
+ size_t buffer_offset = NGRAM_BUFFER_SIZE - suffix_len - 1;
125
+ char* buffer_pre = buffer + buffer_offset;
126
+ strcpy(buffer_pre, suffix);
127
+
128
+ do {
129
+ if (*tail == ' ' || *tail == '.' || *tail == '\0') {
130
+ word_count++;
131
+ if (word_count == 1 || upto_n == 1) {
132
+ next_head = next_tail = tail + 1;
133
+ } else if (word_count == 2) {
134
+ next_tail = tail;
135
+ }
136
+ if (word_count <= upto_n) {
137
+ len = tail - head;
138
+ assert(buffer_pre - len >= buffer);
139
+ memcpy(buffer_pre - len, head, len);
140
+ value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
141
+ (*value)++;
142
+ }
143
+ if (word_count == upto_n) {
144
+ head = next_head;
145
+ tail = next_tail;
146
+ word_count = 0;
147
+ } else {
148
+ tail++;
149
+ }
150
+ } else {
151
+ tail++;
152
+ }
153
+ } while(*tail);
154
+
155
+ // add the last ngram of size upto_n
156
+ len = tail - head;
157
+ assert(buffer_pre - len >= buffer);
158
+ memcpy(buffer_pre - len, head, len);
159
+ value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
160
+ (*value)++;
161
+
162
+ if (upto_n > 1) {
163
+ while(*head) {
164
+ if(*head == ' ' || *head == '.') {
165
+ len = tail - head - 1;
166
+ assert(buffer_pre - len >= buffer);
167
+ memcpy(buffer_pre - len, head + 1, len);
168
+ value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
169
+ (*value)++;
170
+ }
171
+ head++;
172
+ }
173
+ }
174
+ }
@@ -0,0 +1,22 @@
1
+ #ifndef TEXT_H
2
+ #define TEXT_H
3
+
4
+ #include <stdbool.h>
5
+ #include "pstdint.h"
6
+ #include "hat-trie.h"
7
+
8
+ #ifdef __cplusplus
9
+ extern "C" {
10
+ #endif
11
+
12
+ #define NGRAM_BUFFER_SIZE 4096
13
+
14
+ void text_clean(char* text);
15
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text);
16
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix);
17
+
18
+ #ifdef __cplusplus
19
+ }
20
+ #endif
21
+
22
+ #endif
data/ext/triez.cc ADDED
@@ -0,0 +1,313 @@
1
+ #include <hat-trie.h>
2
+ #include <text.h>
3
+ #include <ruby.h>
4
+ #include <ruby/encoding.h>
5
+
6
+ // for rubinius
7
+ #ifndef rb_enc_fast_mbclen
8
+ # define rb_enc_fast_mbclen rb_enc_mbclen
9
+ #endif
10
+
11
+ static VALUE hat_class;
12
+ static rb_encoding* u8_enc;
13
+ static rb_encoding* bin_enc;
14
+
15
+ static inline VALUE unify_key(VALUE key) {
16
+ rb_encoding* enc = rb_enc_get(key);
17
+ if (enc != u8_enc && enc != bin_enc) {
18
+ return rb_funcall(key, rb_intern("encode"), 1, rb_enc_from_encoding(u8_enc));
19
+ } else {
20
+ return key;
21
+ }
22
+ }
23
+
24
+ static inline long long V2LL(VALUE v) {
25
+ union {VALUE v; long long l;} u;
26
+ u.v = v;
27
+ return u.l;
28
+ }
29
+
30
+ static inline VALUE LL2V(long long l) {
31
+ union {VALUE v; long long l;} u;
32
+ u.l = l;
33
+ return u.v;
34
+ }
35
+
36
+ struct HatTrie {
37
+ hattrie_t* p;
38
+ VALUE default_value;
39
+ bool obj_value;
40
+ bool initialized;
41
+
42
+ HatTrie() : default_value(Qnil), obj_value(false), initialized(false) {
43
+ p = hattrie_create();
44
+ }
45
+
46
+ ~HatTrie() {
47
+ hattrie_free(p);
48
+ }
49
+ };
50
+
51
+ static void hat_mark(void* p_ht) {
52
+ HatTrie* ht = (HatTrie*)p_ht;
53
+ if (!IMMEDIATE_P(ht->default_value)) {
54
+ rb_gc_mark(ht->default_value);
55
+ }
56
+ if (!ht->obj_value) {
57
+ return;
58
+ }
59
+ hattrie_t* p = ht->p;
60
+ hattrie_iter_t* it = hattrie_iter_begin(p, false);
61
+ while (!hattrie_iter_finished(it)) {
62
+ value_t* v = hattrie_iter_val(it);
63
+ if (!IMMEDIATE_P(*v)) {
64
+ rb_gc_mark(*v);
65
+ }
66
+ hattrie_iter_next(it);
67
+ }
68
+ hattrie_iter_free(it);
69
+ }
70
+
71
+ static void hat_free(void* p) {
72
+ delete (HatTrie*)p;
73
+ }
74
+
75
+ static VALUE hat_alloc(VALUE self) {
76
+ HatTrie* ht = new HatTrie();
77
+ return Data_Wrap_Struct(hat_class, hat_mark, hat_free, ht);
78
+ }
79
+
80
// Common prologue for methods that take (self, key). Expects `self` and `key`
// to be in scope; declares `ht` (the wrapped HatTrie) and `p` (its trie),
// type-checks `key` as T_STRING and replaces it with unify_key(key).
#define PRE_HAT \
    HatTrie* ht; \
    Data_Get_Struct(self, HatTrie, ht); \
    hattrie_t* p = ht->p; \
    Check_Type(key, T_STRING); \
    key = unify_key(key);
87
+
88
+ static VALUE hat_set_type(VALUE self, VALUE obj_value, VALUE default_value) {
89
+ HatTrie* ht;
90
+ Data_Get_Struct(self, HatTrie, ht);
91
+ if (ht->initialized) {
92
+ rb_raise(rb_eRuntimeError, "Already initialized");
93
+ return self;
94
+ }
95
+ ht->default_value = default_value;
96
+ ht->obj_value = RTEST(obj_value);
97
+ ht->initialized = true;
98
+ return self;
99
+ }
100
+
101
+ static VALUE hat_value_type(VALUE self) {
102
+ HatTrie* ht;
103
+ Data_Get_Struct(self, HatTrie, ht);
104
+ return ht->obj_value ? ID2SYM(rb_intern("object")) : ID2SYM(rb_intern("int64"));
105
+ }
106
+
107
+ static VALUE hat_size(VALUE self) {
108
+ HatTrie* ht;
109
+ Data_Get_Struct(self, HatTrie, ht);
110
+ return ULL2NUM(hattrie_size(ht->p));
111
+ }
112
+
113
+ static VALUE hat_set(VALUE self, VALUE key, VALUE value) {
114
+ PRE_HAT;
115
+ long long v = ht->obj_value ? value : NUM2LL(value);
116
+ char* s = RSTRING_PTR(key);
117
+ size_t len = RSTRING_LEN(key);
118
+ hattrie_get(p, s, len)[0] = v;
119
+ return self;
120
+ }
121
+
122
+ static inline void hat_change(HatTrie* ht, hattrie_t* p, char* s, size_t len) {
123
+ // NOTE must use 2-step change, because the block may change the trie
124
+ value_t* vp = hattrie_tryget(p, s, len);
125
+ long long v;
126
+ if (ht->obj_value) {
127
+ VALUE value = vp ? LL2V(vp[0]) : ht->default_value;
128
+ v = V2LL(rb_yield(value));
129
+ } else {
130
+ VALUE value = vp ? LL2NUM(vp[0]) : ht->default_value;
131
+ v = NUM2LL(rb_yield(value));
132
+ }
133
+ hattrie_get(p, s, len)[0] = v;
134
+ }
135
+
136
+ static inline void hat_change_prefix(HatTrie* ht, hattrie_t* p, char* s, size_t len, char* rs) {
137
+ char* rs_end = rs + len;
138
+ long n;
139
+ for (; rs < rs_end; rs += n, len -= n) {
140
+ hat_change(ht, p, s, len);
141
+ // no need check encoding because reverse succeeded
142
+ n = rb_enc_fast_mbclen(rs, rs_end, u8_enc);
143
+ }
144
+ }
145
+
146
+ static VALUE hat_change_all(VALUE self, VALUE type, VALUE key) {
147
+ PRE_HAT;
148
+ char* s = RSTRING_PTR(key);
149
+ size_t len = RSTRING_LEN(key);
150
+ ID ty = SYM2ID(type);
151
+ if (ty == rb_intern("suffix")) {
152
+ char* s_end = s + len;
153
+ long n;
154
+ for (; s < s_end; s += n, len -= n) {
155
+ hat_change(ht, p, s, len);
156
+ n = rb_enc_mbclen(s, s_end, u8_enc);
157
+ }
158
+ } else if (ty == rb_intern("prefix")) {
159
+ volatile VALUE reversed = rb_funcall(key, rb_intern("reverse"), 0);
160
+ hat_change_prefix(ht, p, s, len, RSTRING_PTR(reversed));
161
+ } else if (ty == rb_intern("substring")) {
162
+ volatile VALUE reversed = rb_funcall(key, rb_intern("reverse"), 0);
163
+ char* rs = RSTRING_PTR(reversed);
164
+ char* s_end = s + len;
165
+ long n;
166
+ for (; s < s_end; s += n, len -= n) {
167
+ hat_change_prefix(ht, p, s, len, rs);
168
+ n = rb_enc_fast_mbclen(s, s_end, u8_enc);
169
+ }
170
+ }
171
+ return self;
172
+ }
173
+
174
+ static VALUE hat_append(VALUE self, VALUE key) {
175
+ HatTrie* ht;
176
+ Data_Get_Struct(self, HatTrie, ht);
177
+ return hat_set(self, key, ht->default_value);
178
+ }
179
+
180
+ static VALUE hat_get(VALUE self, VALUE key) {
181
+ PRE_HAT;
182
+ value_t* vt = hattrie_tryget(p, RSTRING_PTR(key), RSTRING_LEN(key));
183
+ if (vt) {
184
+ return ht->obj_value ? (*vt) : LL2NUM(*vt);
185
+ } else {
186
+ return ht->default_value;
187
+ }
188
+ }
189
+
190
+ static VALUE hat_del(VALUE self, VALUE key) {
191
+ PRE_HAT;
192
+ const char* s = RSTRING_PTR(key);
193
+ size_t len = RSTRING_LEN(key);
194
+ value_t* vt = hattrie_tryget(p, s, len);
195
+ if (vt) {
196
+ hattrie_del(p, RSTRING_PTR(key), RSTRING_LEN(key));
197
+ return ht->obj_value ? (*vt) : LL2NUM(*vt);
198
+ } else {
199
+ return ht->default_value;
200
+ }
201
+ }
202
+
203
+ static VALUE hat_check(VALUE self, VALUE key) {
204
+ PRE_HAT;
205
+ value_t* vt = hattrie_tryget(p, RSTRING_PTR(key), RSTRING_LEN(key));
206
+ return vt ? Qtrue : Qfalse;
207
+ }
208
+
209
+ struct SearchCbData {
210
+ VALUE callback;
211
+ VALUE suffix;
212
+ VALUE value;
213
+ };
214
+
215
+ static VALUE hat_search_callback(VALUE data) {
216
+ SearchCbData* p = (SearchCbData*)data;
217
+ return rb_funcall(p->callback, rb_intern("call"), 2, p->suffix, p->value);
218
+ }
219
+
220
+ static VALUE hat_search(VALUE self, VALUE key, VALUE vlimit, VALUE vsort, VALUE callback) {
221
+ PRE_HAT;
222
+ long limit = 0;
223
+ if (vlimit != Qnil) {
224
+ limit = NUM2LONG(vlimit);
225
+ }
226
+
227
+ hattrie_iter_t* it = hattrie_iter_with_prefix(p, RTEST(vsort), RSTRING_PTR(key), RSTRING_LEN(key));
228
+ int error = 0;
229
+ SearchCbData data = {callback};
230
+ while (!hattrie_iter_finished(it)) {
231
+ if (vlimit != Qnil && limit-- <= 0) {
232
+ break;
233
+ }
234
+ size_t suffix_len;
235
+ const char* suffix_s = hattrie_iter_key(it, &suffix_len);
236
+ value_t* v = hattrie_iter_val(it);
237
+ data.suffix = rb_enc_str_new(suffix_s, suffix_len, u8_enc);
238
+ data.value = ht->obj_value ? (*v) : LL2NUM(*v);
239
+ rb_protect(hat_search_callback, (VALUE)&data, &error);
240
+ if (error) {
241
+ break;
242
+ }
243
+ hattrie_iter_next(it);
244
+ }
245
+ hattrie_iter_free(it);
246
+ if (error) {
247
+ rb_funcall(rb_mKernel, rb_intern("raise"), 0);
248
+ }
249
+ return self;
250
+ }
251
+
252
+ typedef struct {
253
+ bool obj_value;
254
+ VALUE arr;
255
+ } HatWalkData;
256
+
257
+ static int hat_walk_cb(const char* key, size_t len, value_t* v, void* data_p) {
258
+ HatWalkData* data = (HatWalkData*)data_p;
259
+ volatile VALUE r = rb_ary_new();
260
+ rb_ary_push(r, rb_str_new(key, len));
261
+ rb_ary_push(r, data->obj_value ? (*v) : LL2NUM(*v));
262
+ rb_ary_push(data->arr, r);
263
+ return hattrie_walk_continue;
264
+ }
265
+
266
+ static VALUE hat_walk(VALUE self, VALUE key) {
267
+ PRE_HAT;
268
+ size_t len = (size_t)RSTRING_LEN(key);
269
+ volatile HatWalkData data = {ht->obj_value, rb_ary_new()};
270
+
271
+ // to prevent leak by break/next, we have to collect the array first
272
+ hattrie_walk(p, RSTRING_PTR(key), len, (void*)&data, hat_walk_cb);
273
+ return data.arr;
274
+ }
275
+
276
+ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
277
+ // rb_str_dup
278
+ hattrie_t* p;
279
+ HatTrie* ht;
280
+ Data_Get_Struct(self, HatTrie, ht);
281
+ p = ht->p;
282
+
283
+ char* ctext = StringValueCStr(text);
284
+ text_clean(ctext);
285
+
286
+ add_ngrams_with_suffix(p, FIX2INT(ngrams), ctext, StringValueCStr(suffix));
287
+
288
+ return self;
289
+ // rb_str_substr
290
+ }
291
+
292
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
293
+
294
+ extern "C"
295
+ void Init_triez() {
296
+ hat_class = rb_define_class("Triez", rb_cObject);
297
+ u8_enc = rb_utf8_encoding();
298
+ bin_enc = rb_ascii8bit_encoding();
299
+
300
+ rb_define_alloc_func(hat_class, hat_alloc);
301
+ DEF(hat_class, "_internal_set_type", hat_set_type, 2);
302
+ DEF(hat_class, "value_type", hat_value_type, 0);
303
+ DEF(hat_class, "size", hat_size, 0);
304
+ DEF(hat_class, "[]=", hat_set, 2);
305
+ DEF(hat_class, "change_all", hat_change_all, 2);
306
+ DEF(hat_class, "<<", hat_append, 1);
307
+ DEF(hat_class, "[]", hat_get, 1);
308
+ DEF(hat_class, "has_key?", hat_check, 1);
309
+ DEF(hat_class, "delete", hat_del, 1);
310
+ DEF(hat_class, "_internal_search", hat_search, 4);
311
+ DEF(hat_class, "_internal_walk", hat_walk, 1);
312
+ DEF(hat_class, "add_text!", hat_add_text, 3);
313
+ }
data/lib/wordtriez.rb ADDED
@@ -0,0 +1,65 @@
1
+ require_relative "../ext/triez"
2
+
3
# Ruby-side API on top of the native Triez class.
class Triez
  VERSION = '1.0.4'

  # The native entry points are implementation details of the methods below.
  private :_internal_set_type
  private :_internal_search
  private :_internal_walk

  # Options:
  #   :value_type - :int64 (the default) or :object
  #   :default    - value reported for missing keys; defaults to 0 for :int64
  #                 and nil for :object. For :int64 it is converted with #to_i.
  #
  # Raises ArgumentError for an unknown value type or an unknown option.
  def initialize opts={}
    opts = opts.dup

    value_type = opts.delete :value_type
    value_type = :int64 if value_type.nil?
    unless value_type == :int64 || value_type == :object
      raise ArgumentError, "value_type should be :int64 or :object, but got #{value_type.inspect}"
    end

    default = opts.delete :default
    if default.nil?
      default = (value_type == :int64 ? 0 : nil)
    elsif value_type == :int64
      default = default.to_i
    end

    unless opts.empty?
      raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:value_type, :default] are allowed"
    end

    _internal_set_type value_type == :object, default
  end

  # Calls the block with (key, value) for every entry, requesting sorted
  # iteration. Raises ArgumentError when no block is given.
  def each &p
    raise ArgumentError, 'Need a block' unless p

    _internal_search '', nil, true, p
  end

  # Passes each [key, value] pair collected by the native walk over +s+ to the
  # block.
  def walk s, &p
    _internal_walk(s).each &p
  end

  # Visits the entries whose key starts with +prefix+.
  #
  # Options:
  #   :limit - maximum number of entries to visit (nil for no limit)
  #   :sort  - truthy to request sorted iteration
  #
  # With a block, calls it with (suffix, value) for each entry; without one,
  # returns an array of [suffix, value] pairs.
  # Raises ArgumentError for a negative limit or an unknown option.
  def search_with_prefix prefix, opts={}, &p
    opts = opts.dup

    limit = opts.delete :limit
    if !limit.nil? && limit < 0
      # 0 is accepted (it simply visits nothing), so the bound is ">= 0"
      raise ArgumentError, "Limit should be >= 0"
    end

    sort = opts.delete :sort
    unless opts.empty?
      raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:limit, :sort] are allowed"
    end

    return _internal_search(prefix, limit, sort, p) if p

    pairs = []
    _internal_search prefix, limit, sort, -> k, v { pairs << [k, v] }
    pairs
  end
end