wordtriez 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,174 @@
1
+ #include "text.h"
2
+ #include <stdio.h>
3
+ #include <string.h>
4
+ #include <assert.h>
5
+
6
+ /* Chris' C Code Version of the above (self.clean_text)**
7
+
8
+ * Credit: "most efficient way to remove special characters from string" By Guffa
9
+ * http://stackoverflow.com/questions/1120198/most-efficient-way-to-remove-special-characters-from-string
10
+ *
11
+ * How fast is this code?
12
+ *
13
+ * Regular expression: 294.4 ms.
14
+ * Original function: 54.5 ms.
15
+ * My suggested change: 47.1 ms.
16
+ * Mine with setting StringBuilder capacity: 43.3 ms.
17
+ * I tested the lookup+char[] solution, and it runs in about 13 ms.
18
+ */
19
+
20
+ /*
21
+ private static bool[] _lookup;
22
+ static Program() {
23
+ _lookup = new bool[65535];
24
+ for (char c = '0'; c <= '9'; c++) _lookup[c] = true;
25
+ for (char c = 'A'; c <= 'Z'; c++) _lookup[c] = true;
26
+ for (char c = 'a'; c <= 'z'; c++) _lookup[c] = true;
27
+ _lookup['.'] = true;
28
+ _lookup['_'] = true;
29
+ }
30
+ public static string RemoveSpecialCharacters(string str) {
31
+ char[] buffer = new char[str.Length];
32
+ int index = 0;
33
+ foreach (char c in str) {
34
+ if (_lookup[c]) {
35
+ buffer[index] = c;
36
+ index++;
37
+ }
38
+ }
39
+ return new string(buffer, 0, index);
40
+ }
41
+ */
42
+
43
/** Normalizes `text` in place into a single line of lowercase words.
 *
 * For example:
 *
 *     And behold, I said, "This is no good!"
 *     What shall ye say unto these people, there-
 *     fore?
 *
 * becomes:
 *
 *     and behold i said this is no good.what shall ye say unto these people therefore.
 *
 * Rules applied, byte by byte:
 *   - 'A'..'Z' are lowercased; 'a'..'z' are kept.
 *   - newline, carriage return and tab count as a space.
 *   - '?' and '!' count as a period.
 *   - runs of spaces collapse to one space; runs of periods collapse to one
 *     period; a space next to a period is dropped.
 *   - '-' is dropped together with the whitespace that follows it, so a word
 *     hyphenated across a line break is joined back together.
 *   - every other byte is dropped.
 *   - the result never starts with a space or period and never ends with a
 *     space.
 *
 * Spaces indicate word boundaries, while periods indicate sentence boundaries.
 * The output is never longer than the input, so it is written over the input
 * buffer and re-terminated.
 *
 * @param text  NUL-terminated, writable buffer; rewritten in place.
 */
void text_clean(char* text)
{
    char* write = text;
    bool join_lines = false;        /* a '-' was seen; swallow following whitespace */
    bool just_added_space = false;  /* last byte written was ' ' */
    bool just_added_period = false; /* last byte written was '.' */

    for (const char* read = text; *read; read++) {
        char c = *read;

        /* Step 1: fold the input byte into one of the canonical classes. */
        if (c >= 'A' && c <= 'Z') {
            c = (char)(c - 'A' + 'a');
        } else if (c == '\n' || c == '\r' || c == '\t') {
            c = ' ';
        } else if (c == '?' || c == '!') {
            c = '.';
        }

        /* Step 2: decide what, if anything, to emit. */
        if (c == '-') {
            join_lines = true;
        } else if (join_lines && c == ' ') {
            /* whitespace after a dash is ignored (usually the line break of
             * a word split by syllables) */
        } else if (c == '.') {
            /* write != text: never open the output with a separator */
            if (!just_added_period && write != text) {
                if (just_added_space) write--; /* erase space before period */
                *write++ = '.';
                just_added_period = true;
                just_added_space = false;
                join_lines = false;
            }
        } else if (c == ' ') {
            if (!just_added_space && !just_added_period && write != text) {
                *write++ = ' ';
                just_added_space = true;
                just_added_period = false;
            }
        } else if (c >= 'a' && c <= 'z') {
            *write++ = c;
            just_added_space = false;
            just_added_period = false;
            join_lines = false;
        }
    }

    /* erase space at end of text */
    if (just_added_space) write--;
    /* terminate the string at its new length */
    *write = '\0';
}
103
+
104
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text)
105
+ {
106
+ char blank_suffix[] = "\0";
107
+ add_ngrams_with_suffix(trie, upto_n, text, blank_suffix);
108
+ }
109
+
110
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix)
111
+ {
112
+ char* head = text;
113
+ char* tail = text;
114
+ char* next_head = text;
115
+ char* next_tail = text;
116
+ int word_count = 0;
117
+ value_t* value = NULL;
118
+ size_t len = 0;
119
+
120
+ if (*text == '\0') return;
121
+
122
+ char buffer[NGRAM_BUFFER_SIZE];
123
+ size_t suffix_len = strlen(suffix);
124
+ size_t buffer_offset = NGRAM_BUFFER_SIZE - suffix_len - 1;
125
+ char* buffer_pre = buffer + buffer_offset;
126
+ strcpy(buffer_pre, suffix);
127
+
128
+ do {
129
+ if (*tail == ' ' || *tail == '.' || *tail == '\0') {
130
+ word_count++;
131
+ if (word_count == 1 || upto_n == 1) {
132
+ next_head = next_tail = tail + 1;
133
+ } else if (word_count == 2) {
134
+ next_tail = tail;
135
+ }
136
+ if (word_count <= upto_n) {
137
+ len = tail - head;
138
+ assert(buffer_pre - len >= buffer);
139
+ memcpy(buffer_pre - len, head, len);
140
+ value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
141
+ (*value)++;
142
+ }
143
+ if (word_count == upto_n) {
144
+ head = next_head;
145
+ tail = next_tail;
146
+ word_count = 0;
147
+ } else {
148
+ tail++;
149
+ }
150
+ } else {
151
+ tail++;
152
+ }
153
+ } while(*tail);
154
+
155
+ // add the last ngram of size upto_n
156
+ len = tail - head;
157
+ assert(buffer_pre - len >= buffer);
158
+ memcpy(buffer_pre - len, head, len);
159
+ value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
160
+ (*value)++;
161
+
162
+ if (upto_n > 1) {
163
+ while(*head) {
164
+ if(*head == ' ' || *head == '.') {
165
+ len = tail - head - 1;
166
+ assert(buffer_pre - len >= buffer);
167
+ memcpy(buffer_pre - len, head + 1, len);
168
+ value = hattrie_get(trie, buffer_pre - len, len + suffix_len);
169
+ (*value)++;
170
+ }
171
+ head++;
172
+ }
173
+ }
174
+ }
@@ -0,0 +1,22 @@
1
+ #ifndef TEXT_H
2
+ #define TEXT_H
3
+
4
+ #include <stdbool.h>
5
+ #include "pstdint.h"
6
+ #include "hat-trie.h"
7
+
8
+ #ifdef __cplusplus
9
+ extern "C" {
10
+ #endif
11
+
12
+ #define NGRAM_BUFFER_SIZE 4096
13
+
14
+ void text_clean(char* text);
15
+ void add_ngrams(hattrie_t* trie, int upto_n, char* text);
16
+ void add_ngrams_with_suffix(hattrie_t* trie, int upto_n, char* text, char* suffix);
17
+
18
+ #ifdef __cplusplus
19
+ }
20
+ #endif
21
+
22
+ #endif
data/ext/triez.cc ADDED
@@ -0,0 +1,313 @@
1
+ #include <hat-trie.h>
2
+ #include <text.h>
3
+ #include <ruby.h>
4
+ #include <ruby/encoding.h>
5
+
6
+ // for rubinius
7
+ #ifndef rb_enc_fast_mbclen
8
+ # define rb_enc_fast_mbclen rb_enc_mbclen
9
+ #endif
10
+
11
+ static VALUE hat_class;
12
+ static rb_encoding* u8_enc;
13
+ static rb_encoding* bin_enc;
14
+
15
+ static inline VALUE unify_key(VALUE key) {
16
+ rb_encoding* enc = rb_enc_get(key);
17
+ if (enc != u8_enc && enc != bin_enc) {
18
+ return rb_funcall(key, rb_intern("encode"), 1, rb_enc_from_encoding(u8_enc));
19
+ } else {
20
+ return key;
21
+ }
22
+ }
23
+
24
+ static inline long long V2LL(VALUE v) {
25
+ union {VALUE v; long long l;} u;
26
+ u.v = v;
27
+ return u.l;
28
+ }
29
+
30
+ static inline VALUE LL2V(long long l) {
31
+ union {VALUE v; long long l;} u;
32
+ u.l = l;
33
+ return u.v;
34
+ }
35
+
36
+ struct HatTrie {
37
+ hattrie_t* p;
38
+ VALUE default_value;
39
+ bool obj_value;
40
+ bool initialized;
41
+
42
+ HatTrie() : default_value(Qnil), obj_value(false), initialized(false) {
43
+ p = hattrie_create();
44
+ }
45
+
46
+ ~HatTrie() {
47
+ hattrie_free(p);
48
+ }
49
+ };
50
+
51
+ static void hat_mark(void* p_ht) {
52
+ HatTrie* ht = (HatTrie*)p_ht;
53
+ if (!IMMEDIATE_P(ht->default_value)) {
54
+ rb_gc_mark(ht->default_value);
55
+ }
56
+ if (!ht->obj_value) {
57
+ return;
58
+ }
59
+ hattrie_t* p = ht->p;
60
+ hattrie_iter_t* it = hattrie_iter_begin(p, false);
61
+ while (!hattrie_iter_finished(it)) {
62
+ value_t* v = hattrie_iter_val(it);
63
+ if (!IMMEDIATE_P(*v)) {
64
+ rb_gc_mark(*v);
65
+ }
66
+ hattrie_iter_next(it);
67
+ }
68
+ hattrie_iter_free(it);
69
+ }
70
+
71
+ static void hat_free(void* p) {
72
+ delete (HatTrie*)p;
73
+ }
74
+
75
+ static VALUE hat_alloc(VALUE self) {
76
+ HatTrie* ht = new HatTrie();
77
+ return Data_Wrap_Struct(hat_class, hat_mark, hat_free, ht);
78
+ }
79
+
80
// Common prologue for methods that take `self` and `key`: declares `ht` (the
// unwrapped HatTrie) and `p` (its trie), checks that `key` is a String, and
// replaces `key` with its unified-encoding form.
#define PRE_HAT \
    HatTrie* ht; \
    Data_Get_Struct(self, HatTrie, ht); \
    hattrie_t* p = ht->p; \
    Check_Type(key, T_STRING); \
    key = unify_key(key);
87
+
88
+ static VALUE hat_set_type(VALUE self, VALUE obj_value, VALUE default_value) {
89
+ HatTrie* ht;
90
+ Data_Get_Struct(self, HatTrie, ht);
91
+ if (ht->initialized) {
92
+ rb_raise(rb_eRuntimeError, "Already initialized");
93
+ return self;
94
+ }
95
+ ht->default_value = default_value;
96
+ ht->obj_value = RTEST(obj_value);
97
+ ht->initialized = true;
98
+ return self;
99
+ }
100
+
101
+ static VALUE hat_value_type(VALUE self) {
102
+ HatTrie* ht;
103
+ Data_Get_Struct(self, HatTrie, ht);
104
+ return ht->obj_value ? ID2SYM(rb_intern("object")) : ID2SYM(rb_intern("int64"));
105
+ }
106
+
107
+ static VALUE hat_size(VALUE self) {
108
+ HatTrie* ht;
109
+ Data_Get_Struct(self, HatTrie, ht);
110
+ return ULL2NUM(hattrie_size(ht->p));
111
+ }
112
+
113
+ static VALUE hat_set(VALUE self, VALUE key, VALUE value) {
114
+ PRE_HAT;
115
+ long long v = ht->obj_value ? value : NUM2LL(value);
116
+ char* s = RSTRING_PTR(key);
117
+ size_t len = RSTRING_LEN(key);
118
+ hattrie_get(p, s, len)[0] = v;
119
+ return self;
120
+ }
121
+
122
+ static inline void hat_change(HatTrie* ht, hattrie_t* p, char* s, size_t len) {
123
+ // NOTE must use 2-step change, because the block may change the trie
124
+ value_t* vp = hattrie_tryget(p, s, len);
125
+ long long v;
126
+ if (ht->obj_value) {
127
+ VALUE value = vp ? LL2V(vp[0]) : ht->default_value;
128
+ v = V2LL(rb_yield(value));
129
+ } else {
130
+ VALUE value = vp ? LL2NUM(vp[0]) : ht->default_value;
131
+ v = NUM2LL(rb_yield(value));
132
+ }
133
+ hattrie_get(p, s, len)[0] = v;
134
+ }
135
+
136
+ static inline void hat_change_prefix(HatTrie* ht, hattrie_t* p, char* s, size_t len, char* rs) {
137
+ char* rs_end = rs + len;
138
+ long n;
139
+ for (; rs < rs_end; rs += n, len -= n) {
140
+ hat_change(ht, p, s, len);
141
+ // no need check encoding because reverse succeeded
142
+ n = rb_enc_fast_mbclen(rs, rs_end, u8_enc);
143
+ }
144
+ }
145
+
146
+ static VALUE hat_change_all(VALUE self, VALUE type, VALUE key) {
147
+ PRE_HAT;
148
+ char* s = RSTRING_PTR(key);
149
+ size_t len = RSTRING_LEN(key);
150
+ ID ty = SYM2ID(type);
151
+ if (ty == rb_intern("suffix")) {
152
+ char* s_end = s + len;
153
+ long n;
154
+ for (; s < s_end; s += n, len -= n) {
155
+ hat_change(ht, p, s, len);
156
+ n = rb_enc_mbclen(s, s_end, u8_enc);
157
+ }
158
+ } else if (ty == rb_intern("prefix")) {
159
+ volatile VALUE reversed = rb_funcall(key, rb_intern("reverse"), 0);
160
+ hat_change_prefix(ht, p, s, len, RSTRING_PTR(reversed));
161
+ } else if (ty == rb_intern("substring")) {
162
+ volatile VALUE reversed = rb_funcall(key, rb_intern("reverse"), 0);
163
+ char* rs = RSTRING_PTR(reversed);
164
+ char* s_end = s + len;
165
+ long n;
166
+ for (; s < s_end; s += n, len -= n) {
167
+ hat_change_prefix(ht, p, s, len, rs);
168
+ n = rb_enc_fast_mbclen(s, s_end, u8_enc);
169
+ }
170
+ }
171
+ return self;
172
+ }
173
+
174
+ static VALUE hat_append(VALUE self, VALUE key) {
175
+ HatTrie* ht;
176
+ Data_Get_Struct(self, HatTrie, ht);
177
+ return hat_set(self, key, ht->default_value);
178
+ }
179
+
180
+ static VALUE hat_get(VALUE self, VALUE key) {
181
+ PRE_HAT;
182
+ value_t* vt = hattrie_tryget(p, RSTRING_PTR(key), RSTRING_LEN(key));
183
+ if (vt) {
184
+ return ht->obj_value ? (*vt) : LL2NUM(*vt);
185
+ } else {
186
+ return ht->default_value;
187
+ }
188
+ }
189
+
190
+ static VALUE hat_del(VALUE self, VALUE key) {
191
+ PRE_HAT;
192
+ const char* s = RSTRING_PTR(key);
193
+ size_t len = RSTRING_LEN(key);
194
+ value_t* vt = hattrie_tryget(p, s, len);
195
+ if (vt) {
196
+ hattrie_del(p, RSTRING_PTR(key), RSTRING_LEN(key));
197
+ return ht->obj_value ? (*vt) : LL2NUM(*vt);
198
+ } else {
199
+ return ht->default_value;
200
+ }
201
+ }
202
+
203
+ static VALUE hat_check(VALUE self, VALUE key) {
204
+ PRE_HAT;
205
+ value_t* vt = hattrie_tryget(p, RSTRING_PTR(key), RSTRING_LEN(key));
206
+ return vt ? Qtrue : Qfalse;
207
+ }
208
+
209
+ struct SearchCbData {
210
+ VALUE callback;
211
+ VALUE suffix;
212
+ VALUE value;
213
+ };
214
+
215
+ static VALUE hat_search_callback(VALUE data) {
216
+ SearchCbData* p = (SearchCbData*)data;
217
+ return rb_funcall(p->callback, rb_intern("call"), 2, p->suffix, p->value);
218
+ }
219
+
220
+ static VALUE hat_search(VALUE self, VALUE key, VALUE vlimit, VALUE vsort, VALUE callback) {
221
+ PRE_HAT;
222
+ long limit = 0;
223
+ if (vlimit != Qnil) {
224
+ limit = NUM2LONG(vlimit);
225
+ }
226
+
227
+ hattrie_iter_t* it = hattrie_iter_with_prefix(p, RTEST(vsort), RSTRING_PTR(key), RSTRING_LEN(key));
228
+ int error = 0;
229
+ SearchCbData data = {callback};
230
+ while (!hattrie_iter_finished(it)) {
231
+ if (vlimit != Qnil && limit-- <= 0) {
232
+ break;
233
+ }
234
+ size_t suffix_len;
235
+ const char* suffix_s = hattrie_iter_key(it, &suffix_len);
236
+ value_t* v = hattrie_iter_val(it);
237
+ data.suffix = rb_enc_str_new(suffix_s, suffix_len, u8_enc);
238
+ data.value = ht->obj_value ? (*v) : LL2NUM(*v);
239
+ rb_protect(hat_search_callback, (VALUE)&data, &error);
240
+ if (error) {
241
+ break;
242
+ }
243
+ hattrie_iter_next(it);
244
+ }
245
+ hattrie_iter_free(it);
246
+ if (error) {
247
+ rb_funcall(rb_mKernel, rb_intern("raise"), 0);
248
+ }
249
+ return self;
250
+ }
251
+
252
+ typedef struct {
253
+ bool obj_value;
254
+ VALUE arr;
255
+ } HatWalkData;
256
+
257
+ static int hat_walk_cb(const char* key, size_t len, value_t* v, void* data_p) {
258
+ HatWalkData* data = (HatWalkData*)data_p;
259
+ volatile VALUE r = rb_ary_new();
260
+ rb_ary_push(r, rb_str_new(key, len));
261
+ rb_ary_push(r, data->obj_value ? (*v) : LL2NUM(*v));
262
+ rb_ary_push(data->arr, r);
263
+ return hattrie_walk_continue;
264
+ }
265
+
266
+ static VALUE hat_walk(VALUE self, VALUE key) {
267
+ PRE_HAT;
268
+ size_t len = (size_t)RSTRING_LEN(key);
269
+ volatile HatWalkData data = {ht->obj_value, rb_ary_new()};
270
+
271
+ // to prevent leak by break/next, we have to collect the array first
272
+ hattrie_walk(p, RSTRING_PTR(key), len, (void*)&data, hat_walk_cb);
273
+ return data.arr;
274
+ }
275
+
276
+ static VALUE hat_add_text(VALUE self, VALUE text, VALUE ngrams, VALUE suffix) {
277
+ // rb_str_dup
278
+ hattrie_t* p;
279
+ HatTrie* ht;
280
+ Data_Get_Struct(self, HatTrie, ht);
281
+ p = ht->p;
282
+
283
+ char* ctext = StringValueCStr(text);
284
+ text_clean(ctext);
285
+
286
+ add_ngrams_with_suffix(p, FIX2INT(ngrams), ctext, StringValueCStr(suffix));
287
+
288
+ return self;
289
+ // rb_str_substr
290
+ }
291
+
292
// Shorthand for rb_define_method() that applies the RUBY_METHOD_FUNC cast.
#define DEF(klass, name, func, argc) rb_define_method((klass), (name), RUBY_METHOD_FUNC(func), (argc))
293
+
294
+ extern "C"
295
+ void Init_triez() {
296
+ hat_class = rb_define_class("Triez", rb_cObject);
297
+ u8_enc = rb_utf8_encoding();
298
+ bin_enc = rb_ascii8bit_encoding();
299
+
300
+ rb_define_alloc_func(hat_class, hat_alloc);
301
+ DEF(hat_class, "_internal_set_type", hat_set_type, 2);
302
+ DEF(hat_class, "value_type", hat_value_type, 0);
303
+ DEF(hat_class, "size", hat_size, 0);
304
+ DEF(hat_class, "[]=", hat_set, 2);
305
+ DEF(hat_class, "change_all", hat_change_all, 2);
306
+ DEF(hat_class, "<<", hat_append, 1);
307
+ DEF(hat_class, "[]", hat_get, 1);
308
+ DEF(hat_class, "has_key?", hat_check, 1);
309
+ DEF(hat_class, "delete", hat_del, 1);
310
+ DEF(hat_class, "_internal_search", hat_search, 4);
311
+ DEF(hat_class, "_internal_walk", hat_walk, 1);
312
+ DEF(hat_class, "add_text!", hat_add_text, 3);
313
+ }
data/lib/wordtriez.rb ADDED
@@ -0,0 +1,65 @@
1
+ require_relative "../ext/triez"
2
+
3
# Ruby-side wrapper around the native trie: validates options and hides the
# `_internal_*` primitives defined by the extension.
class Triez
  VERSION = '1.0.4'

  private :_internal_set_type
  private :_internal_search
  private :_internal_walk

  # Options:
  #   :value_type - :int64 (default) or :object
  #   :default    - value returned for missing keys; defaults to 0 for :int64
  #                 and nil for :object. For :int64 it is converted with to_i.
  # Raises ArgumentError on an invalid value_type or an unknown option.
  def initialize opts={}
    opts = opts.dup

    value_type = opts.delete(:value_type)
    value_type = :int64 if value_type.nil?
    unless value_type == :int64 || value_type == :object
      raise ArgumentError, "value_type should be :int64 or :object, but got #{value_type.inspect}"
    end

    default = opts.delete(:default)
    if default.nil?
      default = (value_type == :int64 ? 0 : nil)
    elsif value_type == :int64
      default = default.to_i
    end

    unless opts.empty?
      raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:value_type, :default] are allowed"
    end

    _internal_set_type(value_type == :object, default)
  end

  # Yields every (key, value) pair, in sorted order. Requires a block.
  def each &p
    raise ArgumentError, 'Need a block' unless p

    _internal_search('', nil, true, p)
  end

  # Iterates the [key, value] pairs the native walk reports for +s+.
  def walk s, &p
    _internal_walk(s).each(&p)
  end

  # Searches entries whose key starts with +prefix+.
  #
  # Options:
  #   :limit - maximum number of entries (nil for no limit, must not be negative)
  #   :sort  - truthy to iterate in sorted order
  #
  # With a block, yields (key, value) for each entry; without one, returns
  # an array of [key, value] pairs.
  def search_with_prefix prefix, opts={}, &p
    opts = opts.dup

    limit = opts.delete(:limit)
    if !limit.nil? && limit < 0
      # 0 is accepted (and yields nothing), so the bound is ">= 0"
      raise ArgumentError, "Limit should be >= 0"
    end

    sort = opts.delete(:sort)
    unless opts.empty?
      raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:limit, :sort] are allowed"
    end

    return _internal_search(prefix, limit, sort, p) if p

    pairs = []
    _internal_search(prefix, limit, sort, ->(k, v) { pairs << [k, v] })
    pairs
  end
end