triez 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/triez.cc ADDED
@@ -0,0 +1,261 @@
1
+ #include <hat-trie.h>
2
+ #include <ruby.h>
3
+ #include <ruby/encoding.h>
4
+
5
+ static VALUE hat_class;
6
+ static rb_encoding* u8_enc;
7
+ static rb_encoding* bin_enc;
8
+
9
+ static inline VALUE unify_key(VALUE key) {
10
+ rb_encoding* enc = rb_enc_get(key);
11
+ if (enc != u8_enc && enc != bin_enc) {
12
+ return rb_funcall(key, rb_intern("encode"), 1, rb_enc_from_encoding(u8_enc));
13
+ } else {
14
+ return key;
15
+ }
16
+ }
17
+
18
+ static inline long long V2LL(VALUE v) {
19
+ union {VALUE v; long long l;} u;
20
+ u.v = v;
21
+ return u.l;
22
+ }
23
+
24
+ static inline VALUE LL2V(long long l) {
25
+ union {VALUE v; long long l;} u;
26
+ u.l = l;
27
+ return u.v;
28
+ }
29
+
30
+ struct HatTrie {
31
+ hattrie_t* p;
32
+ bool obj_value;
33
+ bool suffix;
34
+ bool initialized;
35
+
36
+ HatTrie() : obj_value(false), suffix(false), initialized(false) {
37
+ p = hattrie_create();
38
+ }
39
+
40
+ ~HatTrie() {
41
+ hattrie_free(p);
42
+ }
43
+ };
44
+
45
+ static void hat_mark(void* p_ht) {
46
+ HatTrie* ht = (HatTrie*)p_ht;
47
+ if (!ht->obj_value) {
48
+ return;
49
+ }
50
+ hattrie_t* p = ht->p;
51
+ hattrie_iter_t* it = hattrie_iter_begin(p, false);
52
+ while (!hattrie_iter_finished(it)) {
53
+ value_t* v = hattrie_iter_val(it);
54
+ if (!IMMEDIATE_P(*v)) {
55
+ rb_gc_mark(*v);
56
+ }
57
+ }
58
+ hattrie_iter_free(it);
59
+ }
60
+
61
+ static void hat_free(void* p) {
62
+ delete (HatTrie*)p;
63
+ }
64
+
65
+ static VALUE hat_alloc(VALUE self) {
66
+ HatTrie* ht = new HatTrie();
67
+ return Data_Wrap_Struct(hat_class, hat_mark, hat_free, ht);
68
+ }
69
+
70
// Shared prologue for methods that take a key. Expects `self` and `key` to be
// in scope and introduces two locals:
//   ht - the HatTrie wrapped by self
//   p  - ht's underlying hat-trie handle
// It then raises TypeError unless key is a String and replaces key with its
// encoding-normalized form (see unify_key).
#define PRE_HAT\
  HatTrie* ht;\
  Data_Get_Struct(self, HatTrie, ht);\
  hattrie_t* p = ht->p;\
  Check_Type(key, T_STRING);\
  key = unify_key(key);
77
+
78
+ static VALUE hat_set_type(VALUE self, VALUE obj_value, VALUE suffix) {
79
+ HatTrie* ht;
80
+ Data_Get_Struct(self, HatTrie, ht);
81
+ if (ht->initialized) {
82
+ rb_raise(rb_eRuntimeError, "Already initialized");
83
+ return self;
84
+ }
85
+ ht->obj_value = RTEST(obj_value);
86
+ ht->suffix = RTEST(suffix);
87
+ ht->initialized = true;
88
+ return self;
89
+ }
90
+
91
+ static VALUE hat_obj_value_p(VALUE self) {
92
+ HatTrie* ht;
93
+ Data_Get_Struct(self, HatTrie, ht);
94
+ return ht->obj_value ? Qtrue : Qfalse;
95
+ }
96
+
97
+ static VALUE hat_suffix_p(VALUE self) {
98
+ HatTrie* ht;
99
+ Data_Get_Struct(self, HatTrie, ht);
100
+ return ht->suffix ? Qtrue : Qfalse;
101
+ }
102
+
103
+ static VALUE hat_size(VALUE self) {
104
+ HatTrie* ht;
105
+ Data_Get_Struct(self, HatTrie, ht);
106
+ return ULL2NUM(hattrie_size(ht->p));
107
+ }
108
+
109
+ static VALUE hat_set(VALUE self, VALUE key, VALUE value) {
110
+ PRE_HAT;
111
+ long long v = ht->obj_value ? value : NUM2LL(value);
112
+ char* s = RSTRING_PTR(key);
113
+ size_t len = RSTRING_LEN(key);
114
+ if (ht->suffix) {
115
+ char* s_end = s + len;
116
+ long n;
117
+ for (; s < s_end; s += n, len -= n) {
118
+ n = rb_enc_mbclen(s, s_end, u8_enc);
119
+ hattrie_get(p, s, len)[0] = v;
120
+ }
121
+ } else {
122
+ hattrie_get(p, s, len)[0] = v;
123
+ }
124
+ return self;
125
+ }
126
+
127
+ static VALUE hat_alt(VALUE self, VALUE key) {
128
+ PRE_HAT;
129
+ char* s = RSTRING_PTR(key);
130
+ size_t len = RSTRING_LEN(key);
131
+ if (ht->suffix) {
132
+ char* s_end = s + len;
133
+ long n;
134
+ for (; s < s_end; s += n, len -= n) {
135
+ n = rb_enc_mbclen(s, s_end, u8_enc);
136
+ value_t* vp = hattrie_tryget(p, s, len);
137
+ long long v;
138
+ if (ht->obj_value) {
139
+ VALUE value = vp ? LL2V(vp[0]) : Qnil;
140
+ v = V2LL(rb_yield(value));
141
+ } else {
142
+ VALUE value = vp ? LL2NUM(vp[0]) : LL2NUM(0);
143
+ v = NUM2LL(rb_yield(value));
144
+ }
145
+ hattrie_get(p, s, len)[0] = v;
146
+ }
147
+ } else {
148
+ value_t* vp = hattrie_tryget(p, s, len);
149
+ long long v;
150
+ if (ht->obj_value) {
151
+ VALUE value = vp ? LL2V(vp[0]) : Qnil;
152
+ v = V2LL(rb_yield(value));
153
+ } else {
154
+ VALUE value = vp ? LL2NUM(vp[0]) : LL2NUM(0);
155
+ v = NUM2LL(rb_yield(value));
156
+ }
157
+ hattrie_get(p, s, len)[0] = v;
158
+ }
159
+ return self;
160
+ }
161
+
162
+ static VALUE hat_append(VALUE self, VALUE key) {
163
+ HatTrie* ht;
164
+ Data_Get_Struct(self, HatTrie, ht);
165
+ return hat_set(self, key, ht->obj_value ? Qnil : LL2NUM(0));
166
+ }
167
+
168
+ static VALUE hat_get(VALUE self, VALUE key) {
169
+ PRE_HAT;
170
+ value_t* vt = hattrie_tryget(p, RSTRING_PTR(key), RSTRING_LEN(key));
171
+ if (vt) {
172
+ return ht->obj_value ? (*vt) : LL2NUM(*vt);
173
+ } else {
174
+ return Qnil;
175
+ }
176
+ }
177
+
178
+ static VALUE hat_del(VALUE self, VALUE key) {
179
+ PRE_HAT;
180
+ const char* s = RSTRING_PTR(key);
181
+ size_t len = RSTRING_LEN(key);
182
+ value_t* vt = hattrie_tryget(p, s, len);
183
+ if (vt) {
184
+ hattrie_del(p, RSTRING_PTR(key), RSTRING_LEN(key));
185
+ return ht->obj_value ? (*vt) : LL2NUM(*vt);
186
+ } else {
187
+ return Qnil;
188
+ }
189
+ }
190
+
191
+ static VALUE hat_check(VALUE self, VALUE key) {
192
+ PRE_HAT;
193
+ value_t* vt = hattrie_tryget(p, RSTRING_PTR(key), RSTRING_LEN(key));
194
+ return vt ? Qtrue : Qfalse;
195
+ }
196
+
197
+ struct SearchCbData {
198
+ VALUE callback;
199
+ VALUE suffix;
200
+ VALUE value;
201
+ };
202
+
203
+ static VALUE hat_search_callback(VALUE data) {
204
+ SearchCbData* p = (SearchCbData*)data;
205
+ return rb_funcall(p->callback, rb_intern("call"), 2, p->suffix, p->value);
206
+ }
207
+
208
+ // returns: true if an error occured
209
+ static VALUE hat_search(VALUE self, VALUE key, VALUE vlimit, VALUE vsort, VALUE callback) {
210
+ PRE_HAT;
211
+ long limit = 0;
212
+ if (vlimit != Qnil) {
213
+ limit = NUM2LONG(vlimit);
214
+ }
215
+
216
+ hattrie_iter_t* it = hattrie_iter_with_prefix(p, RTEST(vsort), RSTRING_PTR(key), RSTRING_LEN(key));
217
+ int error = 0;
218
+ SearchCbData data = {callback};
219
+ while (!hattrie_iter_finished(it)) {
220
+ if (vlimit != Qnil && limit-- <= 0) {
221
+ break;
222
+ }
223
+ size_t suffix_len;
224
+ const char* suffix_s = hattrie_iter_key(it, &suffix_len);
225
+ value_t* v = hattrie_iter_val(it);
226
+ data.suffix = rb_enc_str_new(suffix_s, suffix_len, u8_enc);
227
+ data.value = ht->obj_value ? (*v) : LL2NUM(*v);
228
+ rb_protect(hat_search_callback, (VALUE)&data, &error);
229
+ if (error) {
230
+ break;
231
+ }
232
+ hattrie_iter_next(it);
233
+ }
234
+ hattrie_iter_free(it);
235
+ if (error) {
236
+ rb_funcall(rb_mKernel, rb_intern("raise"), 0);
237
+ }
238
+ return self;
239
+ }
240
+
241
+ #define DEF(k,n,f,c) rb_define_method(k,n,RUBY_METHOD_FUNC(f),c)
242
+
243
+ extern "C"
244
+ void Init_triez() {
245
+ hat_class = rb_define_class("Triez", rb_cObject);
246
+ u8_enc = rb_utf8_encoding();
247
+ bin_enc = rb_ascii8bit_encoding();
248
+
249
+ rb_define_alloc_func(hat_class, hat_alloc);
250
+ DEF(hat_class, "_internal_set_type", hat_set_type, 2);
251
+ DEF(hat_class, "obj_value?", hat_obj_value_p, 0);
252
+ DEF(hat_class, "suffix?", hat_suffix_p, 0);
253
+ DEF(hat_class, "size", hat_size, 0);
254
+ DEF(hat_class, "has_key?", hat_check, 1);
255
+ DEF(hat_class, "[]=", hat_set, 2);
256
+ DEF(hat_class, "alt", hat_alt, 1);
257
+ DEF(hat_class, "<<", hat_append, 1);
258
+ DEF(hat_class, "[]", hat_get, 1);
259
+ DEF(hat_class, "delete", hat_del, 1);
260
+ DEF(hat_class, "_internal_search", hat_search, 4);
261
+ }
data/lib/triez.rb ADDED
@@ -0,0 +1,45 @@
1
+ require_relative "../ext/triez"
2
+
3
# Ruby-side wrapper around the native Triez extension: option parsing for the
# constructor plus the block-friendly iteration helpers.
class Triez
  VERSION = '0.2'

  private :_internal_set_type
  private :_internal_search

  # Creates a trie.
  #
  # opts may contain :obj_value and :suffix (both default to false). Any other
  # key raises ArgumentError.
  def initialize opts={}
    opts = opts.dup
    obj_value = opts.delete :obj_value
    obj_value = false if obj_value.nil?
    suffix = opts.delete :suffix
    suffix = false if suffix.nil?
    unless opts.empty?
      raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:suffix, :obj_value] are allowed"
    end
    _internal_set_type obj_value, suffix
  end

  # Calls the block with (key, value) for every entry by searching with an
  # empty prefix and no limit. Raises ArgumentError without a block.
  def each &p
    raise ArgumentError, 'Need a block' unless p
    _internal_search '', nil, true, p
  end

  # Searches entries under prefix.
  #
  # opts may contain :limit (a non-negative number, or nil for no limit) and
  # :sort. Any other key raises ArgumentError. With a block, each
  # (suffix, value) pair is yielded; without one, an array of [suffix, value]
  # pairs is returned.
  def search_with_prefix prefix, opts={}, &p
    opts = opts.dup
    limit = opts.delete :limit
    if !limit.nil? and limit < 0
      # 0 is accepted (it yields nothing), so the bound is inclusive.
      raise ArgumentError, "Limit should be >= 0"
    end
    sort = opts.delete :sort
    unless opts.empty?
      raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:limit, :sort] are allowed"
    end

    if p
      _internal_search prefix, limit, sort, p
    else
      a = []
      _internal_search prefix, limit, sort, -> k, v {a << [k, v]}
      a
    end
  end
end
data/readme.md ADDED
@@ -0,0 +1,174 @@
1
+ ## What
2
+
3
+ Pragmatic [trie](http://en.wikipedia.org/wiki/Trie) for Ruby.
4
+
5
+ It is fast, memory efficient, unicode aware.
6
+
7
+ The backend of *triez* is a cache oblivious data structure: the [HAT trie](https://github.com/dcjones/hat-trie). It is generally faster and more memory efficient than double arrays or burst tries.
8
+
9
+ ## Requirement
10
+
11
+ - Ruby 1.9
12
+ - `g++` or `clang`
13
+
14
+ ## Install
15
+
16
+ ``` bash
17
+ gem ins triez
18
+ ```
19
+
20
+ ## Synopsis
21
+
22
+ ``` ruby
23
+ require 'triez'
24
+
25
+ t = Triez.new
26
+
27
+ # insertion
28
+ t['key'] = 100
29
+
30
+ # insert a key with default value (0 for normal triez, nil for obj_valued triez)
31
+ t << 'key'
32
+
33
+ # search
34
+ t.has_key? 'key'
35
+ t['key']
36
+
37
+ # iterate over values under a prefix.
38
+ t.search_with_prefix(prefix, limit: 10, sort: true) do |suffix, value|
39
+ ...
40
+ end
41
+
42
+ # enumerate (NOTE it is unordered)
43
+ t.each do |key, value|
44
+ ...
45
+ end
46
+ ```
47
+
48
+ ---
49
+
50
+ By default, a *triez* stores signed 64-bit integers, which you can use as weights, counts or database IDs; they cost no time in the GC marking phase. If you need to store arbitrary objects in a node, use `obj_value: true`:
51
+
52
+ ``` ruby
53
+ t = Triez.new obj_value: true
54
+ t['Tom'] = {name: 'Tom', sex: 'Female'}
55
+ t['Tree'] = [:leaf, :trunk, :root]
56
+ ```
57
+
58
+ ---
59
+
60
+ When a *triez* is initialized with `suffix: true`, it inserts all suffixes of a key:
61
+
62
+ ``` ruby
63
+ t = Triez.new suffix: true
64
+ t['万塘路一锅鸡'] = 2
65
+ t['万塘路一锅鸡'] #=> 2
66
+ t['塘路一锅鸡'] #=> 2
67
+ t['路一锅鸡'] #=> 2
68
+ t['一锅鸡'] #=> 2
69
+ t['锅鸡'] #=> 2
70
+ t['鸡'] #=> 2
71
+ ```
72
+
73
+ You can batch change values with a block
74
+
75
+ ``` ruby
76
+ # v *= 5 for 'abcd', 'bcd', 'cd', 'd'
77
+ t.alt 'abcd' do |v|
78
+ v * 5
79
+ end
80
+ t['abcd'] #=> 10
81
+ t['cd'] #=> 10
82
+ ```
83
+
84
+ ---
85
+
86
+ Misc methods
87
+
88
+ ``` ruby
89
+ # if it is a suffix trie
90
+ t.suffix?
91
+ # if the value type is object
92
+ t.obj_value?
93
+ ```
94
+
95
+ ## Examples
96
+
97
+ Prefix-based autocompletion:
98
+
99
+ ``` ruby
100
+ require 'triez'
101
+ words = %w[readme rot red rah rasterization]
102
+ t = Triez.new
103
+ words.each do |word|
104
+ t[word] = 1
105
+ end
106
+ t.search_with_prefix 're' do |word|
107
+ puts "candidate: #{word}"
108
+ end
109
+ ```
110
+
111
+ The output:
112
+
113
+ ```bash
114
+ candidate: readme
115
+ candidate: red
116
+ ```
117
+
118
+ ---
119
+
120
+ Efficiently search for strings containing a substring:
121
+
122
+ ``` ruby
123
+ require 'triez'
124
+ sequences = {
125
+ 'ACTGAAAAAAACTG' => 1,
126
+ 'ATACGGTCCA' => 2,
127
+ 'GCTTGTACGT' => 3
128
+ }
129
+ t = Triez.new suffix: true
130
+ sequences.each do |seq, id|
131
+ t[seq] = id
132
+ end
133
+ t.search_with_prefix 'CGGT' do |_, id|
134
+ puts id #=> 2
135
+ end
136
+ ```
137
+
138
+ The search time is linear in the length of the substring.
139
+
140
+ ## Benchmarks
141
+
142
+ Here's a benchmark on
143
+
144
+ ```ruby
145
+ ruby 1.9.3p374 (2013-01-15 revision 38858) [x86_64-darwin12.2.1]
146
+ 2.3 GHz Intel Core i7
147
+ ```
148
+
149
+ The test data is 3 million titles of Wikipedia articles (from http://dumps.wikimedia.org/enwiki/20121101/)
150
+
151
+ ```
152
+ thing/backend | memory | insertion time | 3 M query
153
+ -------------------|---------|----------------|----------
154
+ hash/linked hash | 340.2 M | 4.369 s | 0.2800 s
155
+ trie/double array* | 155.6 M | 130.7 s | 0.4359 s
156
+ triez/HAT trie | 121.7 M | 3.872 s | 0.3472 s
157
+ ```
158
+
159
+ NOTE: `trie/double array` -> https://github.com/tyler/trie
160
+
161
+ ## Caveats
162
+
163
+ - `sort` orders keys with binary collation, not unicode codepoint collation in string comparison.
164
+ - In the rare case where many threads modify the same trie, you may need a mutex.
165
+ - If memory usage is still too high, consider [MARISA-trie](https://code.google.com/p/marisa-trie/) (NOTE that MARISA is immutable) or a database.
166
+
167
+ ## Development
168
+
169
+ ``` bash
170
+ git clone git://github.com/luikore/triez.git
171
+ cd triez
172
+ rake glob_src
173
+ rake
174
+ ```