triez 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/copying +18 -0
- data/ext/common.h +8 -0
- data/ext/extconf.rb +31 -0
- data/ext/hat-stub.c +14 -0
- data/ext/hat-trie/COPYING +19 -0
- data/ext/hat-trie/ahtable.c +551 -0
- data/ext/hat-trie/ahtable.h +93 -0
- data/ext/hat-trie/hat-trie.c +709 -0
- data/ext/hat-trie/hat-trie.h +75 -0
- data/ext/hat-trie/misc.h +22 -0
- data/ext/hat-trie/murmurhash3.c +77 -0
- data/ext/hat-trie/murmurhash3.h +12 -0
- data/ext/hat-trie/pstdint.h +800 -0
- data/ext/triez.cc +261 -0
- data/lib/triez.rb +45 -0
- data/readme.md +174 -0
- data/test/triez_test.rb +116 -0
- metadata +63 -0
data/ext/triez.cc
ADDED
@@ -0,0 +1,261 @@
|
|
1
|
+
#include <hat-trie.h>
|
2
|
+
#include <ruby.h>
|
3
|
+
#include <ruby/encoding.h>
|
4
|
+
|
5
|
+
static VALUE hat_class;
|
6
|
+
static rb_encoding* u8_enc;
|
7
|
+
static rb_encoding* bin_enc;
|
8
|
+
|
9
|
+
// Normalizes the encoding of a key before it is handed to the trie.
// Strings already tagged with the UTF-8 or binary encoding are returned
// untouched; anything else is converted by calling String#encode with the
// UTF-8 encoding object.
static inline VALUE unify_key(VALUE key) {
    rb_encoding* key_enc = rb_enc_get(key);
    if (key_enc == u8_enc || key_enc == bin_enc) {
        return key;
    }
    return rb_funcall(key, rb_intern("encode"), 1, rb_enc_from_encoding(u8_enc));
}
|
17
|
+
|
18
|
+
// Converts a Ruby VALUE into the long long representation stored in a trie slot.
// A plain integral conversion is used rather than punning through a union:
// reading the inactive union member is undefined in C++, and it leaves the
// upper bytes indeterminate whenever VALUE is narrower than long long.
static inline long long V2LL(VALUE v) {
    return (long long)v;
}
|
23
|
+
|
24
|
+
// Converts the long long representation stored in a trie slot back into a
// Ruby VALUE. A plain integral conversion is used rather than punning through
// a union, because reading the inactive union member is undefined in C++.
static inline VALUE LL2V(long long l) {
    return (VALUE)l;
}
|
29
|
+
|
30
|
+
struct HatTrie {
|
31
|
+
hattrie_t* p;
|
32
|
+
bool obj_value;
|
33
|
+
bool suffix;
|
34
|
+
bool initialized;
|
35
|
+
|
36
|
+
HatTrie() : obj_value(false), suffix(false), initialized(false) {
|
37
|
+
p = hattrie_create();
|
38
|
+
}
|
39
|
+
|
40
|
+
~HatTrie() {
|
41
|
+
hattrie_free(p);
|
42
|
+
}
|
43
|
+
};
|
44
|
+
|
45
|
+
static void hat_mark(void* p_ht) {
|
46
|
+
HatTrie* ht = (HatTrie*)p_ht;
|
47
|
+
if (!ht->obj_value) {
|
48
|
+
return;
|
49
|
+
}
|
50
|
+
hattrie_t* p = ht->p;
|
51
|
+
hattrie_iter_t* it = hattrie_iter_begin(p, false);
|
52
|
+
while (!hattrie_iter_finished(it)) {
|
53
|
+
value_t* v = hattrie_iter_val(it);
|
54
|
+
if (!IMMEDIATE_P(*v)) {
|
55
|
+
rb_gc_mark(*v);
|
56
|
+
}
|
57
|
+
}
|
58
|
+
hattrie_iter_free(it);
|
59
|
+
}
|
60
|
+
|
61
|
+
static void hat_free(void* p) {
|
62
|
+
delete (HatTrie*)p;
|
63
|
+
}
|
64
|
+
|
65
|
+
// Allocator registered with rb_define_alloc_func.
// The argument is the class being instantiated. Wrapping with that class
// (instead of the global hat_class) lets subclasses of Triez allocate
// instances of their own class.
static VALUE hat_alloc(VALUE klass) {
    HatTrie* ht = new HatTrie();
    return Data_Wrap_Struct(klass, hat_mark, hat_free, ht);
}
|
69
|
+
|
70
|
+
// Shared prologue for methods that take a string key.
// Expects `self` and `key` to be in scope at the expansion site. Declares
// `ht` (the HatTrie wrapped by self) and `p` (its hattrie_t), type-checks
// `key` as T_STRING with Check_Type, then rebinds `key` to unify_key(key).
#define PRE_HAT\
    HatTrie* ht;\
    Data_Get_Struct(self, HatTrie, ht);\
    hattrie_t* p = ht->p;\
    Check_Type(key, T_STRING);\
    key = unify_key(key);
|
77
|
+
|
78
|
+
// Triez#_internal_set_type(obj_value, suffix)
// Records the value mode and suffix mode of the trie, using the truthiness of
// each argument. May only be called once per object: a second call raises
// RuntimeError("Already initialized"). Returns self.
static VALUE hat_set_type(VALUE self, VALUE obj_value, VALUE suffix) {
    HatTrie* ht;
    Data_Get_Struct(self, HatTrie, ht);
    if (ht->initialized) {
        // rb_raise does not return, so no statement is needed after it.
        rb_raise(rb_eRuntimeError, "Already initialized");
    }
    ht->obj_value = RTEST(obj_value);
    ht->suffix = RTEST(suffix);
    ht->initialized = true;
    return self;
}
|
90
|
+
|
91
|
+
// Triez#obj_value? -- true when the trie was configured to store objects.
static VALUE hat_obj_value_p(VALUE self) {
    HatTrie* ht;
    Data_Get_Struct(self, HatTrie, ht);
    if (ht->obj_value) {
        return Qtrue;
    }
    return Qfalse;
}
|
96
|
+
|
97
|
+
// Triez#suffix? -- true when the trie was configured as a suffix trie.
static VALUE hat_suffix_p(VALUE self) {
    HatTrie* ht;
    Data_Get_Struct(self, HatTrie, ht);
    if (ht->suffix) {
        return Qtrue;
    }
    return Qfalse;
}
|
102
|
+
|
103
|
+
// Triez#size -- the result of hattrie_size for the wrapped trie, converted to
// a Ruby integer with ULL2NUM.
static VALUE hat_size(VALUE self) {
    HatTrie* ht;
    Data_Get_Struct(self, HatTrie, ht);
    hattrie_t* trie = ht->p;
    return ULL2NUM(hattrie_size(trie));
}
|
108
|
+
|
109
|
+
// Triez#[]=(key, value)
// Stores `value` under `key`. In object mode the VALUE itself is stored; in
// integer mode the value is converted with NUM2LL first. For a suffix trie
// the same value is also stored under every suffix of the key, stepping one
// character at a time as measured by rb_enc_mbclen with the UTF-8 encoding.
// Returns self.
static VALUE hat_set(VALUE self, VALUE key, VALUE value) {
    PRE_HAT;
    long long v;
    if (ht->obj_value) {
        v = value;
    } else {
        v = NUM2LL(value);
    }

    char* cursor = RSTRING_PTR(key);
    size_t remaining = RSTRING_LEN(key);
    if (!ht->suffix) {
        hattrie_get(p, cursor, remaining)[0] = v;
        return self;
    }

    char* key_end = cursor + remaining;
    while (cursor < key_end) {
        // Measure the leading character first, then store the current suffix.
        long step = rb_enc_mbclen(cursor, key_end, u8_enc);
        hattrie_get(p, cursor, remaining)[0] = v;
        cursor += step;
        remaining -= step;
    }
    return self;
}
|
126
|
+
|
127
|
+
static VALUE hat_alt(VALUE self, VALUE key) {
|
128
|
+
PRE_HAT;
|
129
|
+
char* s = RSTRING_PTR(key);
|
130
|
+
size_t len = RSTRING_LEN(key);
|
131
|
+
if (ht->suffix) {
|
132
|
+
char* s_end = s + len;
|
133
|
+
long n;
|
134
|
+
for (; s < s_end; s += n, len -= n) {
|
135
|
+
n = rb_enc_mbclen(s, s_end, u8_enc);
|
136
|
+
value_t* vp = hattrie_tryget(p, s, len);
|
137
|
+
long long v;
|
138
|
+
if (ht->obj_value) {
|
139
|
+
VALUE value = vp ? LL2V(vp[0]) : Qnil;
|
140
|
+
v = V2LL(rb_yield(value));
|
141
|
+
} else {
|
142
|
+
VALUE value = vp ? LL2NUM(vp[0]) : LL2NUM(0);
|
143
|
+
v = NUM2LL(rb_yield(value));
|
144
|
+
}
|
145
|
+
hattrie_get(p, s, len)[0] = v;
|
146
|
+
}
|
147
|
+
} else {
|
148
|
+
value_t* vp = hattrie_tryget(p, s, len);
|
149
|
+
long long v;
|
150
|
+
if (ht->obj_value) {
|
151
|
+
VALUE value = vp ? LL2V(vp[0]) : Qnil;
|
152
|
+
v = V2LL(rb_yield(value));
|
153
|
+
} else {
|
154
|
+
VALUE value = vp ? LL2NUM(vp[0]) : LL2NUM(0);
|
155
|
+
v = NUM2LL(rb_yield(value));
|
156
|
+
}
|
157
|
+
hattrie_get(p, s, len)[0] = v;
|
158
|
+
}
|
159
|
+
return self;
|
160
|
+
}
|
161
|
+
|
162
|
+
// Triez#<<(key)
// Inserts `key` with the default value for the trie's mode (nil for an
// object-valued trie, 0 otherwise) by delegating to hat_set.
static VALUE hat_append(VALUE self, VALUE key) {
    HatTrie* ht;
    Data_Get_Struct(self, HatTrie, ht);
    VALUE default_value = ht->obj_value ? Qnil : LL2NUM(0);
    return hat_set(self, key, default_value);
}
|
167
|
+
|
168
|
+
// Triez#[](key)
// Returns the value stored under `key`, or nil when the key is absent. In
// integer mode the stored number is converted with LL2NUM; in object mode the
// stored VALUE is returned as is.
static VALUE hat_get(VALUE self, VALUE key) {
    PRE_HAT;
    value_t* slot = hattrie_tryget(p, RSTRING_PTR(key), RSTRING_LEN(key));
    if (!slot) {
        return Qnil;
    }
    return ht->obj_value ? (*slot) : LL2NUM(*slot);
}
|
177
|
+
|
178
|
+
// Triez#delete(key)
// Removes `key` from the trie and returns the value it held, or nil when the
// key was not present. In integer mode the value is converted with LL2NUM.
static VALUE hat_del(VALUE self, VALUE key) {
    PRE_HAT;
    const char* s = RSTRING_PTR(key);
    size_t len = RSTRING_LEN(key);
    value_t* vt = hattrie_tryget(p, s, len);
    if (!vt) {
        return Qnil;
    }
    // Copy the value out before deleting: the slot `vt` points into is no
    // longer valid for this key once hattrie_del has removed the entry.
    value_t stored = *vt;
    hattrie_del(p, s, len);
    return ht->obj_value ? (VALUE)stored : LL2NUM(stored);
}
|
190
|
+
|
191
|
+
// Triez#has_key?(key) -- true when hattrie_tryget finds an entry for `key`.
static VALUE hat_check(VALUE self, VALUE key) {
    PRE_HAT;
    if (hattrie_tryget(p, RSTRING_PTR(key), RSTRING_LEN(key))) {
        return Qtrue;
    }
    return Qfalse;
}
|
196
|
+
|
197
|
+
struct SearchCbData {
|
198
|
+
VALUE callback;
|
199
|
+
VALUE suffix;
|
200
|
+
VALUE value;
|
201
|
+
};
|
202
|
+
|
203
|
+
// rb_protect trampoline: `data` is a SearchCbData* cast to VALUE. Invokes
// callback.call(suffix, value) and returns its result.
static VALUE hat_search_callback(VALUE data) {
    SearchCbData* cb = (SearchCbData*)data;
    VALUE argv[2] = {cb->suffix, cb->value};
    return rb_funcall(cb->callback, rb_intern("call"), 2, argv[0], argv[1]);
}
|
207
|
+
|
208
|
+
// returns: true if an error occured
|
209
|
+
static VALUE hat_search(VALUE self, VALUE key, VALUE vlimit, VALUE vsort, VALUE callback) {
|
210
|
+
PRE_HAT;
|
211
|
+
long limit = 0;
|
212
|
+
if (vlimit != Qnil) {
|
213
|
+
limit = NUM2LONG(vlimit);
|
214
|
+
}
|
215
|
+
|
216
|
+
hattrie_iter_t* it = hattrie_iter_with_prefix(p, RTEST(vsort), RSTRING_PTR(key), RSTRING_LEN(key));
|
217
|
+
int error = 0;
|
218
|
+
SearchCbData data = {callback};
|
219
|
+
while (!hattrie_iter_finished(it)) {
|
220
|
+
if (vlimit != Qnil && limit-- <= 0) {
|
221
|
+
break;
|
222
|
+
}
|
223
|
+
size_t suffix_len;
|
224
|
+
const char* suffix_s = hattrie_iter_key(it, &suffix_len);
|
225
|
+
value_t* v = hattrie_iter_val(it);
|
226
|
+
data.suffix = rb_enc_str_new(suffix_s, suffix_len, u8_enc);
|
227
|
+
data.value = ht->obj_value ? (*v) : LL2NUM(*v);
|
228
|
+
rb_protect(hat_search_callback, (VALUE)&data, &error);
|
229
|
+
if (error) {
|
230
|
+
break;
|
231
|
+
}
|
232
|
+
hattrie_iter_next(it);
|
233
|
+
}
|
234
|
+
hattrie_iter_free(it);
|
235
|
+
if (error) {
|
236
|
+
rb_funcall(rb_mKernel, rb_intern("raise"), 0);
|
237
|
+
}
|
238
|
+
return self;
|
239
|
+
}
|
240
|
+
|
241
|
+
// Shorthand for rb_define_method: k = class, n = method name, f = C function
// (wrapped in RUBY_METHOD_FUNC), c = argument count.
#define DEF(k,n,f,c) rb_define_method((k), (n), RUBY_METHOD_FUNC(f), (c))
|
242
|
+
|
243
|
+
extern "C"
|
244
|
+
void Init_triez() {
|
245
|
+
hat_class = rb_define_class("Triez", rb_cObject);
|
246
|
+
u8_enc = rb_utf8_encoding();
|
247
|
+
bin_enc = rb_ascii8bit_encoding();
|
248
|
+
|
249
|
+
rb_define_alloc_func(hat_class, hat_alloc);
|
250
|
+
DEF(hat_class, "_internal_set_type", hat_set_type, 2);
|
251
|
+
DEF(hat_class, "obj_value?", hat_obj_value_p, 0);
|
252
|
+
DEF(hat_class, "suffix?", hat_suffix_p, 0);
|
253
|
+
DEF(hat_class, "size", hat_size, 0);
|
254
|
+
DEF(hat_class, "has_key?", hat_check, 1);
|
255
|
+
DEF(hat_class, "[]=", hat_set, 2);
|
256
|
+
DEF(hat_class, "alt", hat_alt, 1);
|
257
|
+
DEF(hat_class, "<<", hat_append, 1);
|
258
|
+
DEF(hat_class, "[]", hat_get, 1);
|
259
|
+
DEF(hat_class, "delete", hat_del, 1);
|
260
|
+
DEF(hat_class, "_internal_search", hat_search, 4);
|
261
|
+
}
|
data/lib/triez.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require_relative "../ext/triez"
|
2
|
+
|
3
|
+
# Ruby-side wrapper around the native Triez extension: option parsing for the
# constructor and the public iteration helpers.
class Triez
  VERSION = '0.2'

  # Native entry points that are only meant to be reached through the
  # wrappers defined below.
  private :_internal_set_type
  private :_internal_search

  # Creates a trie.
  #
  # opts - Hash of options:
  #        :obj_value - truthy to store arbitrary objects (default false)
  #        :suffix    - truthy to also insert every suffix of a key (default false)
  #
  # Raises ArgumentError when any other option key is given.
  def initialize opts={}
    opts = opts.dup
    obj_value = opts.delete :obj_value
    obj_value = false if obj_value.nil?
    suffix = opts.delete :suffix
    suffix = false if suffix.nil?
    unless opts.empty?
      raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:suffix, :obj_value] are allowed"
    end
    _internal_set_type obj_value, suffix
  end

  # Calls the block with (key, value) for the entries of the trie, by
  # searching with the empty prefix, no limit and sort enabled.
  #
  # Raises ArgumentError when no block is given.
  def each &p
    raise ArgumentError, 'Need a block' unless p
    _internal_search '', nil, true, p
  end

  # Visits the entries found under +prefix+.
  #
  # opts - Hash of options:
  #        :limit - maximum number of entries to visit; nil means no limit
  #        :sort  - truthy to request sorted iteration
  #
  # With a block, calls it with two arguments per entry and returns the
  # result of the native search. Without a block, returns an Array of
  # two-element Arrays collected from those same calls.
  #
  # Raises ArgumentError for a negative limit or an unknown option key.
  def search_with_prefix prefix, opts={}, &p
    opts = opts.dup
    limit = opts.delete :limit
    if !limit.nil? and limit < 0
      # Zero passes the check above, so the message must say ">= 0".
      raise ArgumentError, "Limit should be >= 0"
    end
    sort = opts.delete :sort
    unless opts.empty?
      raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:limit, :sort] are allowed"
    end

    if p
      _internal_search prefix, limit, sort, p
    else
      a = []
      _internal_search prefix, limit, sort, -> k, v {a << [k, v]}
      a
    end
  end
end
|
data/readme.md
ADDED
@@ -0,0 +1,174 @@
|
|
1
|
+
## What
|
2
|
+
|
3
|
+
Pragmatic [trie](http://en.wikipedia.org/wiki/Trie) for Ruby.
|
4
|
+
|
5
|
+
It is fast, memory efficient, and Unicode aware.
|
6
|
+
|
7
|
+
The backend of *triez* is a cache oblivious data structure: the [HAT trie](https://github.com/dcjones/hat-trie). It is generally faster and more memory efficient than double arrays or burst tries.
|
8
|
+
|
9
|
+
## Requirement
|
10
|
+
|
11
|
+
- Ruby 1.9
|
12
|
+
- `g++` or `clang`
|
13
|
+
|
14
|
+
## Install
|
15
|
+
|
16
|
+
``` bash
|
17
|
+
gem ins triez
|
18
|
+
```
|
19
|
+
|
20
|
+
## Synopsis
|
21
|
+
|
22
|
+
``` ruby
|
23
|
+
require 'triez'
|
24
|
+
|
25
|
+
t = Triez.new
|
26
|
+
|
27
|
+
# insertion
|
28
|
+
t['key'] = 100
|
29
|
+
|
30
|
+
# insert a key with default value (0 for normal triez, nil for obj_valued triez)
|
31
|
+
t << 'key'
|
32
|
+
|
33
|
+
# search
|
34
|
+
t.has_key? 'key'
|
35
|
+
t['key']
|
36
|
+
|
37
|
+
# iterate over values under a prefix.
|
38
|
+
t.search_with_prefix(prefix, limit: 10, sort: true) do |suffix, value|
|
39
|
+
...
|
40
|
+
end
|
41
|
+
|
42
|
+
# enumerate (NOTE it is unordered)
|
43
|
+
t.each do |key, value|
|
44
|
+
...
|
45
|
+
end
|
46
|
+
```
|
47
|
+
|
48
|
+
---
|
49
|
+
|
50
|
+
By default, a *triez* stores signed 64-bit integers; you can use them as weights, counts or database IDs, and they cost no time in the GC marking phase. In case you need to store arbitrary objects in a node, use `obj_value: true`:
|
51
|
+
|
52
|
+
``` ruby
|
53
|
+
t = Triez.new obj_value: true
|
54
|
+
t['Tom'] = {name: 'Tom', sex: 'Female'}
|
55
|
+
t['Tree'] = [:leaf, :trunk, :root]
|
56
|
+
```
|
57
|
+
|
58
|
+
---
|
59
|
+
|
60
|
+
When a *triez* is initialized with `suffix: true`, it inserts all suffixes of a key:
|
61
|
+
|
62
|
+
``` ruby
|
63
|
+
t = Triez.new suffix: true
|
64
|
+
t['万塘路一锅鸡'] = 2
|
65
|
+
t['万塘路一锅鸡'] #=> 2
|
66
|
+
t['塘路一锅鸡'] #=> 2
|
67
|
+
t['路一锅鸡'] #=> 2
|
68
|
+
t['一锅鸡'] #=> 2
|
69
|
+
t['锅鸡'] #=> 2
|
70
|
+
t['鸡'] #=> 2
|
71
|
+
```
|
72
|
+
|
73
|
+
You can batch change values with a block
|
74
|
+
|
75
|
+
``` ruby
|
76
|
+
# v *= 5 for 'abcd', 'bcd', 'cd', 'd'
|
77
|
+
t.alt 'abcd' do |v|
|
78
|
+
v * 5
|
79
|
+
end
|
80
|
+
t['abcd'] #=> 10
|
81
|
+
t['cd'] #=> 10
|
82
|
+
```
|
83
|
+
|
84
|
+
---
|
85
|
+
|
86
|
+
Misc methods
|
87
|
+
|
88
|
+
``` ruby
|
89
|
+
# if it is a suffix trie
|
90
|
+
t.suffix?
|
91
|
+
# if the value type is object
|
92
|
+
t.obj_value?
|
93
|
+
```
|
94
|
+
|
95
|
+
## Examples
|
96
|
+
|
97
|
+
Prefix-based autocompletion:
|
98
|
+
|
99
|
+
``` ruby
|
100
|
+
require 'triez'
|
101
|
+
words = %w[readme rot red rah rasterization]
|
102
|
+
t = Triez.new
|
103
|
+
words.each do |word|
|
104
|
+
t[word] = 1
|
105
|
+
end
|
106
|
+
t.search_with_prefix 're' do |word|
|
107
|
+
puts "candidate: #{word}"
|
108
|
+
end
|
109
|
+
```
|
110
|
+
|
111
|
+
The output:
|
112
|
+
|
113
|
+
```bash
|
114
|
+
candidate: readme
|
115
|
+
candidate: red
|
116
|
+
```
|
117
|
+
|
118
|
+
---
|
119
|
+
|
120
|
+
Efficiently search for strings containing a substring:
|
121
|
+
|
122
|
+
``` ruby
|
123
|
+
require 'triez'
|
124
|
+
sequences = {
|
125
|
+
'ACTGAAAAAAACTG' => 1,
|
126
|
+
'ATACGGTCCA' => 2,
|
127
|
+
'GCTTGTACGT' => 3
|
128
|
+
}
|
129
|
+
t = Triez.new suffix: true
|
130
|
+
sequences.each do |seq, id|
|
131
|
+
t[seq] = id
|
132
|
+
end
|
133
|
+
t.search_with_prefix 'CGGT' do |_, id|
|
134
|
+
puts id #=> 2
|
135
|
+
end
|
136
|
+
```
|
137
|
+
|
138
|
+
The search time is linear to the length of the substring.
|
139
|
+
|
140
|
+
## Benchmarks
|
141
|
+
|
142
|
+
Here's a benchmark on
|
143
|
+
|
144
|
+
```ruby
|
145
|
+
ruby 1.9.3p374 (2013-01-15 revision 38858) [x86_64-darwin12.2.1]
|
146
|
+
2.3 GHz Intel Core i7
|
147
|
+
```
|
148
|
+
|
149
|
+
The test data is 3 million titles of Wikipedia articles (from http://dumps.wikimedia.org/enwiki/20121101/)
|
150
|
+
|
151
|
+
```
|
152
|
+
thing/backend | memory | insertion time | 3 M query
|
153
|
+
-------------------|---------|----------------|----------
|
154
|
+
hash/linked hash | 340.2 M | 4.369 s | 0.2800 s
|
155
|
+
trie/double array* | 155.6 M | 130.7 s | 0.4359 s
|
156
|
+
triez/HAT trie | 121.7 M | 3.872 s | 0.3472 s
|
157
|
+
```
|
158
|
+
|
159
|
+
NOTE: `trie/double array` -> https://github.com/tyler/trie
|
160
|
+
|
161
|
+
## Caveats
|
162
|
+
|
163
|
+
- `sort` orders keys with binary collation, not unicode codepoint collation in string comparison.
|
164
|
+
- For some rare case of many threads modifying the same trie, you may need a mutex.
|
165
|
+
- If you still find memory insufficient, you may consider [MARISA-trie](https://code.google.com/p/marisa-trie/) (NOTE that MARISA is immutable) or a database.
|
166
|
+
|
167
|
+
## Development
|
168
|
+
|
169
|
+
``` bash
|
170
|
+
git clone git://github.com/luikore/triez.git
|
171
|
+
cd triez
|
172
|
+
rake glob_src
|
173
|
+
rake
|
174
|
+
```
|