triez 0.3 → 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/hat-trie/misc.c +46 -0
- data/ext/triez.cc +54 -51
- data/lib/triez.rb +22 -7
- data/readme.md +96 -63
- data/test/triez_test.rb +89 -17
- metadata +2 -2
- data/ext/hat-stub.c +0 -14
data/ext/hat-trie/misc.c
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
/*
|
2
|
+
* This file is part of hat-trie.
|
3
|
+
*
|
4
|
+
* Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
|
5
|
+
*
|
6
|
+
*/
|
7
|
+
|
8
|
+
#include "misc.h"
|
9
|
+
#include <stdlib.h>
|
10
|
+
|
11
|
+
|
12
|
+
void* malloc_or_die(size_t n)
|
13
|
+
{
|
14
|
+
void* p = malloc(n);
|
15
|
+
if (p == NULL && n != 0) {
|
16
|
+
fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
|
17
|
+
exit(EXIT_FAILURE);
|
18
|
+
}
|
19
|
+
return p;
|
20
|
+
}
|
21
|
+
|
22
|
+
|
23
|
+
void* realloc_or_die(void* ptr, size_t n)
|
24
|
+
{
|
25
|
+
void* p = realloc(ptr, n);
|
26
|
+
if (p == NULL && n != 0) {
|
27
|
+
fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
|
28
|
+
exit(EXIT_FAILURE);
|
29
|
+
}
|
30
|
+
return p;
|
31
|
+
}
|
32
|
+
|
33
|
+
|
34
|
+
FILE* fopen_or_die(const char* path, const char* mode)
|
35
|
+
{
|
36
|
+
FILE* f = fopen(path, mode);
|
37
|
+
if (f == NULL) {
|
38
|
+
fprintf(stderr, "Cannot open file %s with mode %s.\n", path, mode);
|
39
|
+
exit(EXIT_FAILURE);
|
40
|
+
}
|
41
|
+
return f;
|
42
|
+
}
|
43
|
+
|
44
|
+
|
45
|
+
|
46
|
+
|
data/ext/triez.cc
CHANGED
@@ -29,11 +29,11 @@ static inline VALUE LL2V(long long l) {
|
|
29
29
|
|
30
30
|
struct HatTrie {
|
31
31
|
hattrie_t* p;
|
32
|
+
VALUE default_value;
|
32
33
|
bool obj_value;
|
33
|
-
bool suffix;
|
34
34
|
bool initialized;
|
35
35
|
|
36
|
-
HatTrie() :
|
36
|
+
HatTrie() : default_value(Qnil), obj_value(false), initialized(false) {
|
37
37
|
p = hattrie_create();
|
38
38
|
}
|
39
39
|
|
@@ -44,6 +44,9 @@ struct HatTrie {
|
|
44
44
|
|
45
45
|
static void hat_mark(void* p_ht) {
|
46
46
|
HatTrie* ht = (HatTrie*)p_ht;
|
47
|
+
if (!IMMEDIATE_P(ht->default_value)) {
|
48
|
+
rb_gc_mark(ht->default_value);
|
49
|
+
}
|
47
50
|
if (!ht->obj_value) {
|
48
51
|
return;
|
49
52
|
}
|
@@ -75,29 +78,23 @@ static VALUE hat_alloc(VALUE self) {
|
|
75
78
|
Check_Type(key, T_STRING);\
|
76
79
|
key = unify_key(key);
|
77
80
|
|
78
|
-
static VALUE hat_set_type(VALUE self, VALUE obj_value, VALUE
|
81
|
+
static VALUE hat_set_type(VALUE self, VALUE obj_value, VALUE default_value) {
|
79
82
|
HatTrie* ht;
|
80
83
|
Data_Get_Struct(self, HatTrie, ht);
|
81
84
|
if (ht->initialized) {
|
82
85
|
rb_raise(rb_eRuntimeError, "Already initialized");
|
83
86
|
return self;
|
84
87
|
}
|
88
|
+
ht->default_value = default_value;
|
85
89
|
ht->obj_value = RTEST(obj_value);
|
86
|
-
ht->suffix = RTEST(suffix);
|
87
90
|
ht->initialized = true;
|
88
91
|
return self;
|
89
92
|
}
|
90
93
|
|
91
|
-
static VALUE
|
92
|
-
HatTrie* ht;
|
93
|
-
Data_Get_Struct(self, HatTrie, ht);
|
94
|
-
return ht->obj_value ? Qtrue : Qfalse;
|
95
|
-
}
|
96
|
-
|
97
|
-
static VALUE hat_suffix_p(VALUE self) {
|
94
|
+
static VALUE hat_value_type(VALUE self) {
|
98
95
|
HatTrie* ht;
|
99
96
|
Data_Get_Struct(self, HatTrie, ht);
|
100
|
-
return ht->
|
97
|
+
return ht->obj_value ? ID2SYM(rb_intern("object")) : ID2SYM(rb_intern("int64"));
|
101
98
|
}
|
102
99
|
|
103
100
|
static VALUE hat_size(VALUE self) {
|
@@ -111,50 +108,58 @@ static VALUE hat_set(VALUE self, VALUE key, VALUE value) {
|
|
111
108
|
long long v = ht->obj_value ? value : NUM2LL(value);
|
112
109
|
char* s = RSTRING_PTR(key);
|
113
110
|
size_t len = RSTRING_LEN(key);
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
111
|
+
hattrie_get(p, s, len)[0] = v;
|
112
|
+
return self;
|
113
|
+
}
|
114
|
+
|
115
|
+
static inline void hat_change(HatTrie* ht, hattrie_t* p, char* s, size_t len) {
|
116
|
+
// NOTE must use 2-step change, because the block may change the trie
|
117
|
+
value_t* vp = hattrie_tryget(p, s, len);
|
118
|
+
long long v;
|
119
|
+
if (ht->obj_value) {
|
120
|
+
VALUE value = vp ? LL2V(vp[0]) : ht->default_value;
|
121
|
+
v = V2LL(rb_yield(value));
|
121
122
|
} else {
|
122
|
-
|
123
|
+
VALUE value = vp ? LL2NUM(vp[0]) : ht->default_value;
|
124
|
+
v = NUM2LL(rb_yield(value));
|
125
|
+
}
|
126
|
+
hattrie_get(p, s, len)[0] = v;
|
127
|
+
}
|
128
|
+
|
129
|
+
static inline void hat_change_prefix(HatTrie* ht, hattrie_t* p, char* s, size_t len, char* rs) {
|
130
|
+
char* rs_end = rs + len;
|
131
|
+
long n;
|
132
|
+
for (; rs < rs_end; rs += n, len -= n) {
|
133
|
+
hat_change(ht, p, s, len);
|
134
|
+
// no need check encoding because reverse succeeded
|
135
|
+
n = rb_enc_fast_mbclen(rs, rs_end, u8_enc);
|
123
136
|
}
|
124
|
-
return self;
|
125
137
|
}
|
126
138
|
|
127
|
-
static VALUE
|
139
|
+
static VALUE hat_change_all(VALUE self, VALUE type, VALUE key) {
|
128
140
|
PRE_HAT;
|
129
141
|
char* s = RSTRING_PTR(key);
|
130
142
|
size_t len = RSTRING_LEN(key);
|
131
|
-
|
143
|
+
ID ty = SYM2ID(type);
|
144
|
+
if (ty == rb_intern("suffix")) {
|
132
145
|
char* s_end = s + len;
|
133
146
|
long n;
|
134
147
|
for (; s < s_end; s += n, len -= n) {
|
148
|
+
hat_change(ht, p, s, len);
|
135
149
|
n = rb_enc_mbclen(s, s_end, u8_enc);
|
136
|
-
value_t* vp = hattrie_tryget(p, s, len);
|
137
|
-
long long v;
|
138
|
-
if (ht->obj_value) {
|
139
|
-
VALUE value = vp ? LL2V(vp[0]) : Qnil;
|
140
|
-
v = V2LL(rb_yield(value));
|
141
|
-
} else {
|
142
|
-
VALUE value = vp ? LL2NUM(vp[0]) : LL2NUM(0);
|
143
|
-
v = NUM2LL(rb_yield(value));
|
144
|
-
}
|
145
|
-
hattrie_get(p, s, len)[0] = v;
|
146
150
|
}
|
147
|
-
} else {
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
151
|
+
} else if (ty == rb_intern("prefix")) {
|
152
|
+
volatile VALUE reversed = rb_funcall(key, rb_intern("reverse"), 0);
|
153
|
+
hat_change_prefix(ht, p, s, len, RSTRING_PTR(reversed));
|
154
|
+
} else if (ty == rb_intern("substring")) {
|
155
|
+
volatile VALUE reversed = rb_funcall(key, rb_intern("reverse"), 0);
|
156
|
+
char* rs = RSTRING_PTR(reversed);
|
157
|
+
char* s_end = s + len;
|
158
|
+
long n;
|
159
|
+
for (; s < s_end; s += n, len -= n) {
|
160
|
+
hat_change_prefix(ht, p, s, len, rs);
|
161
|
+
n = rb_enc_fast_mbclen(s, s_end, u8_enc);
|
156
162
|
}
|
157
|
-
hattrie_get(p, s, len)[0] = v;
|
158
163
|
}
|
159
164
|
return self;
|
160
165
|
}
|
@@ -162,7 +167,7 @@ static VALUE hat_alt(VALUE self, VALUE key) {
|
|
162
167
|
static VALUE hat_append(VALUE self, VALUE key) {
|
163
168
|
HatTrie* ht;
|
164
169
|
Data_Get_Struct(self, HatTrie, ht);
|
165
|
-
return hat_set(self, key, ht->
|
170
|
+
return hat_set(self, key, ht->default_value);
|
166
171
|
}
|
167
172
|
|
168
173
|
static VALUE hat_get(VALUE self, VALUE key) {
|
@@ -171,7 +176,7 @@ static VALUE hat_get(VALUE self, VALUE key) {
|
|
171
176
|
if (vt) {
|
172
177
|
return ht->obj_value ? (*vt) : LL2NUM(*vt);
|
173
178
|
} else {
|
174
|
-
return
|
179
|
+
return ht->default_value;
|
175
180
|
}
|
176
181
|
}
|
177
182
|
|
@@ -184,7 +189,7 @@ static VALUE hat_del(VALUE self, VALUE key) {
|
|
184
189
|
hattrie_del(p, RSTRING_PTR(key), RSTRING_LEN(key));
|
185
190
|
return ht->obj_value ? (*vt) : LL2NUM(*vt);
|
186
191
|
} else {
|
187
|
-
return
|
192
|
+
return ht->default_value;
|
188
193
|
}
|
189
194
|
}
|
190
195
|
|
@@ -205,7 +210,6 @@ static VALUE hat_search_callback(VALUE data) {
|
|
205
210
|
return rb_funcall(p->callback, rb_intern("call"), 2, p->suffix, p->value);
|
206
211
|
}
|
207
212
|
|
208
|
-
// returns: true if an error occured
|
209
213
|
static VALUE hat_search(VALUE self, VALUE key, VALUE vlimit, VALUE vsort, VALUE callback) {
|
210
214
|
PRE_HAT;
|
211
215
|
long limit = 0;
|
@@ -248,14 +252,13 @@ void Init_triez() {
|
|
248
252
|
|
249
253
|
rb_define_alloc_func(hat_class, hat_alloc);
|
250
254
|
DEF(hat_class, "_internal_set_type", hat_set_type, 2);
|
251
|
-
DEF(hat_class, "
|
252
|
-
DEF(hat_class, "suffix?", hat_suffix_p, 0);
|
255
|
+
DEF(hat_class, "value_type", hat_value_type, 0);
|
253
256
|
DEF(hat_class, "size", hat_size, 0);
|
254
|
-
DEF(hat_class, "has_key?", hat_check, 1);
|
255
257
|
DEF(hat_class, "[]=", hat_set, 2);
|
256
|
-
DEF(hat_class, "
|
258
|
+
DEF(hat_class, "change_all", hat_change_all, 2);
|
257
259
|
DEF(hat_class, "<<", hat_append, 1);
|
258
260
|
DEF(hat_class, "[]", hat_get, 1);
|
261
|
+
DEF(hat_class, "has_key?", hat_check, 1);
|
259
262
|
DEF(hat_class, "delete", hat_del, 1);
|
260
263
|
DEF(hat_class, "_internal_search", hat_search, 4);
|
261
264
|
}
|
data/lib/triez.rb
CHANGED
@@ -1,34 +1,49 @@
|
|
1
1
|
require_relative "../ext/triez"
|
2
2
|
|
3
3
|
class Triez
|
4
|
-
VERSION = '0
|
4
|
+
VERSION = '1.0'
|
5
5
|
|
6
6
|
private :_internal_set_type
|
7
7
|
private :_internal_search
|
8
8
|
|
9
9
|
def initialize opts={}
|
10
10
|
opts = opts.dup
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
|
12
|
+
value_type = opts.delete :value_type
|
13
|
+
if value_type.nil?
|
14
|
+
value_type = :int64
|
15
|
+
elsif value_type != :int64 and value_type != :object
|
16
|
+
raise ArgumentError, "value_type should be :int64 or :object, but got #{value_type.inspect}"
|
17
|
+
end
|
18
|
+
|
19
|
+
default = opts.delete :default
|
20
|
+
if default.nil?
|
21
|
+
default = (value_type == :int64 ? 0 : nil)
|
22
|
+
elsif value_type == :int64
|
23
|
+
default = default.to_i
|
24
|
+
end
|
25
|
+
|
15
26
|
unless opts.empty?
|
16
|
-
raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:
|
27
|
+
raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:value_type, :default] are allowed"
|
17
28
|
end
|
18
|
-
|
29
|
+
|
30
|
+
_internal_set_type value_type == :object, default
|
19
31
|
end
|
20
32
|
|
21
33
|
def each &p
|
22
34
|
raise ArgumentError, 'Need a block' unless p
|
35
|
+
|
23
36
|
_internal_search '', nil, true, p
|
24
37
|
end
|
25
38
|
|
26
39
|
def search_with_prefix prefix, opts={}, &p
|
27
40
|
opts = opts.dup
|
41
|
+
|
28
42
|
limit = opts.delete :limit
|
29
43
|
if !limit.nil? and limit < 0
|
30
44
|
raise ArgumentError, "Limit should be > 0"
|
31
45
|
end
|
46
|
+
|
32
47
|
sort = opts.delete :sort
|
33
48
|
unless opts.empty?
|
34
49
|
raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:limit, :sort] are allowed"
|
data/readme.md
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
## What
|
2
2
|
|
3
|
-
Pragmatic [
|
3
|
+
Pragmatic [tries](http://en.wikipedia.org/wiki/Trie) for Ruby, spelled in lolcat.
|
4
4
|
|
5
|
-
It is fast, memory efficient, unicode aware.
|
5
|
+
It is fast, memory efficient, unicode aware, prefix searchable, and enhanced with prefix/suffix/substring keys.
|
6
6
|
|
7
|
-
The backend of *triez* is a cache oblivious data structure: the [HAT trie](https://github.com/dcjones/hat-trie).
|
7
|
+
The backend of *triez* is a cache oblivious data structure: the [HAT trie](https://github.com/dcjones/hat-trie) (In fact I'm using a [modified version](https://github.com/luikore/hat-trie) for improved functionality). HAT trie is generally faster and more memory efficient than [double array](http://linux.thai.net/~thep/datrie/datrie.html) or [burst trie](http://ww2.cs.mu.oz.au/~jz/fulltext/acmtois02.pdf).
|
8
8
|
|
9
9
|
## Requirement
|
10
10
|
|
@@ -22,32 +22,57 @@ gem ins triez
|
|
22
22
|
``` ruby
|
23
23
|
require 'triez'
|
24
24
|
|
25
|
+
# create triez
|
25
26
|
t = Triez.new
|
26
27
|
|
27
|
-
#
|
28
|
+
# the above code is equivalent to :int64 for :value_type and 0 for :default
|
29
|
+
t = Triez.new value_type: :int64
|
30
|
+
|
31
|
+
# more flexible with object type [*see note below]
|
32
|
+
t = Triez.new value_type: :object
|
33
|
+
|
34
|
+
# get the value type
|
35
|
+
t.value_type
|
36
|
+
|
37
|
+
# set a different default value
|
38
|
+
t = Triez.new value_type: :object, default: 'hello'
|
39
|
+
|
40
|
+
# insert or change value
|
28
41
|
t['key'] = 100
|
29
42
|
|
30
|
-
# insert a key with default value
|
43
|
+
# insert a key with default value
|
31
44
|
t << 'key'
|
32
45
|
|
33
|
-
#
|
46
|
+
# batch change values under all suffixes/prefixes/substrings of a key
|
47
|
+
t.change_all(:suffix, 'key') {|old_value| ...calculate new value }
|
48
|
+
t.change_all(:prefix, 'key') {|old_value| ...calculate new value }
|
49
|
+
# enumerates all occurrences of substrings of the key
|
50
|
+
t.change_all(:substring, 'key') {|old_value| ...calculate new value }
|
51
|
+
|
52
|
+
# size of inserted keys
|
53
|
+
t.size
|
54
|
+
|
55
|
+
# search with exact match
|
34
56
|
t.has_key? 'key'
|
35
57
|
t['key']
|
36
58
|
|
37
|
-
# iterate over values under a prefix
|
59
|
+
# prefixed search (iterate over values under a prefix), available options are:
|
60
|
+
# - limit: max items, `nil` means no limit
|
61
|
+
# - sort: whether to iterate in alphabetical order, default is true
|
38
62
|
t.search_with_prefix(prefix, limit: 10, sort: true) do |suffix, value|
|
39
63
|
...
|
40
64
|
end
|
41
65
|
|
42
|
-
#
|
66
|
+
# if no block given, an array in the form of [[suffix, value]] is returned
|
67
|
+
t.search_with_prefix('prefix')
|
68
|
+
|
69
|
+
# enumerate all keys and values in the order of binary collation
|
43
70
|
t.each do |key, value|
|
44
71
|
...
|
45
72
|
end
|
46
73
|
```
|
47
74
|
|
48
|
-
|
49
|
-
|
50
|
-
By default, a *triez* stores signed integers within 64bits, you can use it as weights, counts or database IDs, and doesn't cost any time in GC marking phase. In case you need to store arbitrary object in a node, use `obj_value: true`:
|
75
|
+
\* Note: By default, *triez* stores signed 64-bit integers, which you can use as weights, counts or database IDs. In case you need to store arbitrary objects in a node, use `value_type: :object`:
|
51
76
|
|
52
77
|
``` ruby
|
53
78
|
t = Triez.new value_type: :object
|
@@ -55,46 +80,9 @@ t['Tom'] = {name: 'Tom', sex: 'Female'}
|
|
55
80
|
t['Tree'] = [:leaf, :trunk, :root]
|
56
81
|
```
|
57
82
|
|
58
|
-
---
|
59
|
-
|
60
|
-
When a *triez* is initialized with `suffix: true`, it inserts all suffices of a key
|
61
|
-
|
62
|
-
``` ruby
|
63
|
-
t = Triez.new suffix: true
|
64
|
-
t['万塘路一锅鸡'] = 2
|
65
|
-
t['万塘路一锅鸡'] #=> 2
|
66
|
-
t['塘路一锅鸡'] #=> 2
|
67
|
-
t['路一锅鸡'] #=> 2
|
68
|
-
t['一锅鸡'] #=> 2
|
69
|
-
t['锅鸡'] #=> 2
|
70
|
-
t['鸡'] #=> 2
|
71
|
-
```
|
72
|
-
|
73
|
-
You can batch change values with a block
|
74
|
-
|
75
|
-
``` ruby
|
76
|
-
# v *= 5 for 'abcd', 'bcd', 'cd', 'd'
|
77
|
-
t.alt 'abcd' do |v|
|
78
|
-
v * 5
|
79
|
-
end
|
80
|
-
t['abcd'] #=> 10
|
81
|
-
t['cd'] #=> 10
|
82
|
-
```
|
83
|
-
|
84
|
-
---
|
85
|
-
|
86
|
-
Misc methods
|
87
|
-
|
88
|
-
``` ruby
|
89
|
-
# if it is a suffix trie
|
90
|
-
t.suffix?
|
91
|
-
# if the value type is object
|
92
|
-
t.obj_value?
|
93
|
-
```
|
94
|
-
|
95
83
|
## Examples
|
96
84
|
|
97
|
-
Prefix
|
85
|
+
**Prefix based autocompletion**:
|
98
86
|
|
99
87
|
``` ruby
|
100
88
|
require 'triez'
|
@@ -117,7 +105,7 @@ candidate: red
|
|
117
105
|
|
118
106
|
---
|
119
107
|
|
120
|
-
|
108
|
+
**Efficient [full text search](https://en.wikipedia.org/wiki/Full_text_search) with a [suffix tree](https://en.wikipedia.org/wiki/Suffix_tree)**:
|
121
109
|
|
122
110
|
``` ruby
|
123
111
|
require 'triez'
|
@@ -126,18 +114,59 @@ sequences = {
|
|
126
114
|
'ATACGGTCCA' => 2,
|
127
115
|
'GCTTGTACGT' => 3
|
128
116
|
}
|
129
|
-
t = Triez.new
|
117
|
+
t = Triez.new
|
118
|
+
|
119
|
+
# build suffix trie
|
130
120
|
sequences.each do |seq, id|
|
131
|
-
t[seq] = id
|
121
|
+
t.change_all(:suffix, seq) { id }
|
132
122
|
end
|
123
|
+
|
133
124
|
t.search_with_prefix 'CGGT' do |_, id|
|
134
125
|
puts id #=> 2
|
135
126
|
end
|
136
127
|
```
|
137
128
|
|
138
|
-
The
|
129
|
+
The search time is linear in the length of the substring.
|
130
|
+
|
131
|
+
---
|
139
132
|
|
140
|
-
|
133
|
+
**Solve the [longest common substring problem](https://en.wikipedia.org/wiki/Longest_common_substring_problem)**:
|
134
|
+
|
135
|
+
``` ruby
|
136
|
+
# coding: utf-8
|
137
|
+
require 'triez'
|
138
|
+
sentences = %w[
|
139
|
+
万塘路一锅鸡
|
140
|
+
去文二路一锅鸡吃饭
|
141
|
+
来一锅鸡顶盒
|
142
|
+
一锅鸡胗
|
143
|
+
]
|
144
|
+
|
145
|
+
# value is bitset representing id of the sentence
|
146
|
+
# in ruby we can use integers of arbitrary length as bitsets
|
147
|
+
t = Triez.new value_type: :object, default: 0
|
148
|
+
|
149
|
+
sentences.each_with_index do |sentence, i|
|
150
|
+
elem = 1 << i
|
151
|
+
t.change_all :substring, sentence do |v|
|
152
|
+
# union
|
153
|
+
v | elem
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
# longest common substring
|
158
|
+
lcs = ''
|
159
|
+
|
160
|
+
# find the key tagged with universe
|
161
|
+
universe = (1 << sentences.size) - 1
|
162
|
+
t.each do |k, v|
|
163
|
+
lcs = k if k.size > lcs.size and v == universe
|
164
|
+
end
|
165
|
+
|
166
|
+
puts lcs #=> 一锅鸡
|
167
|
+
```
|
168
|
+
|
169
|
+
## Benchmark
|
141
170
|
|
142
171
|
Here's a benchmark on
|
143
172
|
|
@@ -146,23 +175,23 @@ ruby 1.9.3p374 (2013-01-15 revision 38858) [x86_64-darwin12.2.1]
|
|
146
175
|
2.3 GHz Intel Core i7
|
147
176
|
```
|
148
177
|
|
149
|
-
The test data
|
178
|
+
The test data are 3 million titles of wikipedia articles (from http://dumps.wikimedia.org/enwiki/20121101/)
|
150
179
|
|
151
180
|
```
|
152
|
-
thing/backend
|
153
|
-
|
154
|
-
hash/linked hash
|
155
|
-
|
156
|
-
triez/HAT trie
|
181
|
+
thing/backend | memory | insertion time | 3 M query
|
182
|
+
------------------------|---------|----------------|----------
|
183
|
+
hash/linked hash | 340.2 M | 4.369 s | 0.2800 s
|
184
|
+
fast_trie/double array* | 155.6 M | 130.7 s | 0.4359 s
|
185
|
+
triez/HAT trie | 121.7 M | 3.872 s | 0.3472 s
|
157
186
|
```
|
158
187
|
|
159
|
-
|
188
|
+
\* Note: `fast_trie/double array` -> https://github.com/tyler/trie
|
160
189
|
|
161
190
|
## Caveats
|
162
191
|
|
163
|
-
- `sort` orders keys with binary collation,
|
192
|
+
- The `sort` option in prefixed search orders keys with binary [collation](https://en.wikipedia.org/wiki/Collation), but string comparison in Ruby is with unicode codepoint collation.
|
164
193
|
- For some rare case of many threads modifying the same trie, you may need a mutex.
|
165
|
-
- If you still feel memory not enough, you may consider [MARISA-trie](https://code.google.com/p/marisa-trie/) (
|
194
|
+
- If memory is still not enough, you may consider [MARISA-trie](https://code.google.com/p/marisa-trie/) (note that MARISA is immutable), or a database.
|
166
195
|
|
167
196
|
## Development
|
168
197
|
|
@@ -172,3 +201,7 @@ cd triez
|
|
172
201
|
rake glob_src
|
173
202
|
rake
|
174
203
|
```
|
204
|
+
|
205
|
+
## Note
|
206
|
+
|
207
|
+
Although HAT trie uses MurmurHash3 instead of the SipHash used in Ruby, it is still safe against hashDoS because the bucket size is limited.
|
data/test/triez_test.rb
CHANGED
@@ -1,20 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
1
2
|
require "test/unit"
|
2
3
|
require_relative "../lib/triez"
|
3
4
|
|
4
5
|
GC.stress
|
5
6
|
|
6
7
|
class TriezTest < Test::Unit::TestCase
|
7
|
-
def
|
8
|
-
t = Triez.new
|
9
|
-
assert_equal
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
assert_equal
|
8
|
+
def test_init_type_options
|
9
|
+
t = Triez.new value_type: :int64
|
10
|
+
assert_equal :int64, t.value_type
|
11
|
+
t = Triez.new value_type: :object
|
12
|
+
assert_equal :object, t.value_type
|
13
|
+
t = Triez.new
|
14
|
+
assert_equal :int64, t.value_type
|
15
|
+
|
16
|
+
assert_raise ArgumentError do
|
17
|
+
Triez.new value_type: :string
|
18
|
+
end
|
19
|
+
assert_raise ArgumentError do
|
20
|
+
Triez.new invalid_option: :int64
|
21
|
+
end
|
14
22
|
end
|
15
23
|
|
16
24
|
def test_hat_trie
|
17
|
-
t = Triez.new
|
25
|
+
t = Triez.new value_type: :object
|
18
26
|
|
19
27
|
v1 = (1 << 40)
|
20
28
|
v2 = (1 << 141)
|
@@ -87,21 +95,15 @@ class TriezTest < Test::Unit::TestCase
|
|
87
95
|
assert_equal true, t.has_key?('c')
|
88
96
|
end
|
89
97
|
|
90
|
-
def test_suffix_insert
|
91
|
-
t = Triez.new suffix: true
|
92
|
-
t << '12345'
|
93
|
-
assert_equal 5, t.size
|
94
|
-
end
|
95
|
-
|
96
98
|
def test_full_text_search
|
97
99
|
sequences = {
|
98
100
|
'ACTGAAAAAAACTG' => 1,
|
99
101
|
'ATACGGTCCA' => 2,
|
100
102
|
'GCTTGTACGT' => 3
|
101
103
|
}
|
102
|
-
t = Triez.new
|
104
|
+
t = Triez.new
|
103
105
|
sequences.each do |seq, id|
|
104
|
-
t
|
106
|
+
t.change_all(:suffix, seq){ id }
|
105
107
|
end
|
106
108
|
assert_equal 2, t.search_with_prefix('CGGT').map(&:last).flatten.first
|
107
109
|
end
|
@@ -111,6 +113,76 @@ class TriezTest < Test::Unit::TestCase
|
|
111
113
|
t["a\0b"] = 1
|
112
114
|
assert_equal 1, t["a\0b"]
|
113
115
|
assert_equal 1, t.size
|
114
|
-
assert_equal
|
116
|
+
assert_equal 0, t["a"]
|
117
|
+
end
|
118
|
+
|
119
|
+
def test_change_all_with_prefix
|
120
|
+
default = 10
|
121
|
+
t = Triez.new default: default
|
122
|
+
t['regexp'] = 1
|
123
|
+
t['readme'] = 2
|
124
|
+
t.change_all :prefix, 'readme' do |v|
|
125
|
+
v += 4
|
126
|
+
end
|
127
|
+
assert_equal 'readme'.size + 1, t.size
|
128
|
+
assert_equal 6, t['readme']
|
129
|
+
assert_equal default + 4, t['read']
|
130
|
+
assert_equal 1, t['regexp']
|
131
|
+
end
|
132
|
+
|
133
|
+
def test_change_all_with_suffix
|
134
|
+
t = Triez.new
|
135
|
+
t['regexp'] = 1
|
136
|
+
t['exp'] = 2
|
137
|
+
t['reg'] = 3
|
138
|
+
t.change_all :suffix, 'regexp' do |v|
|
139
|
+
v += 4
|
140
|
+
end
|
141
|
+
assert_equal 5, t['regexp']
|
142
|
+
assert_equal 6, t['exp']
|
143
|
+
assert_equal 3, t['reg']
|
144
|
+
assert_equal 'regexp'.size + 1, t.size
|
145
|
+
end
|
146
|
+
|
147
|
+
def test_change_all_with_substring
|
148
|
+
t = Triez.new value_type: :object
|
149
|
+
t.change_all :substring, 'abc' do
|
150
|
+
1
|
151
|
+
end
|
152
|
+
|
153
|
+
keys = []
|
154
|
+
t.each do |k, v|
|
155
|
+
keys << k
|
156
|
+
end
|
157
|
+
assert_equal %w[a b c ab bc abc].sort, keys.sort
|
158
|
+
end
|
159
|
+
|
160
|
+
def test_solve_longest_common_substring
|
161
|
+
sentences = %w[
|
162
|
+
万塘路一锅鸡
|
163
|
+
文二路一锅鸡
|
164
|
+
来一锅鸡顶盒
|
165
|
+
一锅鸡胗
|
166
|
+
]
|
167
|
+
|
168
|
+
# value is bitset representing id of the sentence
|
169
|
+
# in ruby we can use integers of arbitrary length as bitsets
|
170
|
+
t = Triez.new value_type: :object, default: 0
|
171
|
+
|
172
|
+
sentences.each_with_index do |sentence, i|
|
173
|
+
elem = 1 << i
|
174
|
+
t.change_all :substring, sentence do |v|
|
175
|
+
# union
|
176
|
+
v | elem
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
# longest common substring
|
181
|
+
lcs = ''
|
182
|
+
universe = (1 << sentences.size) - 1
|
183
|
+
t.each do |k, v|
|
184
|
+
lcs = k if (k.size > lcs.size and v == universe)
|
185
|
+
end
|
186
|
+
assert_equal '一锅鸡', lcs
|
115
187
|
end
|
116
188
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: triez
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0
|
4
|
+
version: '1.0'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -22,7 +22,6 @@ files:
|
|
22
22
|
- readme.md
|
23
23
|
- lib/triez.rb
|
24
24
|
- test/triez_test.rb
|
25
|
-
- ext/hat-stub.c
|
26
25
|
- ext/triez.cc
|
27
26
|
- ext/common.h
|
28
27
|
- ext/extconf.rb
|
@@ -31,6 +30,7 @@ files:
|
|
31
30
|
- ext/hat-trie/COPYING
|
32
31
|
- ext/hat-trie/hat-trie.c
|
33
32
|
- ext/hat-trie/hat-trie.h
|
33
|
+
- ext/hat-trie/misc.c
|
34
34
|
- ext/hat-trie/misc.h
|
35
35
|
- ext/hat-trie/murmurhash3.c
|
36
36
|
- ext/hat-trie/murmurhash3.h
|
data/ext/hat-stub.c
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
#include <ruby.h>
|
2
|
-
|
3
|
-
void* malloc_or_die(size_t sz) {
|
4
|
-
return malloc(sz);
|
5
|
-
}
|
6
|
-
|
7
|
-
void* realloc_or_die(void* p, size_t sz) {
|
8
|
-
return realloc(p, sz);
|
9
|
-
}
|
10
|
-
|
11
|
-
FILE* fopen_or_die(const char* file, const char* mode) {
|
12
|
-
// to do raise error
|
13
|
-
return fopen(file, mode);
|
14
|
-
}
|