triez 0.3 → 1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,46 @@
1
+ /*
2
+ * This file is part of hat-trie.
3
+ *
4
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
5
+ *
6
+ */
7
+
8
+ #include "misc.h"
9
+ #include <stdlib.h>
10
+
11
+
12
+ void* malloc_or_die(size_t n)
13
+ {
14
+ void* p = malloc(n);
15
+ if (p == NULL && n != 0) {
16
+ fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
17
+ exit(EXIT_FAILURE);
18
+ }
19
+ return p;
20
+ }
21
+
22
+
23
+ void* realloc_or_die(void* ptr, size_t n)
24
+ {
25
+ void* p = realloc(ptr, n);
26
+ if (p == NULL && n != 0) {
27
+ fprintf(stderr, "Cannot allocate %zu bytes.\n", n);
28
+ exit(EXIT_FAILURE);
29
+ }
30
+ return p;
31
+ }
32
+
33
+
34
+ FILE* fopen_or_die(const char* path, const char* mode)
35
+ {
36
+ FILE* f = fopen(path, mode);
37
+ if (f == NULL) {
38
+ fprintf(stderr, "Cannot open file %s with mode %s.\n", path, mode);
39
+ exit(EXIT_FAILURE);
40
+ }
41
+ return f;
42
+ }
43
+
44
+
45
+
46
+
data/ext/triez.cc CHANGED
@@ -29,11 +29,11 @@ static inline VALUE LL2V(long long l) {
29
29
 
30
30
  struct HatTrie {
31
31
  hattrie_t* p;
32
+ VALUE default_value;
32
33
  bool obj_value;
33
- bool suffix;
34
34
  bool initialized;
35
35
 
36
- HatTrie() : obj_value(false), suffix(false), initialized(false) {
36
+ HatTrie() : default_value(Qnil), obj_value(false), initialized(false) {
37
37
  p = hattrie_create();
38
38
  }
39
39
 
@@ -44,6 +44,9 @@ struct HatTrie {
44
44
 
45
45
  static void hat_mark(void* p_ht) {
46
46
  HatTrie* ht = (HatTrie*)p_ht;
47
+ if (!IMMEDIATE_P(ht->default_value)) {
48
+ rb_gc_mark(ht->default_value);
49
+ }
47
50
  if (!ht->obj_value) {
48
51
  return;
49
52
  }
@@ -75,29 +78,23 @@ static VALUE hat_alloc(VALUE self) {
75
78
  Check_Type(key, T_STRING);\
76
79
  key = unify_key(key);
77
80
 
78
- static VALUE hat_set_type(VALUE self, VALUE obj_value, VALUE suffix) {
81
+ static VALUE hat_set_type(VALUE self, VALUE obj_value, VALUE default_value) {
79
82
  HatTrie* ht;
80
83
  Data_Get_Struct(self, HatTrie, ht);
81
84
  if (ht->initialized) {
82
85
  rb_raise(rb_eRuntimeError, "Already initialized");
83
86
  return self;
84
87
  }
88
+ ht->default_value = default_value;
85
89
  ht->obj_value = RTEST(obj_value);
86
- ht->suffix = RTEST(suffix);
87
90
  ht->initialized = true;
88
91
  return self;
89
92
  }
90
93
 
91
- static VALUE hat_obj_value_p(VALUE self) {
92
- HatTrie* ht;
93
- Data_Get_Struct(self, HatTrie, ht);
94
- return ht->obj_value ? Qtrue : Qfalse;
95
- }
96
-
97
- static VALUE hat_suffix_p(VALUE self) {
94
+ static VALUE hat_value_type(VALUE self) {
98
95
  HatTrie* ht;
99
96
  Data_Get_Struct(self, HatTrie, ht);
100
- return ht->suffix ? Qtrue : Qfalse;
97
+ return ht->obj_value ? ID2SYM(rb_intern("object")) : ID2SYM(rb_intern("int64"));
101
98
  }
102
99
 
103
100
  static VALUE hat_size(VALUE self) {
@@ -111,50 +108,58 @@ static VALUE hat_set(VALUE self, VALUE key, VALUE value) {
111
108
  long long v = ht->obj_value ? value : NUM2LL(value);
112
109
  char* s = RSTRING_PTR(key);
113
110
  size_t len = RSTRING_LEN(key);
114
- if (ht->suffix) {
115
- char* s_end = s + len;
116
- long n;
117
- for (; s < s_end; s += n, len -= n) {
118
- n = rb_enc_mbclen(s, s_end, u8_enc);
119
- hattrie_get(p, s, len)[0] = v;
120
- }
111
+ hattrie_get(p, s, len)[0] = v;
112
+ return self;
113
+ }
114
+
115
+ static inline void hat_change(HatTrie* ht, hattrie_t* p, char* s, size_t len) {
116
+ // NOTE must use 2-step change, because the block may change the trie
117
+ value_t* vp = hattrie_tryget(p, s, len);
118
+ long long v;
119
+ if (ht->obj_value) {
120
+ VALUE value = vp ? LL2V(vp[0]) : ht->default_value;
121
+ v = V2LL(rb_yield(value));
121
122
  } else {
122
- hattrie_get(p, s, len)[0] = v;
123
+ VALUE value = vp ? LL2NUM(vp[0]) : ht->default_value;
124
+ v = NUM2LL(rb_yield(value));
125
+ }
126
+ hattrie_get(p, s, len)[0] = v;
127
+ }
128
+
129
+ static inline void hat_change_prefix(HatTrie* ht, hattrie_t* p, char* s, size_t len, char* rs) {
130
+ char* rs_end = rs + len;
131
+ long n;
132
+ for (; rs < rs_end; rs += n, len -= n) {
133
+ hat_change(ht, p, s, len);
134
+ // no need check encoding because reverse succeeded
135
+ n = rb_enc_fast_mbclen(rs, rs_end, u8_enc);
123
136
  }
124
- return self;
125
137
  }
126
138
 
127
- static VALUE hat_alt(VALUE self, VALUE key) {
139
+ static VALUE hat_change_all(VALUE self, VALUE type, VALUE key) {
128
140
  PRE_HAT;
129
141
  char* s = RSTRING_PTR(key);
130
142
  size_t len = RSTRING_LEN(key);
131
- if (ht->suffix) {
143
+ ID ty = SYM2ID(type);
144
+ if (ty == rb_intern("suffix")) {
132
145
  char* s_end = s + len;
133
146
  long n;
134
147
  for (; s < s_end; s += n, len -= n) {
148
+ hat_change(ht, p, s, len);
135
149
  n = rb_enc_mbclen(s, s_end, u8_enc);
136
- value_t* vp = hattrie_tryget(p, s, len);
137
- long long v;
138
- if (ht->obj_value) {
139
- VALUE value = vp ? LL2V(vp[0]) : Qnil;
140
- v = V2LL(rb_yield(value));
141
- } else {
142
- VALUE value = vp ? LL2NUM(vp[0]) : LL2NUM(0);
143
- v = NUM2LL(rb_yield(value));
144
- }
145
- hattrie_get(p, s, len)[0] = v;
146
150
  }
147
- } else {
148
- value_t* vp = hattrie_tryget(p, s, len);
149
- long long v;
150
- if (ht->obj_value) {
151
- VALUE value = vp ? LL2V(vp[0]) : Qnil;
152
- v = V2LL(rb_yield(value));
153
- } else {
154
- VALUE value = vp ? LL2NUM(vp[0]) : LL2NUM(0);
155
- v = NUM2LL(rb_yield(value));
151
+ } else if (ty == rb_intern("prefix")) {
152
+ volatile VALUE reversed = rb_funcall(key, rb_intern("reverse"), 0);
153
+ hat_change_prefix(ht, p, s, len, RSTRING_PTR(reversed));
154
+ } else if (ty == rb_intern("substring")) {
155
+ volatile VALUE reversed = rb_funcall(key, rb_intern("reverse"), 0);
156
+ char* rs = RSTRING_PTR(reversed);
157
+ char* s_end = s + len;
158
+ long n;
159
+ for (; s < s_end; s += n, len -= n) {
160
+ hat_change_prefix(ht, p, s, len, rs);
161
+ n = rb_enc_fast_mbclen(s, s_end, u8_enc);
156
162
  }
157
- hattrie_get(p, s, len)[0] = v;
158
163
  }
159
164
  return self;
160
165
  }
@@ -162,7 +167,7 @@ static VALUE hat_alt(VALUE self, VALUE key) {
162
167
  static VALUE hat_append(VALUE self, VALUE key) {
163
168
  HatTrie* ht;
164
169
  Data_Get_Struct(self, HatTrie, ht);
165
- return hat_set(self, key, ht->obj_value ? Qnil : LL2NUM(0));
170
+ return hat_set(self, key, ht->default_value);
166
171
  }
167
172
 
168
173
  static VALUE hat_get(VALUE self, VALUE key) {
@@ -171,7 +176,7 @@ static VALUE hat_get(VALUE self, VALUE key) {
171
176
  if (vt) {
172
177
  return ht->obj_value ? (*vt) : LL2NUM(*vt);
173
178
  } else {
174
- return Qnil;
179
+ return ht->default_value;
175
180
  }
176
181
  }
177
182
 
@@ -184,7 +189,7 @@ static VALUE hat_del(VALUE self, VALUE key) {
184
189
  hattrie_del(p, RSTRING_PTR(key), RSTRING_LEN(key));
185
190
  return ht->obj_value ? (*vt) : LL2NUM(*vt);
186
191
  } else {
187
- return Qnil;
192
+ return ht->default_value;
188
193
  }
189
194
  }
190
195
 
@@ -205,7 +210,6 @@ static VALUE hat_search_callback(VALUE data) {
205
210
  return rb_funcall(p->callback, rb_intern("call"), 2, p->suffix, p->value);
206
211
  }
207
212
 
208
- // returns: true if an error occured
209
213
  static VALUE hat_search(VALUE self, VALUE key, VALUE vlimit, VALUE vsort, VALUE callback) {
210
214
  PRE_HAT;
211
215
  long limit = 0;
@@ -248,14 +252,13 @@ void Init_triez() {
248
252
 
249
253
  rb_define_alloc_func(hat_class, hat_alloc);
250
254
  DEF(hat_class, "_internal_set_type", hat_set_type, 2);
251
- DEF(hat_class, "obj_value?", hat_obj_value_p, 0);
252
- DEF(hat_class, "suffix?", hat_suffix_p, 0);
255
+ DEF(hat_class, "value_type", hat_value_type, 0);
253
256
  DEF(hat_class, "size", hat_size, 0);
254
- DEF(hat_class, "has_key?", hat_check, 1);
255
257
  DEF(hat_class, "[]=", hat_set, 2);
256
- DEF(hat_class, "alt", hat_alt, 1);
258
+ DEF(hat_class, "change_all", hat_change_all, 2);
257
259
  DEF(hat_class, "<<", hat_append, 1);
258
260
  DEF(hat_class, "[]", hat_get, 1);
261
+ DEF(hat_class, "has_key?", hat_check, 1);
259
262
  DEF(hat_class, "delete", hat_del, 1);
260
263
  DEF(hat_class, "_internal_search", hat_search, 4);
261
264
  }
data/lib/triez.rb CHANGED
@@ -1,34 +1,49 @@
1
1
  require_relative "../ext/triez"
2
2
 
3
3
  class Triez
4
- VERSION = '0.3'
4
+ VERSION = '1.0'
5
5
 
6
6
  private :_internal_set_type
7
7
  private :_internal_search
8
8
 
9
9
  def initialize opts={}
10
10
  opts = opts.dup
11
- obj_value = opts.delete :obj_value
12
- obj_value = false if obj_value.nil?
13
- suffix = opts.delete :suffix
14
- suffix = false if suffix.nil?
11
+
12
+ value_type = opts.delete :value_type
13
+ if value_type.nil?
14
+ value_type = :int64
15
+ elsif value_type != :int64 and value_type != :object
16
+ raise ArgumentError, "value_type should be :int64 or :object, but got #{value_type.inspect}"
17
+ end
18
+
19
+ default = opts.delete :default
20
+ if default.nil?
21
+ default = (value_type == :int64 ? 0 : nil)
22
+ elsif value_type == :int64
23
+ default = default.to_i
24
+ end
25
+
15
26
  unless opts.empty?
16
- raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:suffix, :obj_value] are allowed"
27
+ raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:value_type, :default] are allowed"
17
28
  end
18
- _internal_set_type obj_value, suffix
29
+
30
+ _internal_set_type value_type == :object, default
19
31
  end
20
32
 
21
33
  def each &p
22
34
  raise ArgumentError, 'Need a block' unless p
35
+
23
36
  _internal_search '', nil, true, p
24
37
  end
25
38
 
26
39
  def search_with_prefix prefix, opts={}, &p
27
40
  opts = opts.dup
41
+
28
42
  limit = opts.delete :limit
29
43
  if !limit.nil? and limit < 0
30
44
  raise ArgumentError, "Limit should be > 0"
31
45
  end
46
+
32
47
  sort = opts.delete :sort
33
48
  unless opts.empty?
34
49
  raise ArgumentError, "Unknown options: #{opts.keys.inspect}, only [:limit, :sort] are allowed"
data/readme.md CHANGED
@@ -1,10 +1,10 @@
1
1
  ## What
2
2
 
3
- Pragmatic [trie](http://en.wikipedia.org/wiki/Trie) for Ruby.
3
+ Pragmatic [tries](http://en.wikipedia.org/wiki/Trie) for Ruby, spelled in lolcat.
4
4
 
5
- It is fast, memory efficient, unicode aware.
5
+ It is fast, memory efficient, unicode aware, prefix searchable, and enhanced with prefix/suffix/substring keys.
6
6
 
7
- The backend of *triez* is a cache oblivious data structure: the [HAT trie](https://github.com/dcjones/hat-trie). It is generally faster and more memory efficient than double arrays or burst tries.
7
+ The backend of *triez* is a cache oblivious data structure: the [HAT trie](https://github.com/dcjones/hat-trie) (In fact I'm using a [modified version](https://github.com/luikore/hat-trie) for improved functionality). HAT trie is generally faster and more memory efficient than [double array](http://linux.thai.net/~thep/datrie/datrie.html) or [burst trie](http://ww2.cs.mu.oz.au/~jz/fulltext/acmtois02.pdf).
8
8
 
9
9
  ## Requirement
10
10
 
@@ -22,32 +22,57 @@ gem ins triez
22
22
  ``` ruby
23
23
  require 'triez'
24
24
 
25
+ # create triez
25
26
  t = Triez.new
26
27
 
27
- # insertion
28
+ # the above code is equivalent to :int64 for :value_type and 0 for :default
29
+ t = Triez.new value_type: :int64
30
+
31
+ # more flexible with object type [*see note below]
32
+ t = Triez.new value_type: :object
33
+
34
+ # get the value type
35
+ t.value_type
36
+
37
+ # set a different default value
38
+ t = Triez.new value_type: :object, default: 'hello'
39
+
40
+ # insert or change value
28
41
  t['key'] = 100
29
42
 
30
- # insert a key with default value (0 for normal triez, nil for obj_valued triez)
43
+ # insert a key with default value
31
44
  t << 'key'
32
45
 
33
- # search
46
+ # batch change values under all suffixes/prefixes/substrings of a key
47
+ t.change_all(:suffix, 'key') {|old_value| ...calculate new value }
48
+ t.change_all(:prefix, 'key') {|old_value| ...calculate new value }
49
+ # enumerates all occurrences of substrings of the key
50
+ t.change_all(:substring, 'key') {|old_value| ...calculate new value }
51
+
52
+ # size of inserted keys
53
+ t.size
54
+
55
+ # search with exact match
34
56
  t.has_key? 'key'
35
57
  t['key']
36
58
 
37
- # iterate over values under a prefix.
59
+ # prefixed search (iterate over values under a prefix), available options are:
60
+ # - limit: max items, `nil` means no limit
61
+ # - sort: whether iterate in alphabetic order, default is true
38
62
  t.search_with_prefix(prefix, limit: 10, sort: true) do |suffix, value|
39
63
  ...
40
64
  end
41
65
 
42
- # enumerate (NOTE it is unordered)
66
+ # if no block given, an array in the form of [[suffix, value]] is returned
67
+ t.search_with_prefix('prefix')
68
+
69
+ # enumerate all keys and values in the order of binary collation
43
70
  t.each do |key, value|
44
71
  ...
45
72
  end
46
73
  ```
47
74
 
48
- ---
49
-
50
- By default, a *triez* stores signed integers within 64bits, you can use it as weights, counts or database IDs, and doesn't cost any time in GC marking phase. In case you need to store arbitrary object in a node, use `obj_value: true`:
75
+ \* Note: By default, *triez* stores signed 64-bit integers, which you can use as weights, counts or database IDs. In case you need to store arbitrary objects in a node, use `value_type: :object`:
51
76
 
52
77
  ``` ruby
53
78
  t = Triez.new obj_value: true
@@ -55,46 +80,9 @@ t['Tom'] = {name: 'Tom', sex: 'Female'}
55
80
  t['Tree'] = [:leaf, :trunk, :root]
56
81
  ```
57
82
 
58
- ---
59
-
60
- When a *triez* is initialized with `suffix: true`, it inserts all suffices of a key
61
-
62
- ``` ruby
63
- t = Triez.new suffix: true
64
- t['万塘路一锅鸡'] = 2
65
- t['万塘路一锅鸡'] #=> 2
66
- t['塘路一锅鸡'] #=> 2
67
- t['路一锅鸡'] #=> 2
68
- t['一锅鸡'] #=> 2
69
- t['锅鸡'] #=> 2
70
- t['鸡'] #=> 2
71
- ```
72
-
73
- You can batch change values with a block
74
-
75
- ``` ruby
76
- # v *= 5 for 'abcd', 'bcd', 'cd', 'd'
77
- t.alt 'abcd' do |v|
78
- v * 5
79
- end
80
- t['abcd'] #=> 10
81
- t['cd'] #=> 10
82
- ```
83
-
84
- ---
85
-
86
- Misc methods
87
-
88
- ``` ruby
89
- # if it is a suffix trie
90
- t.suffix?
91
- # if the value type is object
92
- t.obj_value?
93
- ```
94
-
95
83
  ## Examples
96
84
 
97
- Prefix-based autocompletion:
85
+ **Prefix based autocompletion**:
98
86
 
99
87
  ``` ruby
100
88
  require 'triez'
@@ -117,7 +105,7 @@ candidate: red
117
105
 
118
106
  ---
119
107
 
120
- Efficiently search for strings containing a substring:
108
+ **Efficient [full text search](https://en.wikipedia.org/wiki/Full_text_search) with a [suffix tree](https://en.wikipedia.org/wiki/Suffix_tree)**:
121
109
 
122
110
  ``` ruby
123
111
  require 'triez'
@@ -126,18 +114,59 @@ sequences = {
126
114
  'ATACGGTCCA' => 2,
127
115
  'GCTTGTACGT' => 3
128
116
  }
129
- t = Triez.new suffix: true
117
+ t = Triez.new
118
+
119
+ # build suffix trie
130
120
  sequences.each do |seq, id|
131
- t[seq] = id
121
+ t.change_all(:suffix, seq) { id }
132
122
  end
123
+
133
124
  t.search_with_prefix 'CGGT' do |_, id|
134
125
  puts id #=> 2
135
126
  end
136
127
  ```
137
128
 
138
- The search time is linear to the length of the substring.
129
+ The searching time is linear to the length of the substring.
130
+
131
+ ---
139
132
 
140
- ## Benchmarks
133
+ **Solve the [longest common substring problem](https://en.wikipedia.org/wiki/Longest_common_substring_problem)**:
134
+
135
+ ``` ruby
136
+ # coding: utf-8
137
+ require 'triez'
138
+ sentences = %w[
139
+ 万塘路一锅鸡
140
+ 去文二路一锅鸡吃饭
141
+ 来一锅鸡顶盒
142
+ 一锅鸡胗
143
+ ]
144
+
145
+ # value is bitset representing id of the sentence
146
+ # in ruby we can use integers of arbitrary length as bitsets
147
+ t = Triez.new value_type: :object, default: 0
148
+
149
+ sentences.each_with_index do |sentence, i|
150
+ elem = 1 << i
151
+ t.change_all :substring, sentence do |v|
152
+ # union
153
+ v | elem
154
+ end
155
+ end
156
+
157
+ # longest common substring
158
+ lcs = ''
159
+
160
+ # find the key tagged with universe
161
+ universe = (1 << sentences.size) - 1
162
+ t.each do |k, v|
163
+ lcs = k if k.size > lcs.size and v == universe
164
+ end
165
+
166
+ puts lcs #=> 一锅鸡
167
+ ```
168
+
169
+ ## Benchmark
141
170
 
142
171
  Here's a benchmark on
143
172
 
@@ -146,23 +175,23 @@ ruby 1.9.3p374 (2013-01-15 revision 38858) [x86_64-darwin12.2.1]
146
175
  2.3 GHz Intel Core i7
147
176
  ```
148
177
 
149
- The test data is 3 milion titles of wikipedia articles (from http://dumps.wikimedia.org/enwiki/20121101/)
178
+ The test data are 3 million titles of wikipedia articles (from http://dumps.wikimedia.org/enwiki/20121101/)
150
179
 
151
180
  ```
152
- thing/backend | memory | insertion time | 3 M query
153
- -------------------|---------|----------------|----------
154
- hash/linked hash | 340.2 M | 4.369 s | 0.2800 s
155
- trie/double array* | 155.6 M | 130.7 s | 0.4359 s
156
- triez/HAT trie | 121.7 M | 3.872 s | 0.3472 s
181
+ thing/backend | memory | insertion time | 3 M query
182
+ ------------------------|---------|----------------|----------
183
+ hash/linked hash | 340.2 M | 4.369 s | 0.2800 s
184
+ fast_trie/double array* | 155.6 M | 130.7 s | 0.4359 s
185
+ triez/HAT trie | 121.7 M | 3.872 s | 0.3472 s
157
186
  ```
158
187
 
159
- NOTE: `trie/double array` -> https://github.com/tyler/trie
188
+ Note: `fast_trie/double array` -> https://github.com/tyler/trie
160
189
 
161
190
  ## Caveats
162
191
 
163
- - `sort` orders keys with binary collation, not unicode codepoint collation in string comparison.
192
+ - The `sort` option in prefixed search orders keys with binary [collation](https://en.wikipedia.org/wiki/Collation), but string comparison in Ruby is with unicode codepoint collation.
164
193
  - For some rare case of many threads modifying the same trie, you may need a mutex.
165
- - If you still feel memory not enough, you may consider [MARISA-trie](https://code.google.com/p/marisa-trie/) (NOTE that MARISA is immutable) or a database.
194
+ - If memory is still not enough, you may consider [MARISA-trie](https://code.google.com/p/marisa-trie/) (note that MARISA is immutable), or a database.
166
195
 
167
196
  ## Development
168
197
 
@@ -172,3 +201,7 @@ cd triez
172
201
  rake glob_src
173
202
  rake
174
203
  ```
204
+
205
+ ## Note
206
+
207
+ Although HAT trie uses MurmurHash3 instead of SipHash in Ruby, it is still safe under hashDoS because bucket size is limited.
data/test/triez_test.rb CHANGED
@@ -1,20 +1,28 @@
1
+ # coding: utf-8
1
2
  require "test/unit"
2
3
  require_relative "../lib/triez"
3
4
 
4
5
  GC.stress
5
6
 
6
7
  class TriezTest < Test::Unit::TestCase
7
- def test_init_options
8
- t = Triez.new obj_value: true
9
- assert_equal true, t.obj_value?
10
- assert_equal false, t.suffix?
11
- t = Triez.new suffix: true
12
- assert_equal true, t.suffix?
13
- assert_equal false, t.obj_value?
8
+ def test_init_type_options
9
+ t = Triez.new value_type: :int64
10
+ assert_equal :int64, t.value_type
11
+ t = Triez.new value_type: :object
12
+ assert_equal :object, t.value_type
13
+ t = Triez.new
14
+ assert_equal :int64, t.value_type
15
+
16
+ assert_raise ArgumentError do
17
+ Triez.new value_type: :string
18
+ end
19
+ assert_raise ArgumentError do
20
+ Triez.new invalid_option: :int64
21
+ end
14
22
  end
15
23
 
16
24
  def test_hat_trie
17
- t = Triez.new obj_value: true
25
+ t = Triez.new value_type: :object
18
26
 
19
27
  v1 = (1 << 40)
20
28
  v2 = (1 << 141)
@@ -87,21 +95,15 @@ class TriezTest < Test::Unit::TestCase
87
95
  assert_equal true, t.has_key?('c')
88
96
  end
89
97
 
90
- def test_suffix_insert
91
- t = Triez.new suffix: true
92
- t << '12345'
93
- assert_equal 5, t.size
94
- end
95
-
96
98
  def test_full_text_search
97
99
  sequences = {
98
100
  'ACTGAAAAAAACTG' => 1,
99
101
  'ATACGGTCCA' => 2,
100
102
  'GCTTGTACGT' => 3
101
103
  }
102
- t = Triez.new suffix: true
104
+ t = Triez.new
103
105
  sequences.each do |seq, id|
104
- t[seq] = id
106
+ t.change_all(:suffix, seq){ id }
105
107
  end
106
108
  assert_equal 2, t.search_with_prefix('CGGT').map(&:last).flatten.first
107
109
  end
@@ -111,6 +113,76 @@ class TriezTest < Test::Unit::TestCase
111
113
  t["a\0b"] = 1
112
114
  assert_equal 1, t["a\0b"]
113
115
  assert_equal 1, t.size
114
- assert_equal nil, t["a"]
116
+ assert_equal 0, t["a"]
117
+ end
118
+
119
+ def test_change_all_with_prefix
120
+ default = 10
121
+ t = Triez.new default: default
122
+ t['regexp'] = 1
123
+ t['readme'] = 2
124
+ t.change_all :prefix, 'readme' do |v|
125
+ v += 4
126
+ end
127
+ assert_equal 'readme'.size + 1, t.size
128
+ assert_equal 6, t['readme']
129
+ assert_equal default + 4, t['read']
130
+ assert_equal 1, t['regexp']
131
+ end
132
+
133
+ def test_change_all_with_suffix
134
+ t = Triez.new
135
+ t['regexp'] = 1
136
+ t['exp'] = 2
137
+ t['reg'] = 3
138
+ t.change_all :suffix, 'regexp' do |v|
139
+ v += 4
140
+ end
141
+ assert_equal 5, t['regexp']
142
+ assert_equal 6, t['exp']
143
+ assert_equal 3, t['reg']
144
+ assert_equal 'regexp'.size + 1, t.size
145
+ end
146
+
147
+ def test_change_all_with_substring
148
+ t = Triez.new value_type: :object
149
+ t.change_all :substring, 'abc' do
150
+ 1
151
+ end
152
+
153
+ keys = []
154
+ t.each do |k, v|
155
+ keys << k
156
+ end
157
+ assert_equal %w[a b c ab bc abc].sort, keys.sort
158
+ end
159
+
160
+ def test_solve_longest_common_substring
161
+ sentences = %w[
162
+ 万塘路一锅鸡
163
+ 文二路一锅鸡
164
+ 来一锅鸡顶盒
165
+ 一锅鸡胗
166
+ ]
167
+
168
+ # value is bitset representing id of the sentence
169
+ # in ruby we can use integers of arbitrary length as bitsets
170
+ t = Triez.new value_type: :object, default: 0
171
+
172
+ sentences.each_with_index do |sentence, i|
173
+ elem = 1 << i
174
+ t.change_all :substring, sentence do |v|
175
+ # union
176
+ v | elem
177
+ end
178
+ end
179
+
180
+ # longest common substring
181
+ lcs = ''
182
+ universe = (1 << sentences.size) - 1
183
+ t.each do |k, v|
184
+ lcs = k if (k.size > lcs.size and v == universe)
185
+ end
186
+ assert_equal '一锅鸡', lcs
115
187
  end
116
188
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: triez
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.3'
4
+ version: '1.0'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -22,7 +22,6 @@ files:
22
22
  - readme.md
23
23
  - lib/triez.rb
24
24
  - test/triez_test.rb
25
- - ext/hat-stub.c
26
25
  - ext/triez.cc
27
26
  - ext/common.h
28
27
  - ext/extconf.rb
@@ -31,6 +30,7 @@ files:
31
30
  - ext/hat-trie/COPYING
32
31
  - ext/hat-trie/hat-trie.c
33
32
  - ext/hat-trie/hat-trie.h
33
+ - ext/hat-trie/misc.c
34
34
  - ext/hat-trie/misc.h
35
35
  - ext/hat-trie/murmurhash3.c
36
36
  - ext/hat-trie/murmurhash3.h
data/ext/hat-stub.c DELETED
@@ -1,14 +0,0 @@
1
- #include <ruby.h>
2
-
3
- void* malloc_or_die(size_t sz) {
4
- return malloc(sz);
5
- }
6
-
7
- void* realloc_or_die(void* p, size_t sz) {
8
- return realloc(p, sz);
9
- }
10
-
11
- FILE* fopen_or_die(const char* file, const char* mode) {
12
- // to do raise error
13
- return fopen(file, mode);
14
- }