wordtriez 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/changes +21 -0
- data/copying +18 -0
- data/ext/common.h +8 -0
- data/ext/extconf.rb +32 -0
- data/ext/hat-trie/ahtable.c +550 -0
- data/ext/hat-trie/ahtable.h +93 -0
- data/ext/hat-trie/common.h +19 -0
- data/ext/hat-trie/hat-trie.c +771 -0
- data/ext/hat-trie/hat-trie.h +86 -0
- data/ext/hat-trie/misc.c +46 -0
- data/ext/hat-trie/misc.h +22 -0
- data/ext/hat-trie/murmurhash3.c +77 -0
- data/ext/hat-trie/murmurhash3.h +12 -0
- data/ext/hat-trie/pstdint.h +800 -0
- data/ext/hat-trie/text.c +174 -0
- data/ext/hat-trie/text.h +22 -0
- data/ext/triez.cc +313 -0
- data/lib/wordtriez.rb +65 -0
- data/readme.md +223 -0
- data/test/triez_test.rb +225 -0
- metadata +67 -0
data/readme.md
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
## Triez
|
2
|
+
|
3
|
+
[Build Status](https://travis-ci.org/luikore/triez)
|
4
|
+
[Code Climate](https://codeclimate.com/github/luikore/triez)
|
5
|
+
[Gem Version](http://badge.fury.io/rb/triez)
|
6
|
+
|
7
|
+
Pragmatic [tries](http://en.wikipedia.org/wiki/Trie) for Ruby, spelled in lolcat.
|
8
|
+
|
9
|
+
It is fast, memory efficient, unicode aware, prefix searchable, and enhanced with prefix/suffix/substring keys.
|
10
|
+
|
11
|
+
The backend of *triez* is a cache oblivious data structure: the [HAT trie](https://github.com/dcjones/hat-trie) (In fact it is a [modified version](https://github.com/luikore/hat-trie) for improved functionality). HAT trie is generally faster and more memory efficient than [double array](http://linux.thai.net/~thep/datrie/datrie.html) or [burst trie](http://ww2.cs.mu.oz.au/~jz/fulltext/acmtois02.pdf).
|
12
|
+
|
13
|
+
## Requirement
|
14
|
+
|
15
|
+
- CRuby 1.9 / 2.0
|
16
|
+
- `g++` or `clang`
|
17
|
+
|
18
|
+
## Install
|
19
|
+
|
20
|
+
``` bash
|
21
|
+
gem ins triez
|
22
|
+
```
|
23
|
+
|
24
|
+
## Synopsis
|
25
|
+
|
26
|
+
``` ruby
|
27
|
+
require 'triez'
|
28
|
+
|
29
|
+
# create triez
|
30
|
+
t = Triez.new
|
31
|
+
|
32
|
+
# the above code is equivalent to :int64 for :value_type and 0 for :default
|
33
|
+
t = Triez.new value_type: :int64
|
34
|
+
|
35
|
+
# more flexible with object type [*see note below]
|
36
|
+
t = Triez.new value_type: :object
|
37
|
+
|
38
|
+
# get the value type
|
39
|
+
t.value_type
|
40
|
+
|
41
|
+
# set a different default value
|
42
|
+
t = Triez.new value_type: :object, default: 'hello'
|
43
|
+
|
44
|
+
# insert or change value
|
45
|
+
t['key'] = 100
|
46
|
+
|
47
|
+
# insert a key with default value
|
48
|
+
t << 'key'
|
49
|
+
|
50
|
+
# batch change values under all suffixes/prefixes/substrings of a key
|
51
|
+
t.change_all(:suffix, 'key') {|old_value| ...calculate new value }
|
52
|
+
t.change_all(:prefix, 'key') {|old_value| ...calculate new value }
|
53
|
+
# enumerates all occurrences of substrings of the key
|
54
|
+
t.change_all(:substring, 'key') {|old_value| ...calculate new value }
|
55
|
+
|
56
|
+
# size of inserted keys
|
57
|
+
t.size
|
58
|
+
|
59
|
+
# search with exact match
|
60
|
+
t.has_key? 'key'
|
61
|
+
t['key']
|
62
|
+
|
63
|
+
# prefixed search (iterate over values under a prefix), available options are:
|
64
|
+
# - limit: max items, `nil` means no limit
|
65
|
+
# - sort: whether to iterate in alphabetical order, default is true
|
66
|
+
t.search_with_prefix(prefix, limit: 10, sort: true) do |suffix, value|
|
67
|
+
...
|
68
|
+
end
|
69
|
+
|
70
|
+
# if no block given, an array in the form of [[suffix, value]] is returned
|
71
|
+
t.search_with_prefix('prefix')
|
72
|
+
|
73
|
+
# enumerate all keys and values in the order of binary collation
|
74
|
+
t.each do |key, value|
|
75
|
+
...
|
76
|
+
end
|
77
|
+
|
78
|
+
# iterate stored keys which are prefixes of a given string, from shallow to deep
|
79
|
+
t.walk string do |k, v|
|
80
|
+
...
|
81
|
+
end
|
82
|
+
```
|
83
|
+
|
84
|
+
\* Note: By default, *triez* stores signed 64-bit integers, which you can use as weights, counts or database IDs. In case you need to store an arbitrary object in a node, use `value_type: :object`:
|
85
|
+
|
86
|
+
``` ruby
|
87
|
+
t = Triez.new value_type: :object
|
88
|
+
t['Tom'] = {name: 'Tom', sex: 'Female'}
|
89
|
+
t['Tree'] = [:leaf, :trunk, :root]
|
90
|
+
```
|
91
|
+
|
92
|
+
## Examples
|
93
|
+
|
94
|
+
**Prefix based autocompletion**:
|
95
|
+
|
96
|
+
``` ruby
|
97
|
+
require 'triez'
|
98
|
+
words = %w[readme rot red rah rasterization]
|
99
|
+
t = Triez.new
|
100
|
+
words.each do |word|
|
101
|
+
t[word] = 1
|
102
|
+
end
|
103
|
+
t.search_with_prefix 're' do |suffix|
|
104
|
+
puts "candidate: re#{suffix}"
|
105
|
+
end
|
106
|
+
```
|
107
|
+
|
108
|
+
The output:
|
109
|
+
|
110
|
+
```bash
|
111
|
+
candidate: readme
|
112
|
+
candidate: red
|
113
|
+
```
|
114
|
+
|
115
|
+
---
|
116
|
+
|
117
|
+
**Efficient [full text search](https://en.wikipedia.org/wiki/Full_text_search) with a [suffix tree](https://en.wikipedia.org/wiki/Suffix_tree)**:
|
118
|
+
|
119
|
+
``` ruby
|
120
|
+
require 'triez'
|
121
|
+
sequences = {
|
122
|
+
'ACTGAAAAAAACTG' => 1,
|
123
|
+
'ATACGGTCCA' => 2,
|
124
|
+
'GCTTGTACGT' => 3
|
125
|
+
}
|
126
|
+
t = Triez.new
|
127
|
+
|
128
|
+
# build suffix tree
|
129
|
+
sequences.each do |seq, id|
|
130
|
+
t.change_all(:suffix, seq){id}
|
131
|
+
end
|
132
|
+
|
133
|
+
t.search_with_prefix 'CGGT' do |_, id|
|
134
|
+
puts id #=> 2
|
135
|
+
end
|
136
|
+
```
|
137
|
+
|
138
|
+
The search time is linear in the length of the substring. You may also be interested in the example of a simple [full text search server](https://github.com/luikore/triez/tree/master/examples/full-text-search-server) with *triez*.
|
139
|
+
|
140
|
+
---
|
141
|
+
|
142
|
+
**Solve the [longest common substring problem](https://en.wikipedia.org/wiki/Longest_common_substring_problem)**:
|
143
|
+
|
144
|
+
``` ruby
|
145
|
+
# coding: utf-8
|
146
|
+
require 'triez'
|
147
|
+
sentences = %w[
|
148
|
+
万塘路一锅鸡
|
149
|
+
去文二路一锅鸡吃饭
|
150
|
+
来一锅鸡顶盒
|
151
|
+
一锅鸡胗
|
152
|
+
]
|
153
|
+
|
154
|
+
# value is bitset representing id of the sentence
|
155
|
+
# in ruby we can use integers of arbitrary length as bitsets
|
156
|
+
t = Triez.new value_type: :object, default: 0
|
157
|
+
|
158
|
+
sentences.each_with_index do |sentence, i|
|
159
|
+
elem = 1 << i
|
160
|
+
t.change_all :substring, sentence do |v|
|
161
|
+
# union
|
162
|
+
v | elem
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
# longest common substring
|
167
|
+
lcs = ''
|
168
|
+
|
169
|
+
# find the key tagged with universe
|
170
|
+
universe = (1 << sentences.size) - 1
|
171
|
+
t.each do |k, v|
|
172
|
+
lcs = k if k.size > lcs.size and v == universe
|
173
|
+
end
|
174
|
+
|
175
|
+
puts lcs #=> 一锅鸡
|
176
|
+
```
|
177
|
+
|
178
|
+
## Benchmark
|
179
|
+
|
180
|
+
Here's a benchmark on
|
181
|
+
|
182
|
+
```ruby
|
183
|
+
ruby 1.9.3p374 (2013-01-15 revision 38858) [x86_64-darwin12.2.1]
|
184
|
+
2.3 GHz Intel Core i7
|
185
|
+
```
|
186
|
+
|
187
|
+
The test data are 3 million titles of Wikipedia articles (from http://dumps.wikimedia.org/enwiki/20121101/)
|
188
|
+
|
189
|
+
```
|
190
|
+
thing/backend | memory | insertion time | 3 M query
|
191
|
+
------------------------|---------|----------------|----------
|
192
|
+
hash/linked hash | 340.2 M | 4.369 s | 0.2800 s
|
193
|
+
fast_trie/double array* | 155.6 M | 130.7 s | 0.4359 s
|
194
|
+
triez/HAT trie | 121.7 M | 3.872 s | 0.3472 s
|
195
|
+
```
|
196
|
+
|
197
|
+
Note: `fast_trie/double array` -> https://github.com/tyler/trie
|
198
|
+
|
199
|
+
## Caveats
|
200
|
+
|
201
|
+
- The `sort` option in prefixed search orders keys with binary [collation](https://en.wikipedia.org/wiki/Collation), but string comparison in Ruby is with unicode codepoint collation.
|
202
|
+
- In the rare case of many threads modifying the same trie, you may need a mutex.
|
203
|
+
- If memory usage is still too high, you may consider [MARISA-trie](https://code.google.com/p/marisa-trie/) (note that MARISA is immutable), or a database.
|
204
|
+
|
205
|
+
## Development
|
206
|
+
|
207
|
+
``` bash
|
208
|
+
git clone git://github.com/luikore/triez.git
|
209
|
+
cd triez
|
210
|
+
rake glob_src
|
211
|
+
rake
|
212
|
+
```
|
213
|
+
|
214
|
+
To update vendor lib and re-compile:
|
215
|
+
|
216
|
+
``` bash
|
217
|
+
rake glob_src
|
218
|
+
rake
|
219
|
+
```
|
220
|
+
|
221
|
+
## Note
|
222
|
+
|
223
|
+
Although HAT trie uses MurmurHash3 instead of SipHash in Ruby, it is still safe under hashDoS because bucket size is limited.
|
data/test/triez_test.rb
ADDED
@@ -0,0 +1,225 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require "test/unit"
|
3
|
+
require_relative "../lib/triez"
|
4
|
+
|
5
|
+
GC.stress
|
6
|
+
|
7
|
+
class TriezTest < Test::Unit::TestCase
|
8
|
+
def test_init_type_options
|
9
|
+
t = Triez.new value_type: :int64
|
10
|
+
assert_equal :int64, t.value_type
|
11
|
+
t = Triez.new value_type: :object
|
12
|
+
assert_equal :object, t.value_type
|
13
|
+
t = Triez.new
|
14
|
+
assert_equal :int64, t.value_type
|
15
|
+
|
16
|
+
assert_raise ArgumentError do
|
17
|
+
Triez.new value_type: :string
|
18
|
+
end
|
19
|
+
assert_raise ArgumentError do
|
20
|
+
Triez.new invalid_option: :int64
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_hat_trie
|
25
|
+
t = Triez.new value_type: :object
|
26
|
+
|
27
|
+
v1 = (1 << 40)
|
28
|
+
v2 = (1 << 141)
|
29
|
+
t['万塘路一锅鸡'] = v1
|
30
|
+
t['万塘路'] = v2
|
31
|
+
assert_equal v1, t['万塘路一锅鸡']
|
32
|
+
assert_equal v2, t['万塘路']
|
33
|
+
assert_equal nil, t['万']
|
34
|
+
assert_equal false, t.has_key?('万')
|
35
|
+
assert_equal true, t.has_key?('万塘路')
|
36
|
+
|
37
|
+
assert_equal v1, t.delete('万塘路一锅鸡')
|
38
|
+
assert_equal nil, t['万塘路一锅鸡']
|
39
|
+
assert_equal v2, t['万塘路']
|
40
|
+
|
41
|
+
a = t.search_with_prefix ''
|
42
|
+
assert_equal [['万塘路', v2]], a
|
43
|
+
|
44
|
+
t['马当路'] = 3
|
45
|
+
a = t.search_with_prefix '万塘'
|
46
|
+
assert_equal [['路', v2]], a
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_insertion_and_search_on_many_keys
|
50
|
+
t = Triez.new
|
51
|
+
as = ('A'..'z').to_a
|
52
|
+
bs = ('一'..'百').to_a
|
53
|
+
as.each do |a|
|
54
|
+
# 10k chars to ensure burst
|
55
|
+
bs.each do |b|
|
56
|
+
t[a + b] = 0
|
57
|
+
end
|
58
|
+
end
|
59
|
+
assert_equal as.size * bs.size, t.size
|
60
|
+
|
61
|
+
a = t.search_with_prefix 'a'
|
62
|
+
assert_equal bs.to_a, a.map(&:first).sort
|
63
|
+
|
64
|
+
a = []
|
65
|
+
t.search_with_prefix 'b', sort: true, limit: 3 do |k, v|
|
66
|
+
a << k
|
67
|
+
end
|
68
|
+
assert_equal 3, a.size
|
69
|
+
assert_equal a, a.sort
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_each_and_raise
|
73
|
+
t = Triez.new
|
74
|
+
t['abcd'] = 0
|
75
|
+
t['abc'] = 1
|
76
|
+
|
77
|
+
assert_raise NameError do
|
78
|
+
t.each do |k, v|
|
79
|
+
raise NameError, k
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
assert_raise ArgumentError do
|
84
|
+
t.each
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def test_append
|
89
|
+
t = Triez.new
|
90
|
+
('a'..'z').each do |c|
|
91
|
+
t << c
|
92
|
+
end
|
93
|
+
assert_equal 26, t.size
|
94
|
+
assert_equal 0, t['c']
|
95
|
+
assert_equal true, t.has_key?('c')
|
96
|
+
end
|
97
|
+
|
98
|
+
def test_full_text_search
|
99
|
+
sequences = {
|
100
|
+
'ACTGAAAAAAACTG' => 1,
|
101
|
+
'ATACGGTCCA' => 2,
|
102
|
+
'GCTTGTACGT' => 3
|
103
|
+
}
|
104
|
+
t = Triez.new
|
105
|
+
sequences.each do |seq, id|
|
106
|
+
t.change_all(:suffix, seq){ id }
|
107
|
+
end
|
108
|
+
assert_equal 2, t.search_with_prefix('CGGT').map(&:last).flatten.first
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_nul_char_in_keys
|
112
|
+
t = Triez.new
|
113
|
+
t["a\0b"] = 1
|
114
|
+
assert_equal 1, t["a\0b"]
|
115
|
+
assert_equal 1, t.size
|
116
|
+
assert_equal 0, t["a"]
|
117
|
+
end
|
118
|
+
|
119
|
+
def test_change_all_with_prefix
|
120
|
+
default = 10
|
121
|
+
t = Triez.new default: default
|
122
|
+
t['regexp'] = 1
|
123
|
+
t['readme'] = 2
|
124
|
+
t.change_all :prefix, 'readme' do |v|
|
125
|
+
v += 4
|
126
|
+
end
|
127
|
+
assert_equal 'readme'.size + 1, t.size
|
128
|
+
assert_equal 6, t['readme']
|
129
|
+
assert_equal default + 4, t['read']
|
130
|
+
assert_equal 1, t['regexp']
|
131
|
+
end
|
132
|
+
|
133
|
+
def test_change_all_with_suffix
|
134
|
+
t = Triez.new
|
135
|
+
t['regexp'] = 1
|
136
|
+
t['exp'] = 2
|
137
|
+
t['reg'] = 3
|
138
|
+
t.change_all :suffix, 'regexp' do |v|
|
139
|
+
v += 4
|
140
|
+
end
|
141
|
+
assert_equal 5, t['regexp']
|
142
|
+
assert_equal 6, t['exp']
|
143
|
+
assert_equal 3, t['reg']
|
144
|
+
assert_equal 'regexp'.size + 1, t.size
|
145
|
+
end
|
146
|
+
|
147
|
+
def test_change_all_with_substring
|
148
|
+
t = Triez.new value_type: :object
|
149
|
+
t.change_all :substring, 'abc' do
|
150
|
+
1
|
151
|
+
end
|
152
|
+
|
153
|
+
keys = []
|
154
|
+
t.each do |k, v|
|
155
|
+
keys << k
|
156
|
+
end
|
157
|
+
assert_equal %w[a b c ab bc abc].sort, keys.sort
|
158
|
+
end
|
159
|
+
|
160
|
+
def test_walk
|
161
|
+
urls = %w[
|
162
|
+
/users/
|
163
|
+
/users/12/edit
|
164
|
+
/posts
|
165
|
+
]
|
166
|
+
t = Triez.new value_type: :object
|
167
|
+
urls.each_with_index do |url, i|
|
168
|
+
t[url] = i.to_s
|
169
|
+
end
|
170
|
+
|
171
|
+
assert_equal [%w'/users/ 0'], t.walk('/users/12/delete').to_a
|
172
|
+
assert_equal [%w'/users/ 0', %w'/users/12/edit 1'], t.walk('/users/12/edit').to_a
|
173
|
+
assert_equal [%w'/users/ 0', %w'/users/12/edit 1'], t.walk('/users/12/edit/3').to_a
|
174
|
+
|
175
|
+
assert_raise TypeError do
|
176
|
+
t.walk :'/post' do
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
t.walk '' do |k, v|
|
181
|
+
assert_equal [nil, nil], [k, v]
|
182
|
+
end
|
183
|
+
|
184
|
+
# try to trigger rb_gc_mark(); it can get stuck if hattrie_iter_next() is not called properly
|
185
|
+
100000.times{ 'a' + 'b' }
|
186
|
+
end
|
187
|
+
|
188
|
+
def test_solve_longest_common_substring
|
189
|
+
sentences = %w[
|
190
|
+
万塘路一锅鸡
|
191
|
+
文二路一锅鸡
|
192
|
+
来一锅鸡顶盒
|
193
|
+
一锅鸡胗
|
194
|
+
]
|
195
|
+
|
196
|
+
# value is bitset representing id of the sentence
|
197
|
+
# in ruby we can use integers of arbitrary length as bitsets
|
198
|
+
t = Triez.new value_type: :object, default: 0
|
199
|
+
|
200
|
+
sentences.each_with_index do |sentence, i|
|
201
|
+
elem = 1 << i
|
202
|
+
t.change_all :substring, sentence do |v|
|
203
|
+
# union
|
204
|
+
v | elem
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
# longest common substring
|
209
|
+
lcs = ''
|
210
|
+
universe = (1 << sentences.size) - 1
|
211
|
+
t.each do |k, v|
|
212
|
+
lcs = k if (k.size > lcs.size and v == universe)
|
213
|
+
end
|
214
|
+
assert_equal '一锅鸡', lcs
|
215
|
+
end
|
216
|
+
|
217
|
+
def test_should_not_segfault_when_search_with_prefix
|
218
|
+
t = Triez.new
|
219
|
+
# bursts when 16384
|
220
|
+
16_385.times{ |i| t["a#{i}"] = i }
|
221
|
+
expected_postfices = 16_385.times.map &:to_s
|
222
|
+
actual_postfices = t.search_with_prefix("a").map(&:first)
|
223
|
+
assert_equal expected_postfices.sort, actual_postfices.sort
|
224
|
+
end
|
225
|
+
end
|
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wordtriez
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Zete Lui
|
9
|
+
- Duane Johnson
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2014-09-21 00:00:00.000000000 Z
|
14
|
+
dependencies: []
|
15
|
+
description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
|
16
|
+
email:
|
17
|
+
executables: []
|
18
|
+
extensions:
|
19
|
+
- ext/extconf.rb
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- copying
|
23
|
+
- changes
|
24
|
+
- readme.md
|
25
|
+
- lib/wordtriez.rb
|
26
|
+
- test/triez_test.rb
|
27
|
+
- ext/triez.cc
|
28
|
+
- ext/common.h
|
29
|
+
- ext/extconf.rb
|
30
|
+
- ext/hat-trie/ahtable.c
|
31
|
+
- ext/hat-trie/ahtable.h
|
32
|
+
- ext/hat-trie/common.h
|
33
|
+
- ext/hat-trie/hat-trie.c
|
34
|
+
- ext/hat-trie/hat-trie.h
|
35
|
+
- ext/hat-trie/misc.c
|
36
|
+
- ext/hat-trie/misc.h
|
37
|
+
- ext/hat-trie/murmurhash3.c
|
38
|
+
- ext/hat-trie/murmurhash3.h
|
39
|
+
- ext/hat-trie/pstdint.h
|
40
|
+
- ext/hat-trie/text.c
|
41
|
+
- ext/hat-trie/text.h
|
42
|
+
homepage: https://github.com/canadaduane/triez
|
43
|
+
licenses: []
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.9.2
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ! '>='
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
60
|
+
requirements: []
|
61
|
+
rubyforge_project:
|
62
|
+
rubygems_version: 1.8.23
|
63
|
+
signing_key:
|
64
|
+
specification_version: 3
|
65
|
+
summary: fast, efficient, unicode aware HAT trie with prefix / suffix support
|
66
|
+
test_files: []
|
67
|
+
has_rdoc: false
|