wordtriez 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/changes +21 -0
- data/copying +18 -0
- data/ext/common.h +8 -0
- data/ext/extconf.rb +32 -0
- data/ext/hat-trie/ahtable.c +550 -0
- data/ext/hat-trie/ahtable.h +93 -0
- data/ext/hat-trie/common.h +19 -0
- data/ext/hat-trie/hat-trie.c +771 -0
- data/ext/hat-trie/hat-trie.h +86 -0
- data/ext/hat-trie/misc.c +46 -0
- data/ext/hat-trie/misc.h +22 -0
- data/ext/hat-trie/murmurhash3.c +77 -0
- data/ext/hat-trie/murmurhash3.h +12 -0
- data/ext/hat-trie/pstdint.h +800 -0
- data/ext/hat-trie/text.c +174 -0
- data/ext/hat-trie/text.h +22 -0
- data/ext/triez.cc +313 -0
- data/lib/wordtriez.rb +65 -0
- data/readme.md +223 -0
- data/test/triez_test.rb +225 -0
- metadata +67 -0
data/readme.md
ADDED
@@ -0,0 +1,223 @@
|
|
1
|
+
## Triez
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/luikore/triez.png)](https://travis-ci.org/luikore/triez)
|
4
|
+
[![Code Climate](https://codeclimate.com/github/luikore/triez.png)](https://codeclimate.com/github/luikore/triez)
|
5
|
+
[![Gem Version](https://badge.fury.io/rb/triez.png)](http://badge.fury.io/rb/triez)
|
6
|
+
|
7
|
+
Pragmatic [tries](http://en.wikipedia.org/wiki/Trie) for Ruby, spelled in lolcat.
|
8
|
+
|
9
|
+
It is fast, memory efficient, unicode aware, prefix searchable, and enhanced with prefix/suffix/substring keys.
|
10
|
+
|
11
|
+
The backend of *triez* is a cache oblivious data structure: the [HAT trie](https://github.com/dcjones/hat-trie) (In fact it is a [modified version](https://github.com/luikore/hat-trie) for improved functionality). HAT trie is generally faster and more memory efficient than [double array](http://linux.thai.net/~thep/datrie/datrie.html) or [burst trie](http://ww2.cs.mu.oz.au/~jz/fulltext/acmtois02.pdf).
|
12
|
+
|
13
|
+
## Requirement
|
14
|
+
|
15
|
+
- CRuby 1.9 / 2.0
|
16
|
+
- `g++` or `clang`
|
17
|
+
|
18
|
+
## Install
|
19
|
+
|
20
|
+
``` bash
|
21
|
+
gem ins triez
|
22
|
+
```
|
23
|
+
|
24
|
+
## Synopsis
|
25
|
+
|
26
|
+
``` ruby
|
27
|
+
require 'triez'
|
28
|
+
|
29
|
+
# create triez
|
30
|
+
t = Triez.new
|
31
|
+
|
32
|
+
# the above code is equivalent to :int64 for :value_type and 0 for :default
|
33
|
+
t = Triez.new value_type: :int64
|
34
|
+
|
35
|
+
# more flexible with object type [*see note below]
|
36
|
+
t = Triez.new value_type: :object
|
37
|
+
|
38
|
+
# get the value type
|
39
|
+
t.value_type
|
40
|
+
|
41
|
+
# set a different default value
|
42
|
+
t = Triez.new value_type: :object, default: 'hello'
|
43
|
+
|
44
|
+
# insert or change value
|
45
|
+
t['key'] = 100
|
46
|
+
|
47
|
+
# insert a key with default value
|
48
|
+
t << 'key'
|
49
|
+
|
50
|
+
# batch change values under all suffixes/prefixes/substrings of a key
|
51
|
+
t.change_all(:suffix, 'key') {|old_value| ...calculate new value }
|
52
|
+
t.change_all(:prefix, 'key') {|old_value| ...calculate new value }
|
53
|
+
# enumerates all occurrences of substrings of the key
|
54
|
+
t.change_all(:substring, 'key') {|old_value| ...calculate new value }
|
55
|
+
|
56
|
+
# size of inserted keys
|
57
|
+
t.size
|
58
|
+
|
59
|
+
# search with exact match
|
60
|
+
t.has_key? 'key'
|
61
|
+
t['key']
|
62
|
+
|
63
|
+
# prefixed search (iterate over values under a prefix), available options are:
|
64
|
+
# - limit: max items, `nil` means no limit
|
65
|
+
# - sort: whether to iterate in alphabetical order, default is true
|
66
|
+
t.search_with_prefix(prefix, limit: 10, sort: true) do |suffix, value|
|
67
|
+
...
|
68
|
+
end
|
69
|
+
|
70
|
+
# if no block given, an array in the form of [[suffix, value]] is returned
|
71
|
+
t.search_with_prefix('prefix')
|
72
|
+
|
73
|
+
# enumerate all keys and values in the order of binary collation
|
74
|
+
t.each do |key, value|
|
75
|
+
...
|
76
|
+
end
|
77
|
+
|
78
|
+
# iterate stored keys which are prefixes of a given string, from shallow to deep
|
79
|
+
t.walk string do |k, v|
|
80
|
+
...
|
81
|
+
end
|
82
|
+
```
|
83
|
+
|
84
|
+
\* Note: By default, *triez* stores signed 64-bit integers, which you can use as weights, counts or database IDs. In case you need to store arbitrary objects in a node, use `value_type: :object`:
|
85
|
+
|
86
|
+
``` ruby
|
87
|
+
t = Triez.new value_type: :object
|
88
|
+
t['Tom'] = {name: 'Tom', sex: 'Female'}
|
89
|
+
t['Tree'] = [:leaf, :trunk, :root]
|
90
|
+
```
|
91
|
+
|
92
|
+
## Examples
|
93
|
+
|
94
|
+
**Prefix based autocompletion**:
|
95
|
+
|
96
|
+
``` ruby
|
97
|
+
require 'triez'
|
98
|
+
words = %w[readme rot red rah rasterization]
|
99
|
+
t = Triez.new
|
100
|
+
words.each do |word|
|
101
|
+
t[word] = 1
|
102
|
+
end
|
103
|
+
t.search_with_prefix 're' do |suffix|
|
104
|
+
puts "candidate: re#{suffix}"
|
105
|
+
end
|
106
|
+
```
|
107
|
+
|
108
|
+
The output:
|
109
|
+
|
110
|
+
```bash
|
111
|
+
candidate: readme
|
112
|
+
candidate: red
|
113
|
+
```
|
114
|
+
|
115
|
+
---
|
116
|
+
|
117
|
+
**Efficient [full text search](https://en.wikipedia.org/wiki/Full_text_search) with a [suffix tree](https://en.wikipedia.org/wiki/Suffix_tree)**:
|
118
|
+
|
119
|
+
``` ruby
|
120
|
+
require 'triez'
|
121
|
+
sequences = {
|
122
|
+
'ACTGAAAAAAACTG' => 1,
|
123
|
+
'ATACGGTCCA' => 2,
|
124
|
+
'GCTTGTACGT' => 3
|
125
|
+
}
|
126
|
+
t = Triez.new
|
127
|
+
|
128
|
+
# build suffix tree
|
129
|
+
sequences.each do |seq, id|
|
130
|
+
t.change_all(:suffix, seq){id}
|
131
|
+
end
|
132
|
+
|
133
|
+
t.search_with_prefix 'CGGT' do |_, id|
|
134
|
+
puts id #=> 2
|
135
|
+
end
|
136
|
+
```
|
137
|
+
|
138
|
+
The search time is linear in the length of the substring. You may also be interested in the example of a simple [full text search server](https://github.com/luikore/triez/tree/master/examples/full-text-search-server) with *triez*.
|
139
|
+
|
140
|
+
---
|
141
|
+
|
142
|
+
**Solve the [longest common substring problem](https://en.wikipedia.org/wiki/Longest_common_substring_problem)**:
|
143
|
+
|
144
|
+
``` ruby
|
145
|
+
# coding: utf-8
|
146
|
+
require 'triez'
|
147
|
+
sentences = %w[
|
148
|
+
万塘路一锅鸡
|
149
|
+
去文二路一锅鸡吃饭
|
150
|
+
来一锅鸡顶盒
|
151
|
+
一锅鸡胗
|
152
|
+
]
|
153
|
+
|
154
|
+
# value is bitset representing id of the sentence
|
155
|
+
# in ruby we can use integers of arbitrary length as bitsets
|
156
|
+
t = Triez.new value_type: :object, default: 0
|
157
|
+
|
158
|
+
sentences.each_with_index do |sentence, i|
|
159
|
+
elem = 1 << i
|
160
|
+
t.change_all :substring, sentence do |v|
|
161
|
+
# union
|
162
|
+
v | elem
|
163
|
+
end
|
164
|
+
end
|
165
|
+
|
166
|
+
# longest common substring
|
167
|
+
lcs = ''
|
168
|
+
|
169
|
+
# find the key tagged with universe
|
170
|
+
universe = (1 << sentences.size) - 1
|
171
|
+
t.each do |k, v|
|
172
|
+
lcs = k if k.size > lcs.size and v == universe
|
173
|
+
end
|
174
|
+
|
175
|
+
puts lcs #=> 一锅鸡
|
176
|
+
```
|
177
|
+
|
178
|
+
## Benchmark
|
179
|
+
|
180
|
+
Here's a benchmark on
|
181
|
+
|
182
|
+
```ruby
|
183
|
+
ruby 1.9.3p374 (2013-01-15 revision 38858) [x86_64-darwin12.2.1]
|
184
|
+
2.3 GHz Intel Core i7
|
185
|
+
```
|
186
|
+
|
187
|
+
The test data are 3 million titles of Wikipedia articles (from http://dumps.wikimedia.org/enwiki/20121101/)
|
188
|
+
|
189
|
+
```
|
190
|
+
thing/backend | memory | insertion time | 3 M query
|
191
|
+
------------------------|---------|----------------|----------
|
192
|
+
hash/linked hash | 340.2 M | 4.369 s | 0.2800 s
|
193
|
+
fast_trie/double array* | 155.6 M | 130.7 s | 0.4359 s
|
194
|
+
triez/HAT trie | 121.7 M | 3.872 s | 0.3472 s
|
195
|
+
```
|
196
|
+
|
197
|
+
Note: `fast_trie/double array` -> https://github.com/tyler/trie
|
198
|
+
|
199
|
+
## Caveats
|
200
|
+
|
201
|
+
- The `sort` option in prefixed search orders keys with binary [collation](https://en.wikipedia.org/wiki/Collation), but string comparison in Ruby is with unicode codepoint collation.
|
202
|
+
- In the rare case where many threads modify the same trie, you may need a mutex.
|
203
|
+
- If memory usage is still too high, you may consider [MARISA-trie](https://code.google.com/p/marisa-trie/) (note that MARISA is immutable), or a database.
|
204
|
+
|
205
|
+
## Development
|
206
|
+
|
207
|
+
``` bash
|
208
|
+
git clone git://github.com/luikore/triez.git
|
209
|
+
cd triez
|
210
|
+
rake glob_src
|
211
|
+
rake
|
212
|
+
```
|
213
|
+
|
214
|
+
To update vendor lib and re-compile:
|
215
|
+
|
216
|
+
``` bash
|
217
|
+
rake glob_src
|
218
|
+
rake
|
219
|
+
```
|
220
|
+
|
221
|
+
## Note
|
222
|
+
|
223
|
+
Although HAT trie uses MurmurHash3 instead of the SipHash used by Ruby, it is still safe against hashDoS because bucket size is limited.
|
data/test/triez_test.rb
ADDED
@@ -0,0 +1,225 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require "test/unit"
|
3
|
+
require_relative "../lib/triez"
|
4
|
+
|
5
|
+
GC.stress
|
6
|
+
|
7
|
+
class TriezTest < Test::Unit::TestCase
|
8
|
+
def test_init_type_options
|
9
|
+
t = Triez.new value_type: :int64
|
10
|
+
assert_equal :int64, t.value_type
|
11
|
+
t = Triez.new value_type: :object
|
12
|
+
assert_equal :object, t.value_type
|
13
|
+
t = Triez.new
|
14
|
+
assert_equal :int64, t.value_type
|
15
|
+
|
16
|
+
assert_raise ArgumentError do
|
17
|
+
Triez.new value_type: :string
|
18
|
+
end
|
19
|
+
assert_raise ArgumentError do
|
20
|
+
Triez.new invalid_option: :int64
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def test_hat_trie
|
25
|
+
t = Triez.new value_type: :object
|
26
|
+
|
27
|
+
v1 = (1 << 40)
|
28
|
+
v2 = (1 << 141)
|
29
|
+
t['万塘路一锅鸡'] = v1
|
30
|
+
t['万塘路'] = v2
|
31
|
+
assert_equal v1, t['万塘路一锅鸡']
|
32
|
+
assert_equal v2, t['万塘路']
|
33
|
+
assert_equal nil, t['万']
|
34
|
+
assert_equal false, t.has_key?('万')
|
35
|
+
assert_equal true, t.has_key?('万塘路')
|
36
|
+
|
37
|
+
assert_equal v1, t.delete('万塘路一锅鸡')
|
38
|
+
assert_equal nil, t['万塘路一锅鸡']
|
39
|
+
assert_equal v2, t['万塘路']
|
40
|
+
|
41
|
+
a = t.search_with_prefix ''
|
42
|
+
assert_equal [['万塘路', v2]], a
|
43
|
+
|
44
|
+
t['马当路'] = 3
|
45
|
+
a = t.search_with_prefix '万塘'
|
46
|
+
assert_equal [['路', v2]], a
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_insertion_and_search_on_many_keys
|
50
|
+
t = Triez.new
|
51
|
+
as = ('A'..'z').to_a
|
52
|
+
bs = ('一'..'百').to_a
|
53
|
+
as.each do |a|
|
54
|
+
# 10k chars to ensure burst
|
55
|
+
bs.each do |b|
|
56
|
+
t[a + b] = 0
|
57
|
+
end
|
58
|
+
end
|
59
|
+
assert_equal as.size * bs.size, t.size
|
60
|
+
|
61
|
+
a = t.search_with_prefix 'a'
|
62
|
+
assert_equal bs.to_a, a.map(&:first).sort
|
63
|
+
|
64
|
+
a = []
|
65
|
+
t.search_with_prefix 'b', sort: true, limit: 3 do |k, v|
|
66
|
+
a << k
|
67
|
+
end
|
68
|
+
assert_equal 3, a.size
|
69
|
+
assert_equal a, a.sort
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_each_and_raise
|
73
|
+
t = Triez.new
|
74
|
+
t['abcd'] = 0
|
75
|
+
t['abc'] = 1
|
76
|
+
|
77
|
+
assert_raise NameError do
|
78
|
+
t.each do |k, v|
|
79
|
+
raise NameError, k
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
assert_raise ArgumentError do
|
84
|
+
t.each
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
def test_append
|
89
|
+
t = Triez.new
|
90
|
+
('a'..'z').each do |c|
|
91
|
+
t << c
|
92
|
+
end
|
93
|
+
assert_equal 26, t.size
|
94
|
+
assert_equal 0, t['c']
|
95
|
+
assert_equal true, t.has_key?('c')
|
96
|
+
end
|
97
|
+
|
98
|
+
def test_full_text_search
|
99
|
+
sequences = {
|
100
|
+
'ACTGAAAAAAACTG' => 1,
|
101
|
+
'ATACGGTCCA' => 2,
|
102
|
+
'GCTTGTACGT' => 3
|
103
|
+
}
|
104
|
+
t = Triez.new
|
105
|
+
sequences.each do |seq, id|
|
106
|
+
t.change_all(:suffix, seq){ id }
|
107
|
+
end
|
108
|
+
assert_equal 2, t.search_with_prefix('CGGT').map(&:last).flatten.first
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_nul_char_in_keys
|
112
|
+
t = Triez.new
|
113
|
+
t["a\0b"] = 1
|
114
|
+
assert_equal 1, t["a\0b"]
|
115
|
+
assert_equal 1, t.size
|
116
|
+
assert_equal 0, t["a"]
|
117
|
+
end
|
118
|
+
|
119
|
+
def test_change_all_with_prefix
|
120
|
+
default = 10
|
121
|
+
t = Triez.new default: default
|
122
|
+
t['regexp'] = 1
|
123
|
+
t['readme'] = 2
|
124
|
+
t.change_all :prefix, 'readme' do |v|
|
125
|
+
v += 4
|
126
|
+
end
|
127
|
+
assert_equal 'readme'.size + 1, t.size
|
128
|
+
assert_equal 6, t['readme']
|
129
|
+
assert_equal default + 4, t['read']
|
130
|
+
assert_equal 1, t['regexp']
|
131
|
+
end
|
132
|
+
|
133
|
+
def test_change_all_with_suffix
|
134
|
+
t = Triez.new
|
135
|
+
t['regexp'] = 1
|
136
|
+
t['exp'] = 2
|
137
|
+
t['reg'] = 3
|
138
|
+
t.change_all :suffix, 'regexp' do |v|
|
139
|
+
v += 4
|
140
|
+
end
|
141
|
+
assert_equal 5, t['regexp']
|
142
|
+
assert_equal 6, t['exp']
|
143
|
+
assert_equal 3, t['reg']
|
144
|
+
assert_equal 'regexp'.size + 1, t.size
|
145
|
+
end
|
146
|
+
|
147
|
+
def test_change_all_with_substring
|
148
|
+
t = Triez.new value_type: :object
|
149
|
+
t.change_all :substring, 'abc' do
|
150
|
+
1
|
151
|
+
end
|
152
|
+
|
153
|
+
keys = []
|
154
|
+
t.each do |k, v|
|
155
|
+
keys << k
|
156
|
+
end
|
157
|
+
assert_equal %w[a b c ab bc abc].sort, keys.sort
|
158
|
+
end
|
159
|
+
|
160
|
+
def test_walk
|
161
|
+
urls = %w[
|
162
|
+
/users/
|
163
|
+
/users/12/edit
|
164
|
+
/posts
|
165
|
+
]
|
166
|
+
t = Triez.new value_type: :object
|
167
|
+
urls.each_with_index do |url, i|
|
168
|
+
t[url] = i.to_s
|
169
|
+
end
|
170
|
+
|
171
|
+
assert_equal [%w'/users/ 0'], t.walk('/users/12/delete').to_a
|
172
|
+
assert_equal [%w'/users/ 0', %w'/users/12/edit 1'], t.walk('/users/12/edit').to_a
|
173
|
+
assert_equal [%w'/users/ 0', %w'/users/12/edit 1'], t.walk('/users/12/edit/3').to_a
|
174
|
+
|
175
|
+
assert_raise TypeError do
|
176
|
+
t.walk :'/post' do
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
t.walk '' do |k, v|
|
181
|
+
assert_equal [nil, nil], [k, v]
|
182
|
+
end
|
183
|
+
|
184
|
+
# try to trigger rb_gc_mark(), it can stuck if hattrie_iter_next() not called properly
|
185
|
+
100000.times{ 'a' + 'b' }
|
186
|
+
end
|
187
|
+
|
188
|
+
def test_solve_longest_common_substring
|
189
|
+
sentences = %w[
|
190
|
+
万塘路一锅鸡
|
191
|
+
文二路一锅鸡
|
192
|
+
来一锅鸡顶盒
|
193
|
+
一锅鸡胗
|
194
|
+
]
|
195
|
+
|
196
|
+
# value is bitset representing id of the sentence
|
197
|
+
# in ruby we can use integers of arbitrary length as bitsets
|
198
|
+
t = Triez.new value_type: :object, default: 0
|
199
|
+
|
200
|
+
sentences.each_with_index do |sentence, i|
|
201
|
+
elem = 1 << i
|
202
|
+
t.change_all :substring, sentence do |v|
|
203
|
+
# union
|
204
|
+
v | elem
|
205
|
+
end
|
206
|
+
end
|
207
|
+
|
208
|
+
# longest common substring
|
209
|
+
lcs = ''
|
210
|
+
universe = (1 << sentences.size) - 1
|
211
|
+
t.each do |k, v|
|
212
|
+
lcs = k if (k.size > lcs.size and v == universe)
|
213
|
+
end
|
214
|
+
assert_equal '一锅鸡', lcs
|
215
|
+
end
|
216
|
+
|
217
|
+
def test_should_not_segfault_when_search_with_prefix
|
218
|
+
t = Triez.new
|
219
|
+
# bursts when 16384
|
220
|
+
16_385.times{ |i| t["a#{i}"] = i }
|
221
|
+
expected_postfices = 16_385.times.map &:to_s
|
222
|
+
actual_postfices = t.search_with_prefix("a").map(&:first)
|
223
|
+
assert_equal expected_postfices.sort, actual_postfices.sort
|
224
|
+
end
|
225
|
+
end
|
metadata
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wordtriez
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Zete Lui
|
9
|
+
- Duane Johnson
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2014-09-21 00:00:00.000000000 Z
|
14
|
+
dependencies: []
|
15
|
+
description: fast, efficient, unicode aware HAT trie with prefix / suffix support.
|
16
|
+
email:
|
17
|
+
executables: []
|
18
|
+
extensions:
|
19
|
+
- ext/extconf.rb
|
20
|
+
extra_rdoc_files: []
|
21
|
+
files:
|
22
|
+
- copying
|
23
|
+
- changes
|
24
|
+
- readme.md
|
25
|
+
- lib/wordtriez.rb
|
26
|
+
- test/triez_test.rb
|
27
|
+
- ext/triez.cc
|
28
|
+
- ext/common.h
|
29
|
+
- ext/extconf.rb
|
30
|
+
- ext/hat-trie/ahtable.c
|
31
|
+
- ext/hat-trie/ahtable.h
|
32
|
+
- ext/hat-trie/common.h
|
33
|
+
- ext/hat-trie/hat-trie.c
|
34
|
+
- ext/hat-trie/hat-trie.h
|
35
|
+
- ext/hat-trie/misc.c
|
36
|
+
- ext/hat-trie/misc.h
|
37
|
+
- ext/hat-trie/murmurhash3.c
|
38
|
+
- ext/hat-trie/murmurhash3.h
|
39
|
+
- ext/hat-trie/pstdint.h
|
40
|
+
- ext/hat-trie/text.c
|
41
|
+
- ext/hat-trie/text.h
|
42
|
+
homepage: https://github.com/canadaduane/triez
|
43
|
+
licenses: []
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.9.2
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ! '>='
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: '0'
|
60
|
+
requirements: []
|
61
|
+
rubyforge_project:
|
62
|
+
rubygems_version: 1.8.23
|
63
|
+
signing_key:
|
64
|
+
specification_version: 3
|
65
|
+
summary: fast, efficient, unicode aware HAT trie with prefix / suffix support
|
66
|
+
test_files: []
|
67
|
+
has_rdoc: false
|