trie-file 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +13 -0
- data/History.txt +3 -0
- data/README.md +77 -0
- data/Rakefile +18 -0
- data/lib/trie-file.rb +5 -0
- data/lib/trie-file/file.rb +232 -0
- data/lib/trie-file/node.rb +42 -0
- data/lib/trie-file/trie-file.rb +0 -0
- data/lib/trie-file/trie.rb +54 -0
- data/lib/trie-file/version.rb +5 -0
- data/spec/file_spec.rb +154 -0
- data/spec/node_spec.rb +98 -0
- data/spec/spec_helper.rb +19 -0
- data/spec/trie_spec.rb +59 -0
- data/trie-file.gemspec +18 -0
- metadata +58 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 8bc2e8641b2afc2c19a2e519a6af559efdf08a97
|
4
|
+
data.tar.gz: 85c0da8ef2d170d9588525adcccb6edad71ebad6
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 3bab7cec0c62f28a54fce93b5e8f9eabc05390fd67c077317e0477663bcbe5bc9aa0b4feb15c50528c2fdc241183b8e4290faeea6d79aed6d04e2b5019645891
|
7
|
+
data.tar.gz: 55ec9c2d427f8a399a4470905dfb1c78ac2a37d799af20531b5525128e4da0707f6256db5e05531e37aaff7dc2b3612a106009d35a1017479ad54239ddc93d72
|
data/Gemfile
ADDED
data/History.txt
ADDED
data/README.md
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
trie-file
|
2
|
+
=========
|
3
|
+
|
4
|
+
Memory-efficient cached trie and trie storage.
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
`gem install trie-file`
|
9
|
+
|
10
|
+
Then, somewhere in your code:
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
require 'trie-file'
|
14
|
+
```
|
15
|
+
|
16
|
+
## Rationale
|
17
|
+
|
18
|
+
trie-file contains two things: an implementation of the [trie data structure](http://en.wikipedia.org/wiki/Trie), and a way to write tries to disk and read them back again. It tries (ha!) to do this in a memory-efficient way by packing the trie structure in a specialized binary form. This special packing method means the trie can be searched entirely _on disk_ (linear time) without needing to load the whole structure into memory. Each key you look up is cached, so subsequent accesses are even faster (constant time). trie-file is also capable of reading and writing entire trie structures.
|
19
|
+
|
20
|
+
Because tries (also known as prefix trees) rely on keys having common prefixes, you're required to use string keys. There are no type restrictions on values.
|
21
|
+
|
22
|
+
## What's a Trie?
|
23
|
+
|
24
|
+
For an in-depth explanation, see the Wikipedia link above. Essentially, tries are key-value data structures that work similarly to Ruby hashes. You add a key and a value to the trie and can later retrieve the value using the same key.
|
25
|
+
|
26
|
+
## Basic Usage
|
27
|
+
|
28
|
+
Create a trie and write it to disk:
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
trie = TrieFile::Trie.new
|
32
|
+
trie.add('foo', 'bar')
|
33
|
+
|
34
|
+
TrieFile::File.open('/path/to/file', 'wb') do |f|
|
35
|
+
f.write_trie(trie)
|
36
|
+
end
|
37
|
+
```
|
38
|
+
|
39
|
+
Open a file handle to a trie and search it _on disk_:
|
40
|
+
|
41
|
+
```ruby
|
42
|
+
trie_file = TrieFile::File.open('/path/to/file', 'rb')
|
43
|
+
trie_file.find('foo') # => 'bar'
|
44
|
+
```
|
45
|
+
|
46
|
+
To read an entire trie into memory, use the `.read` class method instead of `.open`:
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
trie = TrieFile::File.read('/path/to/file')
|
50
|
+
```
|
51
|
+
|
52
|
+
## Choosing a Hash Method
|
53
|
+
|
54
|
+
By default, trie-file does not hash your keys. Instead, it iterates over each character in the key and constructs the internal trie structure. trie-file also supports hashing keys with the md5 or sha1 algorithms to minimize your search space:
|
55
|
+
|
56
|
+
```ruby
|
57
|
+
trie = TrieFile::Trie.new(nil, :sha1)
|
58
|
+
```
|
59
|
+
|
60
|
+
If you wrote a trie to disk that was hashed using sha1, you'll need to supply the hash mode as an additional argument to `.open` and `.read`:
|
61
|
+
|
62
|
+
```ruby
|
63
|
+
trie_file = TrieFile::File.open('/path/to/file', 'rb', :sha1)
|
64
|
+
trie = TrieFile::File.read('/path/to/file', :sha1)
|
65
|
+
```
|
66
|
+
|
67
|
+
## Requirements
|
68
|
+
|
69
|
+
No external requirements.
|
70
|
+
|
71
|
+
## Running Tests
|
72
|
+
|
73
|
+
`bundle exec rspec` should do the trick :)
|
74
|
+
|
75
|
+
## Authors
|
76
|
+
|
77
|
+
* Cameron C. Dutro: http://github.com/camertron
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'rubygems' unless ENV['NO_RUBYGEMS']
|
4
|
+
|
5
|
+
require 'bundler'
|
6
|
+
require 'rspec/core/rake_task'
|
7
|
+
require 'rubygems/package_task'
|
8
|
+
|
9
|
+
require './lib/trie-file'
|
10
|
+
|
11
|
+
Bundler::GemHelper.install_tasks
|
12
|
+
|
13
|
+
task :default => :spec
|
14
|
+
|
15
|
+
desc 'Run specs'
|
16
|
+
RSpec::Core::RakeTask.new do |t|
|
17
|
+
t.pattern = './spec/**/*_spec.rb'
|
18
|
+
end
|
data/lib/trie-file.rb
ADDED
@@ -0,0 +1,232 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'thread'
|
4
|
+
|
5
|
+
# root:
|
6
|
+
# 2b value length
|
7
|
+
# nb value
|
8
|
+
# 2b (number of children)
|
9
|
+
# children metadata:
|
10
|
+
# 3b letter
|
11
|
+
# 3b child location
|
12
|
+
# children:
|
13
|
+
# node:
|
14
|
+
# 2b value length
|
15
|
+
# ...
|
16
|
+
|
17
|
+
module TrieFile
  # Reads and writes tries using a packed binary layout, and can look up a
  # single key directly on disk without loading the whole trie.
  #
  # Every node is serialized as:
  #
  #   2 bytes  value length (n)
  #   n bytes  value
  #   2 bytes  number of children
  #   per child: 3 bytes letter (left-padded with zero bytes),
  #              3 bytes absolute byte position of the child node
  #
  # followed by the child nodes themselves, depth first, in the same order as
  # the child table. All integers are big-endian.
  class File
    BYTE_LENGTH = 8
    LETTER_FIELD_LENGTH = 3
    POSITION_FIELD_LENGTH = 3
    # First position that no longer fits in POSITION_FIELD_LENGTH bytes.
    POSITION_MAX = 2 ** (BYTE_LENGTH * POSITION_FIELD_LENGTH)
    VALUE_FIELD_LENGTH = 2
    CHILD_COUNT_FIELD_LENGTH = 2

    attr_reader :handle, :hash_mode

    # Opens +path+ and wraps the handle in a TrieFile::File.
    #
    # When a block is given the file is yielded and the handle is closed
    # afterwards, even if the block raises. The (possibly closed) wrapper is
    # returned in both cases.
    #
    # Raises ArgumentError if +mode+ is not a binary mode; the handle is
    # closed before raising so it does not leak.
    def self.open(path, mode, hash_mode = :none)
      handle = ::File.open(path, mode)

      unless handle.binmode?
        handle.close
        raise ArgumentError, 'TrieFile must be opened in binary mode.'
      end

      file = new(handle, hash_mode)
      return file unless block_given?

      begin
        yield file
      ensure
        handle.close unless handle.closed?
      end

      file
    end

    # Reads the whole trie stored at +path+ into memory and returns it as a
    # Trie configured with +hash_mode+.
    def self.read(path, hash_mode = :none)
      root = ::File.open(path, 'rb') { |io| read_node(io) }
      Trie.new(root, hash_mode)
    end

    # Reads a length-prefixed value from +io+ at its current position.
    def self.read_value(io)
      # 2b value length, then nb value
      io.read(read_int(io, VALUE_FIELD_LENGTH))
    end

    # Reads a node's value and child table from +io+.
    #
    # Returns [value, child_metadata] where child_metadata is an array of
    # [letter, child_position] pairs.
    def self.read_node_header(io)
      value = read_value(io)

      # 2b number of children
      number_of_children = read_int(io, CHILD_COUNT_FIELD_LENGTH)

      child_metadata = Array.new(number_of_children) do
        # 3b letter
        letter = read_bytes(io, LETTER_FIELD_LENGTH)

        # 3b child location
        [letter, read_int(io, POSITION_FIELD_LENGTH)]
      end

      [value, child_metadata]
    end

    # Recursively reads a node and all of its descendants from +io+. Relies
    # on children being stored depth first, directly after their parent.
    def self.read_node(io)
      value, child_metadata = read_node_header(io)
      node = Node.new(value)

      child_metadata.each do |letter, _position|
        node.add_child(letter, read_node(io))
      end

      node
    end

    # Recursively writes +node+ and its descendants to +io+. The nodes must
    # already carry their byte positions (see #write_trie).
    #
    # Raises RuntimeError if a letter or a child position does not fit in its
    # fixed-width field.
    def self.write_node(node, io)
      # 2b value length
      write_int(io, node.value_bytesize, VALUE_FIELD_LENGTH)

      # nb value
      write_bytes(io, node.value_bytes.to_a)

      # 2b number of children
      write_int(io, node.children.size, CHILD_COUNT_FIELD_LENGTH)

      # child table
      node.children.each_pair do |letter, child_node|
        # 3b letter
        if letter.bytesize > LETTER_FIELD_LENGTH
          raise "Letter #{letter} is larger than #{LETTER_FIELD_LENGTH} bytes."
        end

        write_bytes(io, letter.bytes.to_a, LETTER_FIELD_LENGTH)

        # 3b child location; POSITION_MAX itself already needs one more byte
        if child_node.byte_pos >= POSITION_MAX
          raise "Encountered write position greater than #{POSITION_FIELD_LENGTH} bytes."
        end

        write_int(io, child_node.byte_pos, POSITION_FIELD_LENGTH)
      end

      node.children.each_value { |child_node| write_node(child_node, io) }
    end

    # Writes +int+ to +io+ as a big-endian integer, left-padded with zero
    # bytes to +bytesize+ bytes.
    #
    # Raises RuntimeError if +int+ needs more than +bytesize+ bytes, since
    # writing it anyway would shift every following field.
    def self.write_int(io, int, bytesize = int_bytesize(int))
      actual_bytesize = int_bytesize(int)

      if actual_bytesize > bytesize
        raise "Integer #{int} does not fit in #{bytesize} bytes."
      end

      io.write("\0" * (bytesize - actual_bytesize))

      (actual_bytesize - 1).downto(0) do |i|
        io.putc((int >> (i * BYTE_LENGTH)) & 0xFF)
      end
    end

    # Returns the minimum number of bytes needed to store the non-negative
    # integer +int+ (0 for zero). Uses integer arithmetic so large values are
    # not affected by floating point rounding.
    def self.int_bytesize(int)
      (int.bit_length + BYTE_LENGTH - 1) / BYTE_LENGTH
    end

    # Writes +bytes+ (an array of integers) to +io+, left-padded with zero
    # bytes to +bytesize+ bytes.
    def self.write_bytes(io, bytes, bytesize = bytes.size)
      padding = bytesize - bytes.size
      io.write("\0" * padding) if padding > 0
      bytes.each { |byte| io.putc(byte) }
    end

    # Reads a big-endian integer of +bytesize+ bytes from +io+.
    def self.read_int(io, bytesize)
      bytesize.times.inject(0) do |sum, _|
        (sum << BYTE_LENGTH) | io.readbyte
      end
    end

    # Reads a zero-padded letter field of +bytesize+ bytes from +io+ and
    # returns the letter as a UTF-8 string.
    def self.read_bytes(io, bytesize)
      bytes = Array.new(bytesize) { io.readbyte }

      # remove leading zero bytes; a field of only zeros is the NUL letter
      significant = bytes.drop_while { |byte| byte == 0 }
      significant = [0] if significant.empty?

      # The field holds the letter's raw UTF-8 bytes, so pack them as bytes
      # ("C*") rather than as code points ("U*").
      significant.pack('C*').force_encoding(Encoding::UTF_8)
    end

    # handle    - an IO opened in binary mode
    # hash_mode - the hash mode the stored trie was built with
    def initialize(handle, hash_mode)
      @handle = handle
      @semaphore = Mutex.new
      @hash_mode = hash_mode
    end

    # Serializes +trie+ to the underlying handle.
    def write_trie(trie)
      mark(trie)
      self.class.write_node(trie.root, handle)
    end

    # Looks up +key+ by walking the serialized trie through the handle.
    #
    # Returns the stored value, or nil when no node exists for the key.
    # Successful lookups are cached; misses are not.
    #
    # NOTE(review): a node without a value is stored with a zero-length
    # value, so a key that is only a prefix of stored keys yields an empty
    # string here rather than nil.
    #
    # Raises IOError if the handle has been closed.
    def find(key)
      raise IOError, 'file is not currently open.' if closed?

      @semaphore.synchronize do
        hashed = hash_key(key)

        cache.fetch(hashed) do
          handle.seek(0, IO::SEEK_SET)

          hashed.each_char do |char|
            _value, child_metadata = self.class.read_node_header(handle)
            metadata = child_metadata.find { |letter, _position| letter == char }

            return nil unless metadata
            handle.seek(metadata.last, IO::SEEK_SET)
          end

          cache[hashed] = self.class.read_value(handle)
        end
      end
    end

    # True when the underlying handle has been closed.
    def closed?
      @handle.closed?
    end

    # Closes the underlying handle.
    def close
      handle.close
    end

    private

    def hash_key(key)
      Trie.hash_key(key, hash_mode)
    end

    def cache
      @cache ||= {}
    end

    # Assigns an absolute byte position to every node of +trie+.
    def mark(trie)
      mark_node(trie.root, 0)
    end

    # Stores +byte_pos+ on +node+, positions its descendants directly after
    # it (depth first) and returns the total byte size of those descendants.
    def mark_node(node, byte_pos)
      node.byte_pos = byte_pos

      node.children.each_value.inject(0) do |descendants_size, child|
        child_start = byte_pos + node.bytesize + descendants_size
        descendants_size + child.bytesize + mark_node(child, child_start)
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
module TrieFile
  # A single trie node: an optional value plus child nodes keyed by letter.
  # Also knows how many bytes it occupies when serialized and remembers the
  # byte position assigned to it.
  class Node
    # Serialized size of one child table entry.
    CHILD_FIELDS_LENGTH = 6
    # Serialized size of the fixed per-node fields.
    HEADER_FIELD_LENGTH = 4

    attr_accessor :value, :byte_pos
    attr_reader :children

    def initialize(value = nil)
      @byte_pos = 0
      @children = {}
      @value = value
    end

    # True if a child is registered under +char+.
    def has_child?(char)
      children.key?(char)
    end

    # Returns the child registered under +char+, or nil.
    def child_at(char)
      children[char]
    end

    # Registers +node+ under +char+ and returns +node+.
    def add_child(char, node)
      children.store(char, node)
    end

    # Number of bytes this node occupies on its own (descendants excluded).
    def bytesize
      child_table_size = CHILD_FIELDS_LENGTH * children.size
      HEADER_FIELD_LENGTH + child_table_size + value_bytesize
    end

    # Byte length of the value, or 0 when there is no value.
    def value_bytesize
      return 0 unless value
      value.bytesize
    end

    # Bytes of the value, or an empty array when there is no value.
    def value_bytes
      return [] unless value
      value.bytes
    end
  end
end
|
File without changes
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'digest/md5'
|
4
|
+
require 'digest/sha1'
|
5
|
+
|
6
|
+
module TrieFile
  # In-memory trie keyed by strings. Keys may optionally be replaced by
  # their MD5 or SHA1 hex digest before being inserted or looked up.
  class Trie
    attr_reader :root, :hash_mode

    # root      - an existing root node, or nil to start with an empty trie
    # hash_mode - :md5, :sha1, or anything else for unhashed keys
    def initialize(root = nil, hash_mode = :none)
      @root = root || Node.new
      @hash_mode = hash_mode
    end

    # Stores +value+ under +str+, creating intermediate nodes as needed.
    def add(str, value)
      leaf = hash_key(str).each_char.inject(root) do |node, char|
        node.has_child?(char) ? node.child_at(char) : node.add_child(char, Node.new)
      end

      leaf.value = value
    end

    # Returns the value stored at +key+, or nil when the path does not exist.
    def find(key)
      target = hash_key(key).each_char.inject(root) do |node, char|
        child = node.child_at(char)
        return nil unless child
        child
      end

      target.value
    end

    # Applies this trie's hash mode to +key+.
    def hash_key(key)
      self.class.hash_key(key, hash_mode)
    end

    # Returns the hex digest of +key+ for the :md5 and :sha1 modes, and
    # +key+ itself for any other mode.
    def self.hash_key(key, hash_mode)
      digest = { :md5 => Digest::MD5, :sha1 => Digest::SHA1 }[hash_mode]
      digest ? digest.hexdigest(key) : key
    end
  end
end
|
data/spec/file_spec.rb
ADDED
@@ -0,0 +1,154 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe TrieFile::File do
|
6
|
+
def file
|
7
|
+
TrieFile::File
|
8
|
+
end
|
9
|
+
|
10
|
+
def trie
|
11
|
+
TrieFile::Trie
|
12
|
+
end
|
13
|
+
|
14
|
+
let(:tmpdir) { './' }
|
15
|
+
let(:tmpfile) { File.join(tmpdir, 'test.txt') }
|
16
|
+
|
17
|
+
after(:each) do
|
18
|
+
File.unlink(tmpfile) if File.exist?(tmpfile)
|
19
|
+
end
|
20
|
+
|
21
|
+
describe 'self#open' do
|
22
|
+
it "raises an exception if the file isn't opened in binary mode" do
|
23
|
+
proc = lambda { file.open(tmpfile, 'w') }
|
24
|
+
expect(proc).to raise_error(ArgumentError, 'TrieFile must be opened in binary mode.')
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'yields the file when a block is given and closes then returns it afterwards' do
|
28
|
+
f = file.open(tmpfile, 'wb') do |f|
|
29
|
+
expect(f).to be_a(file)
|
30
|
+
expect(f).to respond_to(:write_trie)
|
31
|
+
expect(f).to_not be_closed
|
32
|
+
end
|
33
|
+
|
34
|
+
expect(f).to be_a(file)
|
35
|
+
expect(f).to respond_to(:write_trie)
|
36
|
+
expect(f).to be_closed
|
37
|
+
end
|
38
|
+
|
39
|
+
it 'returns the open file when a block is not given' do
|
40
|
+
file.open(tmpfile, 'wb').tap do |f|
|
41
|
+
expect(f).to be_a(file)
|
42
|
+
expect(f).to_not be_closed
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
it 'uses the given hash mode when passed' do
|
47
|
+
file.open(tmpfile, 'wb') do |f|
|
48
|
+
f.write_trie(trie.new(nil, :md5).tap { |t| t.add('foo', 'bar') })
|
49
|
+
end
|
50
|
+
|
51
|
+
f = file.open(tmpfile, 'rb', :md5)
|
52
|
+
expect(f.find('foo')).to eq('bar')
|
53
|
+
f.close
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
describe 'self#read' do
|
58
|
+
let(:bytes) do
|
59
|
+
[
|
60
|
+
0, 0, 0, 1, 0, 0, 102, 0, 0, 10, 0, 0, 0, 1, 0, 0, 111, 0, 0,
|
61
|
+
20, 0, 0, 0, 1, 0, 0, 111, 0, 0, 30, 0, 3, 98, 97, 114, 0, 0
|
62
|
+
]
|
63
|
+
end
|
64
|
+
|
65
|
+
let(:sha1_bytes) do
|
66
|
+
[
|
67
|
+
0, 0, 0, 1, 0, 0, 48, 0, 0, 10, 0, 0, 0, 1, 0, 0, 98, 0, 0, 20,
|
68
|
+
0, 0, 0, 1, 0, 0, 101, 0, 0, 30, 0, 0, 0, 1, 0, 0, 101, 0, 0, 40,
|
69
|
+
0, 0, 0, 1, 0, 0, 99, 0, 0, 50, 0, 0, 0, 1, 0, 0, 55, 0, 0, 60,
|
70
|
+
0, 0, 0, 1, 0, 0, 98, 0, 0, 70, 0, 0, 0, 1, 0, 0, 53, 0, 0, 80,
|
71
|
+
0, 0, 0, 1, 0, 0, 101, 0, 0, 90, 0, 0, 0, 1, 0, 0, 97, 0, 0, 100,
|
72
|
+
0, 0, 0, 1, 0, 0, 51, 0, 0, 110, 0, 0, 0, 1, 0, 0, 102, 0, 0, 120,
|
73
|
+
0, 0, 0, 1, 0, 0, 48, 0, 0, 130, 0, 0, 0, 1, 0, 0, 102, 0, 0, 140,
|
74
|
+
0, 0, 0, 1, 0, 0, 100, 0, 0, 150, 0, 0, 0, 1, 0, 0, 98, 0, 0, 160,
|
75
|
+
0, 0, 0, 1, 0, 0, 99, 0, 0, 170, 0, 0, 0, 1, 0, 0, 57, 0, 0, 180,
|
76
|
+
0, 0, 0, 1, 0, 0, 53, 0, 0, 190, 0, 0, 0, 1, 0, 0, 100, 0, 0, 200,
|
77
|
+
0, 0, 0, 1, 0, 0, 48, 0, 0, 210, 0, 0, 0, 1, 0, 0, 100, 0, 0, 220,
|
78
|
+
0, 0, 0, 1, 0, 0, 100, 0, 0, 230, 0, 0, 0, 1, 0, 0, 52, 0, 0, 240,
|
79
|
+
0, 0, 0, 1, 0, 0, 55, 0, 0, 250, 0, 0, 0, 1, 0, 0, 102, 0, 1, 4, 0,
|
80
|
+
0, 0, 1, 0, 0, 51, 0, 1, 14, 0, 0, 0, 1, 0, 0, 99, 0, 1, 24, 0, 0,
|
81
|
+
0, 1, 0, 0, 53, 0, 1, 34, 0, 0, 0, 1, 0, 0, 98, 0, 1, 44, 0, 0, 0,
|
82
|
+
1, 0, 0, 99, 0, 1, 54, 0, 0, 0, 1, 0, 0, 50, 0, 1, 64, 0, 0, 0, 1,
|
83
|
+
0, 0, 55, 0, 1, 74, 0, 0, 0, 1, 0, 0, 53, 0, 1, 84, 0, 0, 0, 1, 0,
|
84
|
+
0, 100, 0, 1, 94, 0, 0, 0, 1, 0, 0, 97, 0, 1, 104, 0, 0, 0, 1, 0,
|
85
|
+
0, 56, 0, 1, 114, 0, 0, 0, 1, 0, 0, 97, 0, 1, 124, 0, 0, 0, 1, 0,
|
86
|
+
0, 51, 0, 1, 134, 0, 0, 0, 1, 0, 0, 51, 0, 1, 144, 0, 3, 98, 97,
|
87
|
+
114, 0, 0
|
88
|
+
]
|
89
|
+
end
|
90
|
+
|
91
|
+
it 'reads a trie from disk' do
|
92
|
+
File.open(tmpfile, 'wb') do |f|
|
93
|
+
bytes.each { |byte| f.putc(byte) }
|
94
|
+
end
|
95
|
+
|
96
|
+
t = file.read(tmpfile)
|
97
|
+
check_trie(t.root, 'foo', 'bar')
|
98
|
+
expect(t.find('foo')).to eq('bar')
|
99
|
+
end
|
100
|
+
|
101
|
+
it 'reads a sha1-hashed trie from disk' do
|
102
|
+
File.open(tmpfile, 'wb') do |f|
|
103
|
+
sha1_bytes.each { |byte| f.putc(byte) }
|
104
|
+
end
|
105
|
+
|
106
|
+
t = file.read(tmpfile)
|
107
|
+
expect(t.find('foo')).to be_nil
|
108
|
+
|
109
|
+
t = file.read(tmpfile, :sha1)
|
110
|
+
check_trie(t.root, Digest::SHA1.hexdigest('foo'), 'bar')
|
111
|
+
expect(t.find('foo')).to eq('bar')
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
describe '#write_trie' do
|
116
|
+
it 'should write the trie to disk' do
|
117
|
+
file.open(tmpfile, 'wb') do |f|
|
118
|
+
f.write_trie(trie.new.tap { |t| t.add('foo', 'bar') })
|
119
|
+
end
|
120
|
+
|
121
|
+
t = file.read(tmpfile)
|
122
|
+
check_trie(t.root, 'foo', 'bar')
|
123
|
+
expect(t.find('foo')).to eq('bar')
|
124
|
+
end
|
125
|
+
|
126
|
+
it 'uses the given hash mode when passed' do
|
127
|
+
file.open(tmpfile, 'wb') do |f|
|
128
|
+
f.write_trie(trie.new(nil, :md5).tap { |t| t.add('foo', 'bar') })
|
129
|
+
end
|
130
|
+
|
131
|
+
t = file.read(tmpfile)
|
132
|
+
check_trie(t.root, Digest::MD5.hexdigest('foo'), 'bar')
|
133
|
+
end
|
134
|
+
end
|
135
|
+
|
136
|
+
describe '#find' do
|
137
|
+
it 'should traverse the file on disk and find the value' do
|
138
|
+
file.open(tmpfile, 'wb') do |f|
|
139
|
+
f.write_trie(trie.new.tap { |t| t.add('foo', 'bar') })
|
140
|
+
end
|
141
|
+
|
142
|
+
# notice we're calling 'open' instead of 'read'
|
143
|
+
f = file.open(tmpfile, 'rb')
|
144
|
+
expect(f.find('foo')).to eq('bar')
|
145
|
+
f.close
|
146
|
+
end
|
147
|
+
|
148
|
+
it 'raises an error if the file is already closed, eg. if open is called with a block' do
|
149
|
+
File.open(tmpfile, 'w+') { |f| f.write('test') }
|
150
|
+
f = file.open(tmpfile, 'rb') {}
|
151
|
+
expect(lambda { f.find('foo') }).to raise_error(IOError, 'file is not currently open.')
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|
data/spec/node_spec.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe TrieFile::Node do
|
6
|
+
def create_node(value = nil)
|
7
|
+
node = TrieFile::Node.new(value)
|
8
|
+
yield node if block_given?
|
9
|
+
node
|
10
|
+
end
|
11
|
+
|
12
|
+
def header_field_length
|
13
|
+
TrieFile::Node::HEADER_FIELD_LENGTH
|
14
|
+
end
|
15
|
+
|
16
|
+
def child_fields_length
|
17
|
+
TrieFile::Node::CHILD_FIELDS_LENGTH
|
18
|
+
end
|
19
|
+
|
20
|
+
describe '#has_child?' do
|
21
|
+
it 'returns true if the node contains the child, false otherwise' do
|
22
|
+
node = create_node do |node|
|
23
|
+
node.add_child('a', create_node('foo'))
|
24
|
+
end
|
25
|
+
|
26
|
+
expect(node.has_child?('a')).to be(true)
|
27
|
+
expect(node.has_child?('b')).to be(false)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe '#child_at' do
|
32
|
+
it 'returns the child at the given letter, nil otherwise' do
|
33
|
+
child = create_node('foo')
|
34
|
+
node = create_node do |node|
|
35
|
+
node.add_child('a', child)
|
36
|
+
end
|
37
|
+
|
38
|
+
expect(node.child_at('a')).to be(child)
|
39
|
+
expect(node.child_at('b')).to be(nil)
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
describe '#add_child' do
|
44
|
+
it 'should add the child at the given letter' do
|
45
|
+
node = create_node
|
46
|
+
node.add_child('a', create_node('foo'))
|
47
|
+
expect(node.children).to include('a')
|
48
|
+
expect(node.children['a'].value).to eq('foo')
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
describe '#bytesize' do
|
53
|
+
it 'when no children and no value, returns just the header size' do
|
54
|
+
expect(create_node.bytesize).to eq(header_field_length)
|
55
|
+
end
|
56
|
+
|
57
|
+
it 'when no children and a value, returns the header size plus the size of the value' do
|
58
|
+
expect(create_node('foo').bytesize).to eq(header_field_length + 3)
|
59
|
+
end
|
60
|
+
|
61
|
+
it 'when a child and a value, returns the header size plus the size of the children plus the size of the value' do
|
62
|
+
expect(
|
63
|
+
create_node('foo') do |node|
|
64
|
+
node.add_child('a', create_node('foo'))
|
65
|
+
end.bytesize
|
66
|
+
).to eq(header_field_length + 3 + child_fields_length)
|
67
|
+
end
|
68
|
+
|
69
|
+
it 'when multiple children and a value, returns the header size plus the size of the children plus the size of the value' do
|
70
|
+
expect(
|
71
|
+
create_node('foo') do |node|
|
72
|
+
node.add_child('a', create_node('foo'))
|
73
|
+
node.add_child('b', create_node('bar'))
|
74
|
+
end.bytesize
|
75
|
+
).to eq(header_field_length + 3 + child_fields_length * 2)
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
describe '#value_bytesize' do
|
80
|
+
it 'returns the number of bytes in the value' do
|
81
|
+
expect(create_node('foo').value_bytesize).to eq(3)
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'returns zero if the value is nil' do
|
85
|
+
expect(create_node.value_bytesize).to eq(0)
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
describe '#value_bytes' do
|
90
|
+
it 'returns an enumerator of the bytes in the value' do
|
91
|
+
expect(create_node('foo').value_bytes.to_a).to eq([102, 111, 111])
|
92
|
+
end
|
93
|
+
|
94
|
+
it 'returns an empty array if the value is nil' do
|
95
|
+
expect(create_node.value_bytes).to eq([])
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'rspec'
|
4
|
+
require 'trie-file'
|
5
|
+
require 'pry-nav'
|
6
|
+
|
7
|
+
RSpec.configure do |config|
|
8
|
+
config.mock_with :rr
|
9
|
+
end
|
10
|
+
|
11
|
+
# Spec helper: walks +key+ letter by letter from +root+, expecting every node
# on the way to have exactly one child (the next letter), and expects the
# node reached at the end to hold +val+.
def check_trie(root, key, val)
  last_node = key.each_char.inject(root) do |current, letter|
    expect(current.children.size).to eq(1)
    expect(current.children).to include(letter)
    current.children[letter]
  end

  expect(last_node.value).to eq(val)
end
|
data/spec/trie_spec.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe TrieFile::Trie do
|
6
|
+
def trie
|
7
|
+
TrieFile::Trie
|
8
|
+
end
|
9
|
+
|
10
|
+
describe '#add' do
|
11
|
+
it 'should add the item' do
|
12
|
+
trie.new.tap do |t|
|
13
|
+
t.add('foo', 'bar')
|
14
|
+
check_trie(t.root, 'foo', 'bar')
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
it 'should hash the key with md5 if asked' do
|
19
|
+
trie.new(nil, :md5).tap do |t|
|
20
|
+
t.add('foo', 'bar')
|
21
|
+
check_trie(
|
22
|
+
t.root, Digest::MD5.hexdigest('foo'), 'bar'
|
23
|
+
)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
it 'should hash the key with sha1 if asked' do
|
28
|
+
trie.new(nil, :sha1).tap do |t|
|
29
|
+
t.add('foo', 'bar')
|
30
|
+
check_trie(
|
31
|
+
t.root, Digest::SHA1.hexdigest('foo'), 'bar'
|
32
|
+
)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
describe '#find' do
|
38
|
+
it 'should be able to find the item' do
|
39
|
+
trie.new.tap do |t|
|
40
|
+
t.add('foo', 'bar')
|
41
|
+
expect(t.find('foo')).to eq('bar')
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
it 'should be able to find the item using the md5 hash mode' do
|
46
|
+
trie.new(nil, :md5).tap do |t|
|
47
|
+
t.add('foo', 'bar')
|
48
|
+
expect(t.find('foo')).to eq('bar')
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
it 'should be able to find the item using the sha1 hash mode' do
|
53
|
+
trie.new(nil, :sha1).tap do |t|
|
54
|
+
t.add('foo', 'bar')
|
55
|
+
expect(t.find('foo')).to eq('bar')
|
56
|
+
end
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
data/trie-file.gemspec
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
$:.unshift File.join(File.dirname(__FILE__), 'lib')
|
2
|
+
require 'trie-file/version'
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.name = "trie-file"
|
6
|
+
s.version = ::TrieFile::VERSION
|
7
|
+
s.authors = ["Cameron Dutro"]
|
8
|
+
s.email = ["camertron@gmail.com"]
|
9
|
+
s.homepage = "http://github.com/camertron"
|
10
|
+
|
11
|
+
s.description = s.summary = "Memory-efficient cached trie and trie storage."
|
12
|
+
|
13
|
+
s.platform = Gem::Platform::RUBY
|
14
|
+
s.has_rdoc = true
|
15
|
+
|
16
|
+
s.require_path = 'lib'
|
17
|
+
s.files = Dir["{lib,spec}/**/*", "Gemfile", "History.txt", "README.md", "Rakefile", "trie-file.gemspec"]
|
18
|
+
end
|
metadata
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: trie-file
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Cameron Dutro
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-07-28 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Memory-efficient cached trie and trie storage.
|
14
|
+
email:
|
15
|
+
- camertron@gmail.com
|
16
|
+
executables: []
|
17
|
+
extensions: []
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- Gemfile
|
21
|
+
- History.txt
|
22
|
+
- README.md
|
23
|
+
- Rakefile
|
24
|
+
- lib/trie-file.rb
|
25
|
+
- lib/trie-file/file.rb
|
26
|
+
- lib/trie-file/node.rb
|
27
|
+
- lib/trie-file/trie-file.rb
|
28
|
+
- lib/trie-file/trie.rb
|
29
|
+
- lib/trie-file/version.rb
|
30
|
+
- spec/file_spec.rb
|
31
|
+
- spec/node_spec.rb
|
32
|
+
- spec/spec_helper.rb
|
33
|
+
- spec/trie_spec.rb
|
34
|
+
- trie-file.gemspec
|
35
|
+
homepage: http://github.com/camertron
|
36
|
+
licenses: []
|
37
|
+
metadata: {}
|
38
|
+
post_install_message:
|
39
|
+
rdoc_options: []
|
40
|
+
require_paths:
|
41
|
+
- lib
|
42
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - '>='
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
48
|
+
requirements:
|
49
|
+
- - '>='
|
50
|
+
- !ruby/object:Gem::Version
|
51
|
+
version: '0'
|
52
|
+
requirements: []
|
53
|
+
rubyforge_project:
|
54
|
+
rubygems_version: 2.2.2
|
55
|
+
signing_key:
|
56
|
+
specification_version: 4
|
57
|
+
summary: Memory-efficient cached trie and trie storage.
|
58
|
+
test_files: []
|