bk 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +8 -0
- data/README.md +77 -0
- data/lib/bk.rb +75 -0
- data/lib/bk/dot_graph.rb +23 -0
- data/lib/bk/dump.rb +23 -0
- data/lib/bk/version.rb +3 -0
- data/samples/graph.rb +11 -0
- data/samples/performance.rb +63 -0
- data/test/test_all.rb +3 -0
- data/test/test_building_tree.rb +66 -0
- data/test/test_import_and_export.rb +33 -0
- data/test/test_querying_tree.rb +84 -0
- metadata +82 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
# BK-Tree implementation in Ruby
|
2
|
+
|
3
|
+
If you don’t know what a BK-tree is, these links should provide a good explanation and introduction.
|
4
|
+
|
5
|
+
* [Damn Cool Algorithms, Part 1: BK-Trees](http://blog.notdot.net/2007/4/Damn-Cool-Algorithms-Part-1-BK-Trees)
|
6
|
+
* [Fast Approximate String Matching in a Dictionary](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3317)
|
7
|
+
|
8
|
+
## Usage
|
9
|
+
|
10
|
+
require "bk"
|
11
|
+
tree = BK::Tree.new # Use the default Levenshtein distance algorithm
|
12
|
+
|
13
|
+
Add items to the tree:
|
14
|
+
|
15
|
+
tree.add "cat"
|
16
|
+
tree.add "dog"
|
17
|
+
tree.add "monkey"
|
18
|
+
tree.add "donkey"
|
19
|
+
|
20
|
+
Find all items within distance 1 of ‘munkey’:
|
21
|
+
|
22
|
+
tree.query("munkey", 1)
|
23
|
+
# => {"monkey"=>1}
|
24
|
+
|
25
|
+
Find all items within distance 2 of ‘munkey’:
|
26
|
+
|
27
|
+
tree.query("munkey", 2)
|
28
|
+
# => {"donkey"=>2, "monkey"=>1}
|
29
|
+
|
30
|
+
You can specify a custom distance algorithm by passing an object that responds
|
31
|
+
to `call(a, b)` with a number:
|
32
|
+
|
33
|
+
custom_algorithm = lambda{ |a, b|
|
34
|
+
Text::Levenshtein.distance(a, b)
|
35
|
+
}
|
36
|
+
|
37
|
+
tree = BK::Tree.new(custom_algorithm)
|
38
|
+
|
39
|
+
Note that the result *must* satisfy the
|
40
|
+
_triangle inequality_, i.e. _d(x,z) ≤ d(x,y) + d(y,z)_.
|
41
|
+
|
42
|
+
The precomputed tree can be exported to an IO-like object and later reimported from one:
|
43
|
+
|
44
|
+
File.open("tree", "wb") do |f|
|
45
|
+
tree.export(f)
|
46
|
+
end
|
47
|
+
|
48
|
+
File.open("tree", "rb") do |f|
|
49
|
+
tree = BK::Tree.import(f)
|
50
|
+
end
|
51
|
+
|
52
|
+
## Dependencies
|
53
|
+
|
54
|
+
* [text](http://rubygems.org/gems/text) version 0.2.0 or newer.
|
55
|
+
|
56
|
+
## Performance
|
57
|
+
|
58
|
+
Results of looking for words within distance 1 of ‘alien’ in a 20,000-word dictionary:
|
59
|
+
|
60
|
+
Loading 20000 words from dictionary ... 0.273s
|
61
|
+
Building tree ... 57.331s
|
62
|
+
Linear scan to find expected terms ... 5.711s
|
63
|
+
Query tree ... 0.133s
|
64
|
+
2.1% of tree was queried
|
65
|
+
|
66
|
+
This means that the BK-tree is about 40 times as fast as a linear search,
|
67
|
+
although building the initial tree took 10 times as long as a linear search.
|
68
|
+
|
69
|
+
As the threshold increases, the benefit is reduced. At threshold 3:
|
70
|
+
|
71
|
+
Query tree ... 3.368s
|
72
|
+
62.9% of tree was queried
|
73
|
+
|
74
|
+
## Limitations
|
75
|
+
|
76
|
+
* Memory usage: around 6 MB for a 20,000-word tree.
|
77
|
+
* Maximum tree depth is limited by the stack.
|
data/lib/bk.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'text/levenshtein'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
module BK
  # Paul Battley 2007
  # See http://blog.notdot.net/archives/30-Damn-Cool-Algorithms,-Part-1-BK-Trees.html
  # and http://www.dcc.uchile.cl/~gnavarro/ps/spire98.2.ps.gz

  # Default distance strategy: delegates to Text::Levenshtein.distance.
  class LevenshteinDistancer
    # Returns whatever Text::Levenshtein.distance reports for +a+ and +b+.
    def call(a, b)
      Text::Levenshtein.distance(a, b)
    end
  end

  # A single term in the tree. Children are stored in a hash keyed by their
  # distance from this node's term, so there is at most one direct child per
  # distance value; further terms at the same distance go below that child.
  class Node
    attr_reader :term, :children

    # term      - the value stored at this node.
    # distancer - object responding to call(a, b) that returns a number.
    def initialize(term, distancer)
      @term = term
      @children = {}
      @distancer = distancer
    end

    # Inserts +term+ somewhere below this node: it is handed to the child
    # already registered at the same distance, or becomes that child when the
    # slot is free.
    def add(term)
      score = distance(term)
      child = children[score]
      if child
        child.add(term)
      else
        children[score] = Node.new(term, @distancer)
      end
    end

    # Records this node's term in +collected+ (term => distance) when it lies
    # within +threshold+ of +term+, then descends into every child whose key
    # lies in [distance - threshold, distance + threshold].
    #
    # The candidate children are picked by comparing the stored keys against
    # the bounds instead of enumerating an integer range, so distancers that
    # return non-integer numbers work as well. Keys are visited in ascending
    # order, which is the order an ascending integer range would produce.
    def query(term, threshold, collected)
      distance_at_node = distance(term)
      collected[self.term] = distance_at_node if distance_at_node <= threshold

      lower = distance_at_node - threshold
      upper = distance_at_node + threshold
      in_range = children.keys.select { |score| score >= lower && score <= upper }
      in_range.sort.each do |score|
        children[score].query(term, threshold, collected)
      end
    end

    # Distance between +term+ and this node's own term, per the distancer.
    def distance(term)
      @distancer.call(term, self.term)
    end
  end

  # Public entry point: owns the root node and the distance strategy.
  class Tree
    # distancer - object responding to call(a, b) that returns a number;
    #             defaults to LevenshteinDistancer.
    def initialize(distancer = LevenshteinDistancer.new)
      @root = nil
      @distancer = distancer
    end

    # Adds +term+ to the tree; the first term added becomes the root.
    def add(term)
      if @root
        @root.add(term)
      else
        @root = Node.new(term, @distancer)
      end
    end

    # Returns a hash of term => distance for every stored term within
    # +threshold+ of +term+. An empty tree yields an empty hash.
    def query(term, threshold)
      collected = {}
      @root.query(term, threshold, collected) if @root
      collected
    end

    # Writes a YAML serialisation of the whole tree to +stream+.
    def export(stream)
      stream.write(YAML.dump(self))
    end

    # Rebuilds a tree from YAML previously written by #export.
    #
    # SECURITY: this deserialises arbitrary Ruby objects, so only feed it
    # streams you trust. Where YAML.unsafe_load exists, YAML.load may refuse
    # to instantiate custom classes, hence the explicit unsafe_load; the
    # fallback keeps older YAML libraries working.
    def self.import(stream)
      data = stream.read
      if YAML.respond_to?(:unsafe_load)
        YAML.unsafe_load(data)
      else
        YAML.load(data)
      end
    end
  end
end
|
data/lib/bk/dot_graph.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'bk'
|
2
|
+
|
3
|
+
module BK
  # Mixin that renders a node and its descendants as Graphviz DOT statements.
  # The including class must provide #term and #children (a hash of
  # distance => child).
  module DotGraphable
    # Returns the DOT statements for this node followed by, for each child in
    # ascending distance order, the child's own statements and the labelled
    # edge leading to it.
    def graph
      name = dot_quote(term)
      out = "#{name} [label = #{name}]\n"
      children.sort_by { |distance, _child| distance }.each do |distance, child|
        out << child.graph
        out << "edge [label = \"#{distance}\"]\n#{name} -> #{dot_quote(child.term)}\n"
      end
      out
    end

    private

    # Wraps +value+ in double quotes, backslash-escaping embedded double
    # quotes so a term containing '"' cannot terminate the DOT string early.
    def dot_quote(value)
      escaped = value.to_s.gsub('"') { '\\"' }
      "\"#{escaped}\""
    end
  end

  class Node
    include DotGraphable
  end

  class Tree
    # Returns the whole tree as a DOT digraph. A tree without a root yields
    # an empty graph body.
    def dot_graph
      body = @root ? @root.graph : ""
      ["digraph G {", body, "}"].join("\n")
    end
  end
end
|
data/lib/bk/dump.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'bk'
|
2
|
+
|
3
|
+
module BK
  # Mixin that converts a node and its descendants into nested plain Ruby
  # data. The including class must provide #term and #children (a hash of
  # score => child).
  module Dumpable
    # A leaf dumps as [term]; a node with children dumps as
    # [term, { score => child_dump, ... }].
    def dump
      return [term] if children.empty?

      nested = {}
      children.each do |score, child|
        nested[score] = child.dump
      end
      [term, nested]
    end
  end

  class Node
    include Dumpable
  end

  class Tree
    # Dump of the root node, or an empty array when nothing has been added.
    def dump
      return [] unless @root

      @root.dump
    end
  end
end
|
data/lib/bk/version.rb
ADDED
data/samples/graph.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'bk'
|
3
|
+
require 'bk/dot_graph'
|
4
|
+
|
5
|
+
tree = BK::Tree.new
|
6
|
+
$stdin.each_with_index do |line, i|
|
7
|
+
tree.add(line.strip)
|
8
|
+
File.open('bk-%04d.dot' % i, 'w') do |io|
|
9
|
+
io << tree.dot_graph
|
10
|
+
end
|
11
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'bk'
|
3
|
+
|
4
|
+
class CountingLevenshteinDistancer < BK::LevenshteinDistancer
|
5
|
+
attr_reader :count
|
6
|
+
|
7
|
+
def initialize
|
8
|
+
@count = 0
|
9
|
+
@counting = false
|
10
|
+
end
|
11
|
+
|
12
|
+
def call(a, b)
|
13
|
+
@count += 1 if @counting
|
14
|
+
super
|
15
|
+
end
|
16
|
+
|
17
|
+
def start_counting
|
18
|
+
@counting = true
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# Prints "<message> ... ", runs the given block, prints the elapsed
# wall-clock time in seconds, and returns the block's result.
def time(message)
  started_at = Time.now
  $stdout.print("#{message} ... ")
  $stdout.flush
  result = yield
  elapsed = Time.now - started_at
  puts format("%0.3fs", elapsed)
  result
end
|
30
|
+
|
31
|
+
search_term = 'alien'
|
32
|
+
threshold = 1
|
33
|
+
distancer = CountingLevenshteinDistancer.new
|
34
|
+
|
35
|
+
terms = time('Loading 10 K words from dictionary'){
|
36
|
+
File.read('/usr/share/dict/words').scan(/\w+/)[0, 10000]
|
37
|
+
}
|
38
|
+
|
39
|
+
tree = time('Building tree'){
|
40
|
+
tree = BK::Tree.new(distancer)
|
41
|
+
terms.each do |term|
|
42
|
+
tree.add(term)
|
43
|
+
end
|
44
|
+
tree
|
45
|
+
}
|
46
|
+
|
47
|
+
expected = time('Linear scan to find expected terms'){
|
48
|
+
terms.inject({}){ |acc, t|
|
49
|
+
d = Text::Levenshtein.distance(t, search_term)
|
50
|
+
acc[t] = d if d <= threshold
|
51
|
+
acc
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
distancer.start_counting
|
56
|
+
|
57
|
+
actual = time('Query tree'){
|
58
|
+
tree.query(search_term, threshold)
|
59
|
+
}
|
60
|
+
|
61
|
+
raise 'Results of linear and tree scan differ' unless expected == actual
|
62
|
+
|
63
|
+
puts '%0.1f%% of tree was queried' % [(distancer.count * 100.0) / terms.length]
|
data/test/test_all.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'test/unit'
|
3
|
+
require 'bk'
|
4
|
+
require 'bk/dump'
|
5
|
+
|
6
|
+
class BKTreeBuildingWhiteBoxTest < Test::Unit::TestCase
|
7
|
+
attr_reader :tree
|
8
|
+
|
9
|
+
def setup
|
10
|
+
@tree = BK::Tree.new
|
11
|
+
end
|
12
|
+
|
13
|
+
def test_should_build_root
|
14
|
+
tree.add('book')
|
15
|
+
assert_equal ['book'], tree.dump
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_should_add_one_term
|
19
|
+
tree.add('book')
|
20
|
+
tree.add('rook')
|
21
|
+
assert_equal(
|
22
|
+
[ 'book', {
|
23
|
+
1 => [ 'rook' ]}],
|
24
|
+
tree.dump
|
25
|
+
)
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_should_add_second_term
|
29
|
+
%w[ book rook nooks ].each do |word|
|
30
|
+
tree.add(word)
|
31
|
+
end
|
32
|
+
assert_equal(
|
33
|
+
[ 'book', {
|
34
|
+
1 => [ 'rook' ],
|
35
|
+
2 => [ 'nooks' ]}],
|
36
|
+
tree.dump
|
37
|
+
)
|
38
|
+
end
|
39
|
+
|
40
|
+
def test_should_add_third_term
|
41
|
+
%w[ book rook nooks boon ].each do |word|
|
42
|
+
tree.add(word)
|
43
|
+
end
|
44
|
+
assert_equal(
|
45
|
+
[ 'book', {
|
46
|
+
1 => [ 'rook', {
|
47
|
+
2 => [ 'boon' ]}],
|
48
|
+
2 => [ 'nooks' ]}],
|
49
|
+
tree.dump
|
50
|
+
)
|
51
|
+
end
|
52
|
+
|
53
|
+
def test_should_add_fourth_term
|
54
|
+
%w[ book rook nooks boon boot ].each do |word|
|
55
|
+
tree.add(word)
|
56
|
+
end
|
57
|
+
assert_equal(
|
58
|
+
[ 'book', {
|
59
|
+
1 => [ 'rook', {
|
60
|
+
2 => [ 'boon', {
|
61
|
+
1 => [ 'boot' ]}]}],
|
62
|
+
2 => [ 'nooks' ]}],
|
63
|
+
tree.dump
|
64
|
+
)
|
65
|
+
end
|
66
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'test/unit'
|
3
|
+
require 'bk'
|
4
|
+
require 'stringio'
|
5
|
+
|
6
|
+
class BKTreeImportAndExportTest < Test::Unit::TestCase
|
7
|
+
def test_should_give_correct_results_after_exporting_and_reimporting
|
8
|
+
tree = BK::Tree.new
|
9
|
+
terms = %w[
|
10
|
+
lorem ipsum dolor sit amet consectetuer adipiscing elit donec eget lectus vivamus nec
|
11
|
+
odio non ipsum adipiscing ornare etiam sapien
|
12
|
+
].uniq
|
13
|
+
terms.each do |term|
|
14
|
+
tree.add(term)
|
15
|
+
end
|
16
|
+
|
17
|
+
stream = StringIO.new
|
18
|
+
tree.export(stream)
|
19
|
+
|
20
|
+
stream.rewind
|
21
|
+
tree = BK::Tree.import(stream)
|
22
|
+
|
23
|
+
search_term = 'sapient'
|
24
|
+
threshold = 1
|
25
|
+
expected = terms.inject({}){ |acc, t|
|
26
|
+
d = Text::Levenshtein.distance(t, search_term)
|
27
|
+
acc[t] = d if d <= threshold
|
28
|
+
acc
|
29
|
+
}
|
30
|
+
assert expected.any?
|
31
|
+
assert_equal expected, tree.query(search_term, threshold)
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'test/unit'
|
3
|
+
require 'bk'
|
4
|
+
|
5
|
+
class BKTreeQueryAccuracyTest < Test::Unit::TestCase
|
6
|
+
def test_should_match_the_results_of_a_linear_scan
|
7
|
+
tree = BK::Tree.new
|
8
|
+
terms = %w[
|
9
|
+
lorem ipsum dolor sit amet consectetuer adipiscing elit donec eget lectus vivamus nec
|
10
|
+
odio non ipsum adipiscing ornare etiam sapien
|
11
|
+
].uniq
|
12
|
+
terms.each do |term|
|
13
|
+
tree.add(term)
|
14
|
+
end
|
15
|
+
|
16
|
+
search_term = 'sapient'
|
17
|
+
threshold = 1
|
18
|
+
expected = terms.inject({}){ |acc, t|
|
19
|
+
d = Text::Levenshtein.distance(t, search_term)
|
20
|
+
acc[t] = d if d <= threshold
|
21
|
+
acc
|
22
|
+
}
|
23
|
+
assert expected.any?
|
24
|
+
assert_equal expected, tree.query(search_term, threshold)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
class BKTreeSearchSpaceTest < Test::Unit::TestCase
|
29
|
+
|
30
|
+
# Distancer that, once recording has been switched on, appends every
# [a, b] pair it is asked to compare to #history before delegating to the
# superclass implementation.
class RecordingLevenshteinDistancer < BK::LevenshteinDistancer
  attr_reader :history

  def initialize
    @history = []
    # Initialise the flag that #call and #start_recording actually use.
    @recording = false
  end

  # Logs the pair when recording is on, then returns the superclass result.
  def call(a, b)
    @history << [a, b] if @recording
    super
  end

  # Turns recording on; there is no way to turn it off again.
  def start_recording
    @recording = true
  end
end
|
47
|
+
|
48
|
+
def test_should_compare_only_necessary_nodes
|
49
|
+
tree = BK::Tree.new
|
50
|
+
terms = %w[
|
51
|
+
infighting
|
52
|
+
birded
|
53
|
+
inebriation
|
54
|
+
stargazers
|
55
|
+
troika
|
56
|
+
bostonians
|
57
|
+
contemplating
|
58
|
+
gamey
|
59
|
+
skydove
|
60
|
+
scandalously
|
61
|
+
archaeological
|
62
|
+
soundness
|
63
|
+
tightwads
|
64
|
+
wanderlust
|
65
|
+
]
|
66
|
+
distancer = RecordingLevenshteinDistancer.new
|
67
|
+
tree = BK::Tree.new(distancer)
|
68
|
+
terms.each do |term|
|
69
|
+
tree.add(term)
|
70
|
+
end
|
71
|
+
distancer.start_recording
|
72
|
+
tree.query('game', 1)
|
73
|
+
expected = [
|
74
|
+
%w[ game infighting ],
|
75
|
+
%w[ game contemplating],
|
76
|
+
%w[ game birded],
|
77
|
+
%w[ game gamey ],
|
78
|
+
%w[ game troika ],
|
79
|
+
%w[ game skydove],
|
80
|
+
%w[ game soundness ]
|
81
|
+
]
|
82
|
+
assert_equal expected, distancer.history
|
83
|
+
end
|
84
|
+
end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bk
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Paul Battley
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-13 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: text
|
16
|
+
requirement: &70195843345880 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70195843345880
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: test-unit
|
27
|
+
requirement: &70195843345460 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70195843345460
|
36
|
+
description: Burkhard Keller Tree implementation in Ruby
|
37
|
+
email: pbattley@gmail.com
|
38
|
+
executables: []
|
39
|
+
extensions: []
|
40
|
+
extra_rdoc_files: []
|
41
|
+
files:
|
42
|
+
- README.md
|
43
|
+
- Gemfile
|
44
|
+
- lib/bk/dot_graph.rb
|
45
|
+
- lib/bk/dump.rb
|
46
|
+
- lib/bk/version.rb
|
47
|
+
- lib/bk.rb
|
48
|
+
- samples/graph.rb
|
49
|
+
- samples/performance.rb
|
50
|
+
- test/test_all.rb
|
51
|
+
- test/test_building_tree.rb
|
52
|
+
- test/test_import_and_export.rb
|
53
|
+
- test/test_querying_tree.rb
|
54
|
+
homepage: https://github.com/threedaymonk/bktree
|
55
|
+
licenses: []
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options: []
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements: []
|
73
|
+
rubyforge_project:
|
74
|
+
rubygems_version: 1.8.10
|
75
|
+
signing_key:
|
76
|
+
specification_version: 3
|
77
|
+
summary: Burkhard Keller Tree implementation in Ruby
|
78
|
+
test_files:
|
79
|
+
- test/test_all.rb
|
80
|
+
- test/test_building_tree.rb
|
81
|
+
- test/test_import_and_export.rb
|
82
|
+
- test/test_querying_tree.rb
|