bk 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +8 -0
- data/README.md +77 -0
- data/lib/bk.rb +75 -0
- data/lib/bk/dot_graph.rb +23 -0
- data/lib/bk/dump.rb +23 -0
- data/lib/bk/version.rb +3 -0
- data/samples/graph.rb +11 -0
- data/samples/performance.rb +63 -0
- data/test/test_all.rb +3 -0
- data/test/test_building_tree.rb +66 -0
- data/test/test_import_and_export.rb +33 -0
- data/test/test_querying_tree.rb +84 -0
- metadata +82 -0
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
# BK-Tree implementation in Ruby
|
2
|
+
|
3
|
+
If you don’t know what a BK-tree is, these links should provide a good explanation and introduction.
|
4
|
+
|
5
|
+
* [Damn Cool Algorithms, Part 1: BK-Trees](http://blog.notdot.net/2007/4/Damn-Cool-Algorithms-Part-1-BK-Trees)
|
6
|
+
* [Fast Approximate String Matching in a Dictionary](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3317)
|
7
|
+
|
8
|
+
## Usage
|
9
|
+
|
10
|
+
require "bk"
|
11
|
+
tree = BK::Tree.new # Use the default Levenshtein distance algorithm
|
12
|
+
|
13
|
+
Add items to the tree:
|
14
|
+
|
15
|
+
tree.add "cat"
|
16
|
+
tree.add "dog"
|
17
|
+
tree.add "monkey"
|
18
|
+
tree.add "donkey"
|
19
|
+
|
20
|
+
Find all items within distance 1 of ‘munkey’:
|
21
|
+
|
22
|
+
tree.query("munkey", 1)
|
23
|
+
# => {"monkey"=>1}
|
24
|
+
|
25
|
+
Find all items within distance 2 of ‘munkey’:
|
26
|
+
|
27
|
+
tree.query("munkey", 2)
|
28
|
+
# => {"donkey"=>2, "monkey"=>1}
|
29
|
+
|
30
|
+
You can specify a custom distance algorithm by passing an object that responds
|
31
|
+
to `call(a, b)` with a number:
|
32
|
+
|
33
|
+
custom_algorithm = lambda{ |a, b|
|
34
|
+
Text::Levenshtein.distance(a, b)
|
35
|
+
}
|
36
|
+
|
37
|
+
tree = BK::Tree.new(custom_algorithm)
|
38
|
+
|
39
|
+
Note that the distance returned by the algorithm *must* satisfy the
|
40
|
+
_triangle inequality_, i.e. _d(x,z) ≤ d(x,y) + d(y,z)_.
|
41
|
+
|
42
|
+
The precomputed tree can be exported to an IO-like object and reimported from it later:
|
43
|
+
|
44
|
+
File.open("tree", "wb") do |f|
|
45
|
+
tree.export(f)
|
46
|
+
end
|
47
|
+
|
48
|
+
File.open("tree", "rb") do |f|
|
49
|
+
tree = BK::Tree.import(f)
|
50
|
+
end
|
51
|
+
|
52
|
+
## Dependencies
|
53
|
+
|
54
|
+
* [text](http://rubygems.org/gems/text) version 0.2.0 or newer.
|
55
|
+
|
56
|
+
## Performance
|
57
|
+
|
58
|
+
Results of looking for words within distance 1 of ‘alien’ in a 20,000-word dictionary:
|
59
|
+
|
60
|
+
Loading 20000 words from dictionary ... 0.273s
|
61
|
+
Building tree ... 57.331s
|
62
|
+
Linear scan to find expected terms ... 5.711s
|
63
|
+
Query tree ... 0.133s
|
64
|
+
2.1% of tree was queried
|
65
|
+
|
66
|
+
This means that the BK-tree is about 40 times as fast as a linear search,
|
67
|
+
although building the initial tree took 10 times as long as a linear search.
|
68
|
+
|
69
|
+
As the threshold increases, the benefit is reduced. At threshold 3:
|
70
|
+
|
71
|
+
Query tree ... 3.368s
|
72
|
+
62.9% of tree was queried
|
73
|
+
|
74
|
+
## Limitations
|
75
|
+
|
76
|
+
* Memory usage: around 6 MB for a 20,000-word tree.
|
77
|
+
* Maximum tree depth is limited by the stack, because adding and querying terms recurse through the tree.
|
data/lib/bk.rb
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'text/levenshtein'
|
2
|
+
require 'yaml'
|
3
|
+
|
4
|
+
module BK
  # Paul Battley 2007
  # See http://blog.notdot.net/archives/30-Damn-Cool-Algorithms,-Part-1-BK-Trees.html
  # and http://www.dcc.uchile.cl/~gnavarro/ps/spire98.2.ps.gz

  # Default distance callable used by Tree: delegates to
  # Text::Levenshtein.distance.
  class LevenshteinDistancer
    # @param a first term
    # @param b second term
    # @return whatever Text::Levenshtein.distance(a, b) returns
    def call(a, b)
      Text::Levenshtein.distance(a, b)
    end
  end

  # One node of the BK-tree: a term plus its children, keyed by each
  # child's distance from this node's term.
  class Node
    attr_reader :term, :children

    # @param term the term stored at this node
    # @param distancer an object responding to call(a, b) with a number
    def initialize(term, distancer)
      @term = term
      @children = {}
      @distancer = distancer
    end

    # Inserts +term+ below this node. The term's distance from this node
    # selects the child slot; an occupied slot delegates the insertion to
    # that child, so the call recurses one level per tree level.
    # A term at distance 0 is stored like any other, under key 0.
    def add(term)
      score = distance(term)
      if (child = children[score])
        child.add(term)
      else
        children[score] = Node.new(term, @distancer)
      end
    end

    # Records in +collected+ every term of this subtree whose distance from
    # +term+ is at most +threshold+ (term => distance).
    #
    # Only children whose key lies within +threshold+ of this node's own
    # distance are visited. The existing keys are filtered instead of
    # enumerating a numeric range, so non-integer distances work and a large
    # threshold does not iterate over scores that have no child. Children are
    # visited in ascending key order.
    #
    # @return [Hash] +collected+
    def query(term, threshold, collected)
      distance_at_node = distance(term)
      collected[self.term] = distance_at_node if distance_at_node <= threshold

      lowest = distance_at_node - threshold
      highest = distance_at_node + threshold
      reachable = children.keys.select { |score| score >= lowest && score <= highest }
      reachable.sort.each do |score|
        children[score].query(term, threshold, collected)
      end
      collected
    end

    # Distance between +term+ and this node's term, as reported by the
    # distancer (called with the argument first, the node's term second).
    def distance(term)
      @distancer.call(term, self.term)
    end
  end

  # BK-tree of terms under a pluggable distance callable.
  class Tree
    # @param distancer an object responding to call(a, b) with a number;
    #   defaults to a LevenshteinDistancer
    def initialize(distancer = LevenshteinDistancer.new)
      @root = nil
      @distancer = distancer
    end

    # Adds +term+ to the tree; the first term added becomes the root.
    def add(term)
      if @root
        @root.add(term)
      else
        @root = Node.new(term, @distancer)
      end
    end

    # Finds all stored terms within +threshold+ of +term+.
    #
    # @return [Hash] term => distance; empty when nothing has been added yet
    def query(term, threshold)
      collected = {}
      @root.query(term, threshold, collected) if @root
      collected
    end

    # Writes a YAML serialisation of the whole tree (distancer included)
    # to +stream+.
    def export(stream)
      stream.write(YAML.dump(self))
    end

    # Rebuilds a tree from YAML previously written by #export.
    #
    # SECURITY: this restores arbitrary Ruby objects from the YAML, so only
    # pass streams from a trusted source.
    #
    # Psych 4 made YAML.load refuse custom classes; unsafe_load keeps the
    # original behaviour there, while older versions still go through load.
    def self.import(stream)
      data = stream.read
      YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(data) : YAML.load(data)
    end
  end
end
|
data/lib/bk/dot_graph.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'bk'
|
2
|
+
|
3
|
+
module BK
  # Mixin that renders a node and its subtree as Graphviz DOT statements.
  # The including object must respond to +term+ and +children+
  # (a hash of distance => child, where each child also responds to +graph+).
  module DotGraphable
    # @return [String] DOT statements for this node followed by, for each
    #   child in ascending distance order, the child's own statements and a
    #   labelled edge from this node to the child
    def graph
      own_id = dot_quote(term)
      subtree = children.sort_by { |distance, _child| distance }.map do |distance, child|
        child.graph +
          %{edge [label = "#{distance}"]\n#{own_id} -> #{dot_quote(child.term)}\n}
      end
      %{#{own_id} [label = #{own_id}]\n} + subtree.join
    end

    private

    # Wraps +value+ in double quotes for DOT, escaping embedded double
    # quotes so that such terms do not terminate the quoted ID early.
    def dot_quote(value)
      escaped = value.to_s.gsub('"') { '\\"' }
      %("#{escaped}")
    end
  end

  class Node
    include DotGraphable
  end

  class Tree
    # @return [String] a complete "digraph G" document for the tree; an
    #   empty tree yields a digraph with no statements instead of raising
    def dot_graph
      ['digraph G {', (@root.graph if @root), '}'].compact.join("\n")
    end
  end
end
|
data/lib/bk/dump.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'bk'
|
2
|
+
|
3
|
+
module BK
  # Mixin that converts a node and its subtree into plain nested arrays and
  # hashes. The including object must respond to +term+ and +children+
  # (a hash of score => child, where each child also responds to +dump+).
  module Dumpable
    # @return [Array] [term] for a node without children, otherwise
    #   [term, { score => dumped child, ... }]
    def dump
      return [term] if children.empty?

      [term, children.each_with_object({}) { |(score, child), dumped| dumped[score] = child.dump }]
    end
  end

  class Node
    include Dumpable
  end

  class Tree
    # @return [Array] the dump of the root node, or [] when no root is set
    def dump
      @root ? @root.dump : []
    end
  end
end
|
data/lib/bk/version.rb
ADDED
data/samples/graph.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'bk'
|
3
|
+
require 'bk/dot_graph'
|
4
|
+
|
5
|
+
# Reads terms from standard input, one per line, adding each to a single tree.
# After every insertion the current tree is written as DOT to a numbered
# file (bk-0000.dot, bk-0001.dot, ...), giving one snapshot per term.
tree = BK::Tree.new
$stdin.each_line.with_index do |line, index|
  tree.add(line.strip)
  snapshot_name = 'bk-%04d.dot' % index
  File.open(snapshot_name, 'w') { |io| io << tree.dot_graph }
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'bk'
|
3
|
+
|
4
|
+
# Levenshtein distancer that counts how many times it is called, but only
# after counting has been switched on with #start_counting.
class CountingLevenshteinDistancer < BK::LevenshteinDistancer
  attr_reader :count

  def initialize
    @counting = false
    @count = 0
  end

  # Switches counting on; calls made before this are not counted.
  def start_counting
    @counting = true
  end

  # Counts the call when counting is enabled, then defers to the superclass
  # for the actual distance.
  def call(a, b)
    if @counting
      @count += 1
    end
    super
  end
end
|
21
|
+
|
22
|
+
# Runs the given block, printing "<message> ... " before it starts and the
# elapsed wall-clock time in seconds (three decimals) once it finishes.
#
# @param message label printed before the timing
# @return the block's return value
def time(message)
  started_at = Time.now
  print "#{message} ... "
  $stdout.flush # show the label before the (possibly slow) block runs
  result = yield
  elapsed = Time.now - started_at
  puts "%0.3fs" % [elapsed]
  result
end
|
30
|
+
|
31
|
+
# Benchmark: compares a BK-tree query against a linear scan over the first
# 10,000 words of the system dictionary, then reports what share of the
# tree's terms had to be compared during the query.
search_term = 'alien'
threshold = 1
distancer = CountingLevenshteinDistancer.new

terms = time('Loading 10 K words from dictionary') do
  File.read('/usr/share/dict/words').scan(/\w+/).first(10000)
end

tree = time('Building tree') do
  built = BK::Tree.new(distancer)
  terms.each { |term| built.add(term) }
  built
end

expected = time('Linear scan to find expected terms') do
  matches = {}
  terms.each do |candidate|
    d = Text::Levenshtein.distance(candidate, search_term)
    matches[candidate] = d if d <= threshold
  end
  matches
end

# Only distance computations made by the query itself should be counted.
distancer.start_counting

actual = time('Query tree') do
  tree.query(search_term, threshold)
end

if expected != actual
  raise 'Results of linear and tree scan differ'
end

queried_share = (distancer.count * 100.0) / terms.length
puts '%0.1f%% of tree was queried' % [queried_share]
|
data/test/test_all.rb
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'test/unit'
|
3
|
+
require 'bk'
|
4
|
+
require 'bk/dump'
|
5
|
+
|
6
|
+
# White-box tests: check the exact shape of the tree (via Tree#dump) as
# terms are added one after another.
class BKTreeBuildingWhiteBoxTest < Test::Unit::TestCase
  attr_reader :tree

  def setup
    @tree = BK::Tree.new
  end

  def test_should_build_root
    add_terms('book')
    assert_equal ['book'], tree.dump
  end

  def test_should_add_one_term
    add_terms('book', 'rook')
    assert_equal ['book', { 1 => ['rook'] }], tree.dump
  end

  def test_should_add_second_term
    add_terms('book', 'rook', 'nooks')
    expected = ['book', { 1 => ['rook'], 2 => ['nooks'] }]
    assert_equal expected, tree.dump
  end

  def test_should_add_third_term
    add_terms('book', 'rook', 'nooks', 'boon')
    rook_subtree = ['rook', { 2 => ['boon'] }]
    expected = ['book', { 1 => rook_subtree, 2 => ['nooks'] }]
    assert_equal expected, tree.dump
  end

  def test_should_add_fourth_term
    add_terms('book', 'rook', 'nooks', 'boon', 'boot')
    boon_subtree = ['boon', { 1 => ['boot'] }]
    rook_subtree = ['rook', { 2 => boon_subtree }]
    expected = ['book', { 1 => rook_subtree, 2 => ['nooks'] }]
    assert_equal expected, tree.dump
  end

  private

  # Adds the given words to the tree under test, in order.
  def add_terms(*words)
    words.each { |word| tree.add(word) }
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'test/unit'
|
3
|
+
require 'bk'
|
4
|
+
require 'stringio'
|
5
|
+
|
6
|
+
# Checks that a tree exported to a stream and imported again answers a
# query exactly as a linear scan over the same terms does.
class BKTreeImportAndExportTest < Test::Unit::TestCase
  def test_should_give_correct_results_after_exporting_and_reimporting
    terms = %w[
      lorem ipsum dolor sit amet consectetuer adipiscing elit donec eget lectus vivamus nec
      odio non ipsum adipiscing ornare etiam sapien
    ].uniq

    original = BK::Tree.new
    terms.each { |term| original.add(term) }

    stream = StringIO.new
    original.export(stream)
    stream.rewind
    reimported = BK::Tree.import(stream)

    search_term = 'sapient'
    threshold = 1
    expected = {}
    terms.each do |candidate|
      d = Text::Levenshtein.distance(candidate, search_term)
      expected[candidate] = d if d <= threshold
    end

    # Guard against a vacuous comparison of two empty hashes.
    assert expected.any?
    assert_equal expected, reimported.query(search_term, threshold)
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
$:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
|
2
|
+
require 'test/unit'
|
3
|
+
require 'bk'
|
4
|
+
|
5
|
+
# Checks that querying the tree returns exactly what a linear scan over the
# same terms returns.
class BKTreeQueryAccuracyTest < Test::Unit::TestCase
  def test_should_match_the_results_of_a_linear_scan
    terms = %w[
      lorem ipsum dolor sit amet consectetuer adipiscing elit donec eget lectus vivamus nec
      odio non ipsum adipiscing ornare etiam sapien
    ].uniq

    tree = BK::Tree.new
    terms.each { |term| tree.add(term) }

    search_term = 'sapient'
    threshold = 1
    expected = {}
    terms.each do |candidate|
      d = Text::Levenshtein.distance(candidate, search_term)
      expected[candidate] = d if d <= threshold
    end

    # Guard against a vacuous comparison of two empty hashes.
    assert expected.any?
    assert_equal expected, tree.query(search_term, threshold)
  end
end
|
27
|
+
|
28
|
+
# Checks that a query only computes distances for the nodes it actually has
# to visit, and in the expected order.
class BKTreeSearchSpaceTest < Test::Unit::TestCase

  # Levenshtein distancer that records the argument pair of every call made
  # after #start_recording.
  class RecordingLevenshteinDistancer < BK::LevenshteinDistancer
    attr_reader :history

    def initialize
      @history = []
      # Flag read by #call and set by #start_recording; it starts off so
      # that calls made while building the tree are not recorded.
      @recording = false
    end

    def call(a, b)
      @history << [a, b] if @recording
      super
    end

    def start_recording
      @recording = true
    end
  end

  def test_should_compare_only_necessary_nodes
    terms = %w[
      infighting
      birded
      inebriation
      stargazers
      troika
      bostonians
      contemplating
      gamey
      skydove
      scandalously
      archaeological
      soundness
      tightwads
      wanderlust
    ]
    distancer = RecordingLevenshteinDistancer.new
    tree = BK::Tree.new(distancer)
    terms.each do |term|
      tree.add(term)
    end
    # Record only the comparisons made by the query, not by the build.
    distancer.start_recording
    tree.query('game', 1)
    expected = [
      %w[ game infighting ],
      %w[ game contemplating],
      %w[ game birded],
      %w[ game gamey ],
      %w[ game troika ],
      %w[ game skydove],
      %w[ game soundness ]
    ]
    assert_equal expected, distancer.history
  end
end
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bk
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Paul Battley
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-13 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: text
|
16
|
+
requirement: &70195843345880 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70195843345880
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: test-unit
|
27
|
+
requirement: &70195843345460 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70195843345460
|
36
|
+
description: Burkhard Keller Tree implementation in Ruby
|
37
|
+
email: pbattley@gmail.com
|
38
|
+
executables: []
|
39
|
+
extensions: []
|
40
|
+
extra_rdoc_files: []
|
41
|
+
files:
|
42
|
+
- README.md
|
43
|
+
- Gemfile
|
44
|
+
- lib/bk/dot_graph.rb
|
45
|
+
- lib/bk/dump.rb
|
46
|
+
- lib/bk/version.rb
|
47
|
+
- lib/bk.rb
|
48
|
+
- samples/graph.rb
|
49
|
+
- samples/performance.rb
|
50
|
+
- test/test_all.rb
|
51
|
+
- test/test_building_tree.rb
|
52
|
+
- test/test_import_and_export.rb
|
53
|
+
- test/test_querying_tree.rb
|
54
|
+
homepage: https://github.com/threedaymonk/bktree
|
55
|
+
licenses: []
|
56
|
+
post_install_message:
|
57
|
+
rdoc_options: []
|
58
|
+
require_paths:
|
59
|
+
- lib
|
60
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ! '>='
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
version: '0'
|
72
|
+
requirements: []
|
73
|
+
rubyforge_project:
|
74
|
+
rubygems_version: 1.8.10
|
75
|
+
signing_key:
|
76
|
+
specification_version: 3
|
77
|
+
summary: Burkhard Keller Tree implementation in Ruby
|
78
|
+
test_files:
|
79
|
+
- test/test_all.rb
|
80
|
+
- test/test_building_tree.rb
|
81
|
+
- test/test_import_and_export.rb
|
82
|
+
- test/test_querying_tree.rb
|