bk 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
# A sample Gemfile
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Runtime dependency; the README states version 0.2.0 or newer is required.
gem "text", ">= 0.2.0"

group :test do
  gem "test-unit"
end
@@ -0,0 +1,77 @@
1
+ # BK-Tree implementation in Ruby
2
+
3
+ If you don’t know what a BK-tree is, these links should provide a good explanation and introduction.
4
+
5
+ * [Damn Cool Algorithms, Part 1: BK-Trees](http://blog.notdot.net/2007/4/Damn-Cool-Algorithms-Part-1-BK-Trees)
6
+ * [Fast Approximate String Matching in a Dictionary](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3317)
7
+
8
+ ## Usage
9
+
10
+ require "bk"
11
+ tree = BK::Tree.new # Use the default Levenshtein distance algorithm
12
+
13
+ Add items to the tree:
14
+
15
+ tree.add "cat"
16
+ tree.add "dog"
17
+ tree.add "monkey"
18
+ tree.add "donkey"
19
+
20
+ Find all items within distance 1 of ‘munkey’:
21
+
22
+ tree.query("munkey", 1)
23
+ # => {"monkey"=>1}
24
+
25
+ Find all items within distance 2 of ‘munkey’:
26
+
27
+ tree.query("munkey", 2)
28
+ # => {"donkey"=>2, "monkey"=>1}
29
+
30
+ You can specify a custom distance algorithm by passing an object that responds
31
+ to `call(a, b)` with a number:
32
+
33
+ custom_algorithm = lambda{ |a, b|
34
+ Text::Levenshtein.distance(a, b)
35
+ }
36
+
37
+ tree = BK::Tree.new(custom_algorithm)
38
+
39
+ Note that the distance function *must* satisfy the
40
+ _triangle inequality_, i.e. _d(x,z) ≤ d(x,y) + d(y,z)_.
41
+
42
+ The precomputed tree can be exported to an IO-like object and reimported from it later:
43
+
44
+ File.open("tree", "wb") do |f|
45
+ tree.export(f)
46
+ end
47
+
48
+ File.open("tree", "rb") do |f|
49
+ tree = BK::Tree.import(f)
50
+ end
51
+
52
+ ## Dependencies
53
+
54
+ * [text](http://rubygems.org/gems/text) version 0.2.0 or newer.
55
+
56
+ ## Performance
57
+
58
+ Results of looking for words within distance 1 of ‘alien’ in a 20,000-word dictionary:
59
+
60
+ Loading 20000 words from dictionary ... 0.273s
61
+ Building tree ... 57.331s
62
+ Linear scan to find expected terms ... 5.711s
63
+ Query tree ... 0.133s
64
+ 2.1% of tree was queried
65
+
66
+ This means that the BK-tree is about 40 times as fast as a linear search,
67
+ although building the initial tree took 10 times as long as a linear search.
68
+
69
+ As the threshold increases, the benefit is reduced. At threshold 3:
70
+
71
+ Query tree ... 3.368s
72
+ 62.9% of tree was queried
73
+
74
+ ## Limitations
75
+
76
+ * Memory usage: around 6 MB for a 20,000-word tree.
77
+ * Maximum tree depth is limited by the stack.
@@ -0,0 +1,75 @@
1
+ require 'text/levenshtein'
2
+ require 'yaml'
3
+
4
module BK
  # Paul Battley 2007
  # See http://blog.notdot.net/archives/30-Damn-Cool-Algorithms,-Part-1-BK-Trees.html
  # and http://www.dcc.uchile.cl/~gnavarro/ps/spire98.2.ps.gz

  # Default distance callable: delegates to Text::Levenshtein.distance.
  class LevenshteinDistancer
    # Returns the distance between +a+ and +b+.
    def call(a, b)
      Text::Levenshtein.distance(a, b)
    end
  end

  # A single term in the tree. Children are stored in a Hash keyed by their
  # distance from this node's term.
  class Node
    attr_reader :term, :children

    # term      - the value stored at this node.
    # distancer - object responding to call(a, b) with a number.
    def initialize(term, distancer)
      @term = term
      @children = {}
      @distancer = distancer
    end

    # Inserts +term+ below this node: it descends into the child that sits at
    # the same distance, or becomes a new child when that slot is free.
    def add(term)
      score = distance(term)
      if (child = children[score])
        child.add(term)
      else
        children[score] = Node.new(term, @distancer)
      end
    end

    # Records in +collected+ (a Hash of term => distance, mutated in place)
    # every term in this subtree whose distance to +term+ is at most
    # +threshold+.
    def query(term, threshold, collected)
      distance_at_node = distance(term)
      collected[self.term] = distance_at_node if distance_at_node <= threshold

      # Only children whose score lies within +threshold+ of the distance at
      # this node are visited. The lower bound is clamped at zero so that no
      # lookups are wasted on negative scores.
      lowest = [distance_at_node - threshold, 0].max
      (lowest..(distance_at_node + threshold)).each do |score|
        child = children[score]
        child.query(term, threshold, collected) if child
      end
    end

    # Distance between +term+ and this node's own term.
    def distance(term)
      @distancer.call(term, self.term)
    end
  end

  # Public entry point: a BK-tree of terms under a given distance callable.
  class Tree
    # distancer - object responding to call(a, b) with a number; defaults to
    #             Levenshtein distance.
    def initialize(distancer = LevenshteinDistancer.new)
      @root = nil
      @distancer = distancer
    end

    # Adds +term+ to the tree; the first term added becomes the root.
    def add(term)
      if @root
        @root.add(term)
      else
        @root = Node.new(term, @distancer)
      end
    end

    # Returns a Hash of term => distance for every stored term within
    # +threshold+ of +term+. An empty tree yields an empty Hash.
    def query(term, threshold)
      collected = {}
      @root.query(term, threshold, collected) if @root
      collected
    end

    # Writes a YAML serialization of the whole tree (including its distancer)
    # to +stream+, which must respond to write.
    def export(stream)
      stream.write(YAML.dump(self))
    end

    # Rebuilds a tree from YAML previously produced by #export.
    #
    # SECURITY: this deserializes arbitrary Ruby objects, so only import
    # streams from a trusted origin.
    def self.import(stream)
      data = stream.read
      # Newer Psych releases make YAML.load refuse custom classes and aliases,
      # both of which an exported tree contains; unsafe_load restores the
      # original behaviour where it is available.
      if YAML.respond_to?(:unsafe_load)
        YAML.unsafe_load(data)
      else
        YAML.load(data)
      end
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'bk'
2
+
3
module BK
  # Mixin that renders a node and its subtree as Graphviz DOT statements.
  # The including class must respond to +term+ and +children+ (a Hash of
  # distance => child node).
  module DotGraphable
    # Returns the DOT statements for this node followed by those of each
    # child subtree (in ascending distance order) and the connecting edges.
    def graph
      subtrees = children.sort_by { |distance, _child| distance }.map do |distance, child|
        child.graph +
          %{edge [label = "#{distance}"]\n"#{term}" -> "#{child.term}"\n}
      end
      %{"#{term}" [label = "#{term}"]\n} + subtrees.join
    end
  end

  class Node
    include DotGraphable
  end

  class Tree
    # Returns the whole tree as a DOT digraph. An empty tree produces an
    # empty digraph instead of raising on the missing root.
    def dot_graph
      body = @root ? @root.graph : nil
      ["digraph G {", body, "}"].compact.join("\n")
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'bk'
2
+
3
module BK
  # Mixin that converts a node and its subtree into plain nested arrays and
  # hashes. The including class must respond to +term+ and +children+.
  module Dumpable
    # Returns [term] for a leaf, otherwise [term, { score => child dump }].
    def dump
      return [term] if children.empty?

      nested = {}
      children.each { |score, child| nested[score] = child.dump }
      [term, nested]
    end
  end

  class Node
    include Dumpable
  end

  class Tree
    # Returns the dump of the root node, or an empty array for an empty tree.
    def dump
      return [] unless @root

      @root.dump
    end
  end
end
@@ -0,0 +1,3 @@
1
module BK
  # Gem version string; frozen so it cannot be mutated by accident.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,11 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'bk'
3
+ require 'bk/dot_graph'
4
+
5
# Reads one term per line from standard input, adds each to the tree, and
# writes a DOT snapshot of the tree after every insertion to bk-NNNN.dot.
tree = BK::Tree.new
$stdin.each_with_index do |raw_line, index|
  tree.add(raw_line.strip)
  snapshot_name = format('bk-%04d.dot', index)
  File.open(snapshot_name, 'w') { |file| file << tree.dot_graph }
end
@@ -0,0 +1,63 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'bk'
3
+
4
# Levenshtein distancer that tallies how many times it is called once
# counting has been switched on.
class CountingLevenshteinDistancer < BK::LevenshteinDistancer
  attr_reader :count

  def initialize
    @counting = false
    @count = 0
  end

  # From this point on, every call increments the counter.
  def start_counting
    @counting = true
  end

  # Counts the call when enabled, then delegates to the parent distancer.
  def call(a, b)
    if @counting
      @count += 1
    end
    super
  end
end
21
+
22
# Prints +message+, runs the given block, prints the elapsed wall-clock
# seconds to three decimal places, and returns the block's value.
def time(message)
  started_at = Time.now
  print "#{message} ... "
  $stdout.flush
  result = yield
  elapsed = Time.now - started_at
  puts format("%0.3fs", elapsed)
  result
end
30
+
31
# Benchmark: compares a linear Levenshtein scan with a BK-tree query for the
# same search term and threshold, then reports how much of the tree the
# query had to examine.
search_term = 'alien'
threshold = 1
distancer = CountingLevenshteinDistancer.new

terms = time('Loading 10 K words from dictionary') do
  File.read('/usr/share/dict/words').scan(/\w+/).first(10000)
end

tree = time('Building tree') do
  built = BK::Tree.new(distancer)
  terms.each { |term| built.add(term) }
  built
end

expected = time('Linear scan to find expected terms') do
  matches = {}
  terms.each do |candidate|
    d = Text::Levenshtein.distance(candidate, search_term)
    matches[candidate] = d if d <= threshold
  end
  matches
end

# Only distance computations made during the query itself are counted.
distancer.start_counting

actual = time('Query tree') { tree.query(search_term, threshold) }

raise 'Results of linear and tree scan differ' unless expected == actual

puts format('%0.1f%% of tree was queried', (distancer.count * 100.0) / terms.length)
@@ -0,0 +1,3 @@
1
# Load every Ruby file that sits next to this one, so that running this
# single file executes the whole suite.
#
# Paths are expanded to absolute form because, on Ruby 1.9.2 and later,
# `require` does not search the current directory for relative paths such as
# "test/foo.rb". This file itself is skipped so it is not loaded a second
# time, and the list is sorted for a deterministic load order.
this_file = File.expand_path(__FILE__)
Dir[File.join(File.dirname(this_file), '*.rb')].sort.each do |f|
  require f unless f == this_file
end
@@ -0,0 +1,66 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'test/unit'
3
+ require 'bk'
4
+ require 'bk/dump'
5
+
6
# White-box tests that check the exact tree shape (via Tree#dump) produced
# by successive insertions.
class BKTreeBuildingWhiteBoxTest < Test::Unit::TestCase
  attr_reader :tree

  def setup
    @tree = BK::Tree.new
  end

  def test_should_build_root
    insert('book')
    assert_equal ['book'], tree.dump
  end

  def test_should_add_one_term
    insert('book', 'rook')
    shape = ['book', { 1 => ['rook'] }]
    assert_equal(shape, tree.dump)
  end

  def test_should_add_second_term
    insert('book', 'rook', 'nooks')
    shape = ['book', {
      1 => ['rook'],
      2 => ['nooks']
    }]
    assert_equal(shape, tree.dump)
  end

  def test_should_add_third_term
    insert('book', 'rook', 'nooks', 'boon')
    shape = ['book', {
      1 => ['rook', { 2 => ['boon'] }],
      2 => ['nooks']
    }]
    assert_equal(shape, tree.dump)
  end

  def test_should_add_fourth_term
    insert('book', 'rook', 'nooks', 'boon', 'boot')
    shape = ['book', {
      1 => ['rook', { 2 => ['boon', { 1 => ['boot'] }] }],
      2 => ['nooks']
    }]
    assert_equal(shape, tree.dump)
  end

  private

  # Adds the given words to the tree under test, in order.
  def insert(*words)
    words.each { |word| tree.add(word) }
  end
end
@@ -0,0 +1,33 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'test/unit'
3
+ require 'bk'
4
+ require 'stringio'
5
+
6
# Checks that a tree gives the same query results as a linear scan after
# being exported to a stream and imported back.
class BKTreeImportAndExportTest < Test::Unit::TestCase
  def test_should_give_correct_results_after_exporting_and_reimporting
    terms = %w[
      lorem ipsum dolor sit amet consectetuer adipiscing elit donec eget lectus vivamus nec
      odio non ipsum adipiscing ornare etiam sapien
    ].uniq

    tree = BK::Tree.new
    terms.each { |term| tree.add(term) }

    # Round-trip the tree through an in-memory stream.
    stream = StringIO.new
    tree.export(stream)
    stream.rewind
    tree = BK::Tree.import(stream)

    search_term = 'sapient'
    threshold = 1
    expected = {}
    terms.each do |t|
      d = Text::Levenshtein.distance(t, search_term)
      expected[t] = d if d <= threshold
    end

    assert expected.any?
    assert_equal expected, tree.query(search_term, threshold)
  end
end
@@ -0,0 +1,84 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'test/unit'
3
+ require 'bk'
4
+
5
# Checks that a tree query returns exactly what a brute-force scan finds.
class BKTreeQueryAccuracyTest < Test::Unit::TestCase
  def test_should_match_the_results_of_a_linear_scan
    terms = %w[
      lorem ipsum dolor sit amet consectetuer adipiscing elit donec eget lectus vivamus nec
      odio non ipsum adipiscing ornare etiam sapien
    ].uniq

    tree = BK::Tree.new
    terms.each { |term| tree.add(term) }

    search_term = 'sapient'
    threshold = 1
    expected = {}
    terms.each do |t|
      d = Text::Levenshtein.distance(t, search_term)
      expected[t] = d if d <= threshold
    end

    assert expected.any?
    assert_equal expected, tree.query(search_term, threshold)
  end
end
27
+
28
# Checks that a query computes distances only for the nodes it needs to
# visit, and in the expected order.
class BKTreeSearchSpaceTest < Test::Unit::TestCase

  # Levenshtein distancer that logs each [a, b] pair it is asked to compare
  # once recording has been switched on.
  class RecordingLevenshteinDistancer < BK::LevenshteinDistancer
    attr_reader :history

    def initialize
      @history = []
      # Initialize the flag that #call actually reads.
      @recording = false
    end

    def call(a, b)
      @history << [a, b] if @recording
      super
    end

    # From this point on, every comparison is appended to #history.
    def start_recording
      @recording = true
    end
  end

  def test_should_compare_only_necessary_nodes
    terms = %w[
      infighting
      birded
      inebriation
      stargazers
      troika
      bostonians
      contemplating
      gamey
      skydove
      scandalously
      archaeological
      soundness
      tightwads
      wanderlust
    ]
    distancer = RecordingLevenshteinDistancer.new
    tree = BK::Tree.new(distancer)
    terms.each do |term|
      tree.add(term)
    end
    # Comparisons made while building the tree are deliberately not recorded.
    distancer.start_recording
    tree.query('game', 1)
    expected = [
      %w[ game infighting ],
      %w[ game contemplating],
      %w[ game birded],
      %w[ game gamey ],
      %w[ game troika ],
      %w[ game skydove],
      %w[ game soundness ]
    ]
    assert_equal expected, distancer.history
  end
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bk
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Paul Battley
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-13 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: text
16
+ requirement: &70195843345880 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70195843345880
25
+ - !ruby/object:Gem::Dependency
26
+ name: test-unit
27
+ requirement: &70195843345460 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70195843345460
36
+ description: Burkhard Keller Tree implementation in Ruby
37
+ email: pbattley@gmail.com
38
+ executables: []
39
+ extensions: []
40
+ extra_rdoc_files: []
41
+ files:
42
+ - README.md
43
+ - Gemfile
44
+ - lib/bk/dot_graph.rb
45
+ - lib/bk/dump.rb
46
+ - lib/bk/version.rb
47
+ - lib/bk.rb
48
+ - samples/graph.rb
49
+ - samples/performance.rb
50
+ - test/test_all.rb
51
+ - test/test_building_tree.rb
52
+ - test/test_import_and_export.rb
53
+ - test/test_querying_tree.rb
54
+ homepage: https://github.com/threedaymonk/bktree
55
+ licenses: []
56
+ post_install_message:
57
+ rdoc_options: []
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubyforge_project:
74
+ rubygems_version: 1.8.10
75
+ signing_key:
76
+ specification_version: 3
77
+ summary: Burkhard Keller Tree implementation in Ruby
78
+ test_files:
79
+ - test/test_all.rb
80
+ - test/test_building_tree.rb
81
+ - test/test_import_and_export.rb
82
+ - test/test_querying_tree.rb