bk 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
# A sample Gemfile
# Gems are fetched over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Runtime dependency (the library requires 'text/levenshtein').
gem "text"

# Only needed to run the test suite.
group :test do
  gem "test-unit"
end
@@ -0,0 +1,77 @@
1
+ # BK-Tree implementation in Ruby
2
+
3
+ If you don’t know what a BK-tree is, these links should provide a good explanation and introduction.
4
+
5
+ * [Damn Cool Algorithms, Part 1: BK-Trees](http://blog.notdot.net/2007/4/Damn-Cool-Algorithms-Part-1-BK-Trees)
6
+ * [Fast Approximate String Matching in a Dictionary](http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.21.3317)
7
+
8
+ ## Usage
9
+
10
+ require "bk"
11
+ tree = BK::Tree.new # Use the default Levenshtein distance algorithm
12
+
13
+ Add items to the tree:
14
+
15
+ tree.add "cat"
16
+ tree.add "dog"
17
+ tree.add "monkey"
18
+ tree.add "donkey"
19
+
20
+ Find all items within distance 1 of ‘munkey’:
21
+
22
+ tree.query("munkey", 1)
23
+ # => {"monkey"=>1}
24
+
25
+ Find all items within distance 2 of ‘munkey’:
26
+
27
+ tree.query("munkey", 2)
28
+ # => {"donkey"=>2, "monkey"=>1}
29
+
30
+ You can specify a custom distance algorithm by passing any object whose
31
+ `call(a, b)` method returns the distance between `a` and `b` as a number:
32
+
33
+ custom_algorithm = lambda{ |a, b|
34
+ Text::Levenshtein.distance(a, b)
35
+ }
36
+
37
+ tree = BK::Tree.new(custom_algorithm)
38
+
39
+ Note that the distance function *must* satisfy the
40
+ _triangle inequality_, i.e. _d(x,z) ≤ d(x,y) + d(y,z)_ for all terms x, y and z.
41
+
42
+ The precomputed tree can be exported to and reimported later from an IO-like object:
43
+
44
+ File.open("tree", "wb") do |f|
45
+ tree.export(f)
46
+ end
47
+
48
+ File.open("tree", "rb") do |f|
49
+ tree = BK::Tree.import(f)
50
+ end
51
+
52
+ ## Dependencies
53
+
54
+ * [text](http://rubygems.org/gems/text) version 0.2.0 or newer.
55
+
56
+ ## Performance
57
+
58
+ Results of looking for words within distance 1 of ‘alien’ in a 20,000-word dictionary:
59
+
60
+ Loading 20000 words from dictionary ... 0.273s
61
+ Building tree ... 57.331s
62
+ Linear scan to find expected terms ... 5.711s
63
+ Query tree ... 0.133s
64
+ 2.1% of tree was queried
65
+
66
+ This means that the BK-tree is about 40 times as fast as a linear search,
67
+ although building the initial tree took 10 times as long as a linear search.
68
+
69
+ As the threshold increases, the benefit is reduced. At threshold 3:
70
+
71
+ Query tree ... 3.368s
72
+ 62.9% of tree was queried
73
+
74
+ ## Limitations
75
+
76
+ * Memory usage: around 6 MB for a 20,000-word tree.
77
+ * Maximum tree depth is limited by the stack.
@@ -0,0 +1,75 @@
1
+ require 'text/levenshtein'
2
+ require 'yaml'
3
+
4
module BK
  # Paul Battley 2007
  # See http://blog.notdot.net/archives/30-Damn-Cool-Algorithms,-Part-1-BK-Trees.html
  # and http://www.dcc.uchile.cl/~gnavarro/ps/spire98.2.ps.gz

  # Default distance callable used by Tree: delegates to
  # Text::Levenshtein.distance.
  class LevenshteinDistancer
    # Returns whatever Text::Levenshtein.distance(a, b) returns.
    def call(a, b)
      Text::Levenshtein.distance(a, b)
    end
  end

  # One term of the tree plus its subtrees. Children are stored in a Hash
  # keyed by their distance from this node's term, so a node has at most one
  # direct child per distance value.
  class Node
    attr_reader :term, :children

    # term      - the value stored at this node.
    # distancer - object responding to call(a, b); shared with every
    #             descendant created through #add.
    def initialize(term, distancer)
      @term = term
      @children = {}
      @distancer = distancer
    end

    # Inserts term below this node: if a child already exists at the same
    # distance the insertion recurses into it, otherwise a new child node is
    # created at that distance.
    def add(term)
      score = distance(term)
      child = children[score]
      if child
        child.add(term)
      else
        children[score] = Node.new(term, @distancer)
      end
    end

    # Records this node in collected (term => distance) when it lies within
    # threshold of term, then descends only into children whose key lies in
    # [distance - threshold, distance + threshold].
    #
    # The candidate keys are produced by iterating a Range, so the distancer
    # is expected to return integers.
    def query(term, threshold, collected)
      distance_at_node = distance(term)
      collected[self.term] = distance_at_node if distance_at_node <= threshold
      ((distance_at_node - threshold)..(distance_at_node + threshold)).each do |score|
        child = children[score]
        child.query(term, threshold, collected) if child
      end
    end

    # Distance between term and this node's own term, per the distancer.
    def distance(term)
      @distancer.call(term, self.term)
    end
  end

  # Public entry point: a BK-tree that starts out empty and grows through #add.
  class Tree
    # distancer - object responding to call(a, b) with the distance between
    #             a and b. Defaults to Levenshtein distance.
    def initialize(distancer = LevenshteinDistancer.new)
      @root = nil
      @distancer = distancer
    end

    # Adds term to the tree; the first term added becomes the root.
    def add(term)
      if @root
        @root.add(term)
      else
        @root = Node.new(term, @distancer)
      end
    end

    # Returns a Hash of term => distance for every stored term whose distance
    # from term is at most threshold. An empty tree yields an empty Hash
    # rather than raising on the missing root.
    def query(term, threshold)
      collected = {}
      @root.query(term, threshold, collected) if @root
      collected
    end

    # Writes a YAML serialization of the whole tree (including its distancer)
    # to stream, which must respond to #write.
    def export(stream)
      stream.write(YAML.dump(self))
    end

    # Rebuilds a tree previously written by #export from stream, which must
    # respond to #read.
    #
    # SECURITY: this deserializes arbitrary Ruby objects. Only import data
    # that this program (or another trusted party) exported.
    #
    # Newer Psych releases make YAML.load refuse non-core classes and
    # aliases, both of which an exported tree contains, so the explicit
    # unsafe loader is used whenever it is available.
    def self.import(stream)
      data = stream.read
      YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(data) : YAML.load(data)
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'bk'
2
+
3
module BK
  # Mixin that renders a node and its subtree as Graphviz DOT statements.
  # The including class must provide #term and #children (a Hash of
  # distance => child node).
  module DotGraphable
    # Returns the DOT statements for this node followed by those of each
    # child subtree (in ascending distance order) and the labelled edge
    # leading to that child.
    def graph
      id = dot_quote(term)
      subtrees = children.sort_by { |distance, _child| distance }.map do |distance, child|
        child.graph +
          %{edge [label = "#{distance}"]\n#{id} -> #{dot_quote(child.term)}\n}
      end
      %{#{id} [label = #{id}]\n} + subtrees.join
    end

    private

    # Wraps value in double quotes, backslash-escaping embedded quotes and
    # backslashes so that such terms cannot terminate the DOT string early.
    def dot_quote(value)
      '"' + value.to_s.gsub(/["\\]/) { |ch| "\\#{ch}" } + '"'
    end
  end

  class Node
    include DotGraphable
  end

  class Tree
    # Returns the whole tree as a DOT digraph. An empty tree produces an
    # empty digraph instead of raising on the missing root.
    def dot_graph
      body = @root ? [@root.graph] : []
      ["digraph G {", *body, "}"].join("\n")
    end
  end
end
@@ -0,0 +1,23 @@
1
+ require 'bk'
2
+
3
module BK
  # Mixin that converts a node and its subtree into plain nested Arrays and
  # Hashes, which is convenient for white-box assertions in tests. The
  # including class must provide #term and #children.
  module Dumpable
    # Returns [term] for a leaf, or [term, { distance => child_dump, ... }]
    # for a node that has children.
    def dump
      return [term] if children.empty?

      subtrees = children.each_with_object({}) do |(score, child), dumped|
        dumped[score] = child.dump
      end
      [term, subtrees]
    end
  end

  class Node
    include Dumpable
  end

  class Tree
    # Returns the dump of the root node, or an empty Array for an empty tree.
    def dump
      @root ? @root.dump : []
    end
  end
end
@@ -0,0 +1,3 @@
1
module BK
  # Release number of the gem. Frozen so the constant cannot be mutated in
  # place by callers.
  VERSION = '0.0.1'.freeze
end
@@ -0,0 +1,11 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'bk'
3
+ require 'bk/dot_graph'
4
+
5
# Reads one term per line from standard input. After each insertion the
# current tree is written as a DOT graph to bk-NNNN.dot, where NNNN is the
# zero-based line index, giving one snapshot per added term.
tree = BK::Tree.new
$stdin.each_line.with_index do |line, index|
  tree.add(line.strip)
  snapshot_path = format('bk-%04d.dot', index)
  File.open(snapshot_path, 'w') { |io| io << tree.dot_graph }
end
@@ -0,0 +1,63 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'bk'
3
+
4
# Levenshtein distancer that tallies how many comparisons it performs once
# counting has been switched on with #start_counting.
class CountingLevenshteinDistancer < BK::LevenshteinDistancer
  # Number of calls made since counting was switched on.
  attr_reader :count

  def initialize
    @counting = false
    @count = 0
  end

  # Calls made before this point are not included in #count.
  def start_counting
    @counting = true
  end

  # Bumps the tally when counting is on, then defers to the parent class for
  # the actual distance.
  def call(a, b)
    @count = @count.succ if @counting
    super(a, b)
  end
end
21
+
22
# Prints "<message> ... ", runs the given block, then prints the elapsed
# wall-clock time in seconds with three decimals. Returns the block's value.
def time(message)
  started_at = Time.now
  $stdout.print("#{message} ... ")
  $stdout.flush # show the label before the (possibly slow) block runs
  result = yield
  elapsed = Time.now - started_at
  puts format('%0.3fs', elapsed)
  result
end
30
+
31
# Benchmark: build a tree from the first 10,000 words of the system
# dictionary, then compare a BK-tree query against a linear scan.
search_term = 'alien'
threshold = 1
distancer = CountingLevenshteinDistancer.new

terms = time('Loading 10 K words from dictionary') do
  File.read('/usr/share/dict/words').scan(/\w+/).first(10000)
end

tree = time('Building tree') do
  built = BK::Tree.new(distancer)
  terms.each { |term| built.add(term) }
  built
end

# Reference result: compare the search term against every word.
expected = time('Linear scan to find expected terms') do
  terms.each_with_object({}) do |candidate, matches|
    d = Text::Levenshtein.distance(candidate, search_term)
    matches[candidate] = d if d <= threshold
  end
end

# Only comparisons made while querying should be counted, not those made
# while building the tree.
distancer.start_counting

actual = time('Query tree') { tree.query(search_term, threshold) }

raise 'Results of linear and tree scan differ' unless expected == actual

queried_percent = (distancer.count * 100.0) / terms.length
puts format('%0.1f%% of tree was queried', queried_percent)
@@ -0,0 +1,3 @@
1
+ Dir[ File.join( File.dirname(__FILE__), '*.rb' )].each do |f|
2
+ require f
3
+ end
@@ -0,0 +1,66 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'test/unit'
3
+ require 'bk'
4
+ require 'bk/dump'
5
+
6
# White-box tests: insert words one at a time and check the exact shape of
# the resulting tree through Tree#dump (nested [term, {distance => subtree}]).
class BKTreeBuildingWhiteBoxTest < Test::Unit::TestCase
  attr_reader :tree

  def setup
    @tree = BK::Tree.new
  end

  def test_should_build_root
    add_terms 'book'
    assert_equal ['book'], tree.dump
  end

  def test_should_add_one_term
    add_terms 'book', 'rook'
    expected = ['book', { 1 => ['rook'] }]
    assert_equal expected, tree.dump
  end

  def test_should_add_second_term
    add_terms 'book', 'rook', 'nooks'
    expected = ['book', { 1 => ['rook'], 2 => ['nooks'] }]
    assert_equal expected, tree.dump
  end

  def test_should_add_third_term
    add_terms 'book', 'rook', 'nooks', 'boon'
    rook_subtree = ['rook', { 2 => ['boon'] }]
    expected = ['book', { 1 => rook_subtree, 2 => ['nooks'] }]
    assert_equal expected, tree.dump
  end

  def test_should_add_fourth_term
    add_terms 'book', 'rook', 'nooks', 'boon', 'boot'
    boon_subtree = ['boon', { 1 => ['boot'] }]
    rook_subtree = ['rook', { 2 => boon_subtree }]
    expected = ['book', { 1 => rook_subtree, 2 => ['nooks'] }]
    assert_equal expected, tree.dump
  end

  private

  # Adds each word to the tree under test, in the order given.
  def add_terms(*words)
    words.each { |word| tree.add(word) }
  end
end
@@ -0,0 +1,33 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'test/unit'
3
+ require 'bk'
4
+ require 'stringio'
5
+
6
# Checks that a tree written with #export and read back with .import answers
# a query exactly like a linear scan over the original terms.
class BKTreeImportAndExportTest < Test::Unit::TestCase
  def test_should_give_correct_results_after_exporting_and_reimporting
    terms = %w[
      lorem ipsum dolor sit amet consectetuer adipiscing elit donec eget lectus vivamus nec
      odio non ipsum adipiscing ornare etiam sapien
    ].uniq

    original = BK::Tree.new
    terms.each { |term| original.add(term) }

    # Round-trip through an in-memory stream.
    stream = StringIO.new
    original.export(stream)
    stream.rewind
    reimported = BK::Tree.import(stream)

    search_term = 'sapient'
    threshold = 1
    expected = {}
    terms.each do |candidate|
      d = Text::Levenshtein.distance(candidate, search_term)
      expected[candidate] = d if d <= threshold
    end

    # Guard against a vacuous comparison of two empty results.
    assert expected.any?
    assert_equal expected, reimported.query(search_term, threshold)
  end
end
@@ -0,0 +1,84 @@
1
+ $:.unshift( File.join( File.dirname(__FILE__), '..', 'lib' ))
2
+ require 'test/unit'
3
+ require 'bk'
4
+
5
# Checks that querying the tree returns exactly the terms (and distances)
# that a brute-force comparison against every term would find.
class BKTreeQueryAccuracyTest < Test::Unit::TestCase
  def test_should_match_the_results_of_a_linear_scan
    terms = %w[
      lorem ipsum dolor sit amet consectetuer adipiscing elit donec eget lectus vivamus nec
      odio non ipsum adipiscing ornare etiam sapien
    ].uniq

    tree = BK::Tree.new
    terms.each { |term| tree.add(term) }

    search_term = 'sapient'
    threshold = 1
    expected = terms.each_with_object({}) do |candidate, matches|
      d = Text::Levenshtein.distance(candidate, search_term)
      matches[candidate] = d if d <= threshold
    end

    # Guard against a vacuous comparison of two empty results.
    assert expected.any?
    assert_equal expected, tree.query(search_term, threshold)
  end
end
27
+
28
# Checks that a query only computes distances for the nodes it needs to
# visit, by recording every comparison made during the query.
class BKTreeSearchSpaceTest < Test::Unit::TestCase

  # Levenshtein distancer that logs each [a, b] pair it is asked to compare
  # once recording has been switched on.
  class RecordingLevenshteinDistancer < BK::LevenshteinDistancer
    attr_reader :history

    def initialize
      @history = []
      # Initialize the flag that #call actually reads.
      @recording = false
    end

    def call(a, b)
      @history << [a, b] if @recording
      super
    end

    # Comparisons made before this point (e.g. while building the tree) are
    # not logged.
    def start_recording
      @recording = true
    end
  end

  def test_should_compare_only_necessary_nodes
    terms = %w[
      infighting
      birded
      inebriation
      stargazers
      troika
      bostonians
      contemplating
      gamey
      skydove
      scandalously
      archaeological
      soundness
      tightwads
      wanderlust
    ]
    distancer = RecordingLevenshteinDistancer.new
    tree = BK::Tree.new(distancer)
    terms.each do |term|
      tree.add(term)
    end
    distancer.start_recording
    tree.query('game', 1)
    # Exact sequence of comparisons expected for this query, in visit order.
    expected = [
      %w[ game infighting ],
      %w[ game contemplating],
      %w[ game birded],
      %w[ game gamey ],
      %w[ game troika ],
      %w[ game skydove],
      %w[ game soundness ]
    ]
    assert_equal expected, distancer.history
  end
end
metadata ADDED
@@ -0,0 +1,82 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bk
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Paul Battley
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-13 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: text
16
+ requirement: &70195843345880 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70195843345880
25
+ - !ruby/object:Gem::Dependency
26
+ name: test-unit
27
+ requirement: &70195843345460 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70195843345460
36
+ description: Burkhard Keller Tree implementation in Ruby
37
+ email: pbattley@gmail.com
38
+ executables: []
39
+ extensions: []
40
+ extra_rdoc_files: []
41
+ files:
42
+ - README.md
43
+ - Gemfile
44
+ - lib/bk/dot_graph.rb
45
+ - lib/bk/dump.rb
46
+ - lib/bk/version.rb
47
+ - lib/bk.rb
48
+ - samples/graph.rb
49
+ - samples/performance.rb
50
+ - test/test_all.rb
51
+ - test/test_building_tree.rb
52
+ - test/test_import_and_export.rb
53
+ - test/test_querying_tree.rb
54
+ homepage: https://github.com/threedaymonk/bktree
55
+ licenses: []
56
+ post_install_message:
57
+ rdoc_options: []
58
+ require_paths:
59
+ - lib
60
+ required_ruby_version: !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ! '>='
70
+ - !ruby/object:Gem::Version
71
+ version: '0'
72
+ requirements: []
73
+ rubyforge_project:
74
+ rubygems_version: 1.8.10
75
+ signing_key:
76
+ specification_version: 3
77
+ summary: Burkhard Keller Tree implementation in Ruby
78
+ test_files:
79
+ - test/test_all.rb
80
+ - test/test_building_tree.rb
81
+ - test/test_import_and_export.rb
82
+ - test/test_querying_tree.rb