hintable_levenshtein 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +35 -0
- data/Rakefile +48 -0
- data/VERSION +1 -0
- data/lib/hintable_levenshtein.rb +129 -0
- data/lib/hintable_levenshtein/event.rb +31 -0
- data/lib/hintable_levenshtein/rule_set.rb +70 -0
- data/spec/hl_spec.rb +30 -0
- data/spec/spec.opts +7 -0
- metadata +62 -0
data/README.rdoc
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
= Hintable Levenshtein
|
2
|
+
|
3
|
+
Levenshtein distances but with extra hints. Perhaps adding or deleting a space is not as big a change as other things, or
|
4
|
+
substituting a 'c' for a 'k' is again a cheaper operation than just any arbitrary change.
|
5
|
+
|
6
|
+
Just an example
|
7
|
+
|
8
|
+
english_rules = [
|
9
|
+
HintableLevenshtein::RuleSet.new(0.3, HintableLevenshtein::Rule.insert(/[\.,!]/)),
|
10
|
+
HintableLevenshtein::RuleSet.new(0.3, HintableLevenshtein::Rule.delete(/[\.,!]/)),
|
11
|
+
HintableLevenshtein::RuleSet.new(0.4, HintableLevenshtein::Rule.substitute('!' => '.')),
|
12
|
+
HintableLevenshtein::RuleSet.new(0.4, HintableLevenshtein::Rule.substitute('!' => ',')),
|
13
|
+
HintableLevenshtein::RuleSet.new(0.75, HintableLevenshtein::Rule.insert(' '), HintableLevenshtein::Rule.insert(' ')),
|
14
|
+
HintableLevenshtein::RuleSet.new(0.5, HintableLevenshtein::Rule.insert(' ')),
|
15
|
+
HintableLevenshtein::RuleSet.new(0.5, HintableLevenshtein::Rule.delete(' ')),
|
16
|
+
HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('z' => 's')),
|
17
|
+
HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('k' => 'c')),
|
18
|
+
HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('u' => 'o')),
|
19
|
+
HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('e' => 'a')),
|
20
|
+
HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('i' => 'y')),
|
21
|
+
HintableLevenshtein::RuleSet.new(1, HintableLevenshtein::Rule.delete),
|
22
|
+
HintableLevenshtein::RuleSet.new(1, HintableLevenshtein::Rule.insert),
|
23
|
+
HintableLevenshtein::RuleSet.new(1, HintableLevenshtein::Rule.substitute)
|
24
|
+
]
|
25
|
+
|
26
|
+
a = "hello kitten pizza!!"
|
27
|
+
b = "hello cittin pissssa.."
|
28
|
+
|
29
|
+
puts "normal levenshtein: #{HintableLevenshtein.new.distance(a, b)}"
|
30
|
+
puts "hinted levenshtein: #{HintableLevenshtein.new(english_rules).distance(a, b)}"
|
31
|
+
|
32
|
+
Would output:
|
33
|
+
|
34
|
+
normal levenshtein: 11.0
|
35
|
+
hinted levenshtein: 7.15
|
data/Rakefile
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
# Put ./lib on the load path so the gem's own code can be required below.
lib_path = File.expand_path("lib")
$LOAD_PATH.unshift(lib_path) unless $LOAD_PATH.include?(lib_path)

require 'hintable_levenshtein'

# Gem packaging tasks. Jeweler is optional: when it cannot be loaded we only
# print a hint and carry on with the remaining tasks.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    blurb = "Levenshtein with hints"

    gem.name = "hintable_levenshtein"
    gem.summary = blurb
    gem.description = blurb
    gem.email = "joshbuddy@gmail.com"
    gem.homepage = "http://github.com/joshbuddy/hintable_levenshtein"
    gem.authors = ["Joshua Hull"]
    gem.files = FileList["[A-Z]*", "{lib,spec}/**/*"]
    gem.rubyforge_project = 'joshbuddy-hintable_levenshtein'
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
end

# Spec tasks: `rake spec` is an alias for `rake spec:all`.
require 'spec'
require 'spec/rake/spectask'
task :spec => 'spec:all'
namespace :spec do
  Spec::Rake::SpecTask.new(:all) do |spec_task|
    spec_task.spec_opts ||= []
    spec_task.spec_opts.push("-rubygems", "--options", "spec/spec.opts")
    spec_task.spec_files = FileList['spec/**/*_spec.rb']
  end
end

desc "Run all examples with RCov"
Spec::Rake::SpecTask.new('spec_with_rcov') do |rcov_task|
  rcov_task.spec_files = FileList['spec/**/*.rb']
  rcov_task.rcov = true
  rcov_task.rcov_opts = ['--exclude', 'spec']
end

# RDoc generation into ./rdoc, with the README as the landing page.
require 'rake/rdoctask'
desc "Generate documentation"
Rake::RDocTask.new do |rdoc|
  rdoc.main = "README.rdoc"
  rdoc.rdoc_files.include("README.rdoc", "lib/**/*.rb")
  rdoc.rdoc_dir = 'rdoc'
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.0.2
|
@@ -0,0 +1,129 @@
|
|
1
|
+
# Levenshtein edit distance whose individual edit operations can be re-priced
# by rule sets ("hints").
#
# When no rule sets are supplied, every delete, insert and substitute costs 1,
# so #distance returns the plain Levenshtein distance (as a Float).
class HintableLevenshtein

  autoload :Event,   File.join(File.dirname(__FILE__), 'hintable_levenshtein', 'event')
  autoload :RuleSet, File.join(File.dirname(__FILE__), 'hintable_levenshtein', 'rule_set')
  # Rule lives in the same file as RuleSet. Registering it explicitly means
  # HintableLevenshtein::Rule resolves even when RuleSet has not been touched yet.
  autoload :Rule,    File.join(File.dirname(__FILE__), 'hintable_levenshtein', 'rule_set')

  # Cursor into the distance matrix: x indexes the source string, y the target.
  Position = Struct.new(:x, :y)

  # new_rule_set - optional Array of RuleSet objects appended after any rules
  #                declared in the block.
  # block        - optional; instance_eval'd so it can call #delete, #insert
  #                and #substitute to declare single-rule rule sets.
  #
  # If neither source yields a rule, the three unit-cost default rules are used.
  def initialize(new_rule_set = nil, &block)
    @rule_set = []

    instance_eval(&block) if block
    rule_set.concat(new_rule_set) if new_rule_set

    if rule_set.empty?
      rule_set << RuleSet.new(1, Rule.delete)
      rule_set << RuleSet.new(1, Rule.insert)
      rule_set << RuleSet.new(1, Rule.substitute)
    end
  end

  # Distinct rule-set lengths, largest first (memoized).
  def rule_set_sizes
    @rule_set_sizes ||= rule_set.map { |r| r.rules.size }.uniq.sort.reverse
  end

  # Length of the longest rule set.
  def largest_rule_size
    rule_set_sizes.first
  end

  # Declares a rule set pricing a deletion matched by +match+ at +score+.
  def delete(score, match)
    add_rule_set(RuleSet.new(score, Rule.delete(match)))
  end

  # Declares a rule set pricing an insertion matched by +match+ at +score+.
  def insert(score, match)
    add_rule_set(RuleSet.new(score, Rule.insert(match)))
  end

  # Declares a rule set pricing a substitution matched by +match+ at +score+.
  def substitute(score, match)
    add_rule_set(RuleSet.new(score, Rule.substitute(match)))
  end

  # Returns the hinted edit distance between strings +s+ and +t+ as a Float.
  def distance(s, t)
    matrix = levenshtein_matrix(s, t)
    steps = calculate_steps(s, t, matrix)
    calculate_score(steps)
  end

  private

  attr_reader :rule_set

  # Appends a rule set and drops the memoized sizes so they are recomputed.
  # Returns the rule_set array, like the plain `rule_set << ...` it replaces.
  def add_rule_set(set)
    @rule_set_sizes = nil
    rule_set << set
  end

  # Builds the (s.size + 1) x (t.size + 1) unit-cost edit distance table.
  def levenshtein_matrix(s, t)
    d = Array.new(s.size + 1) { Array.new(t.size + 1) }

    (0..s.size).each do |m|
      (0..t.size).each do |n|
        d[m][n] =
          if m * n == 0
            # First row/column: distance to or from the empty prefix.
            m + n
          else
            [
              d[m - 1][n] + 1,
              d[m][n - 1] + 1,
              d[m - 1][n - 1] + (s[m - 1] == t[n - 1] ? 0 : 1)
            ].min
          end
      end
    end

    d
  end

  # Walks the matrix back from the bottom-right corner until a cell holding 0
  # is reached, recording an Event for every move that lowers the cell value.
  # Events are therefore ordered from the end of the strings to the start.
  def calculate_steps(s, t, matrix)
    position = Position.new(s.size, t.size)
    value = matrix[position.x][position.y]
    steps = []

    while matrix[position.x][position.y] != 0
      previous_position = position.dup

      if position.x == 0
        # Source exhausted: only insertions remain.
        position.y -= 1
      elsif position.y == 0
        # Target exhausted: only deletions remain.
        position.x -= 1
      else
        up       = matrix[position.x - 1][position.y]
        left     = matrix[position.x][position.y - 1]
        diagonal = matrix[position.x - 1][position.y - 1]

        # Ties prefer the diagonal, then an insertion, then a deletion.
        case [up, left, diagonal].min
        when diagonal
          position.x -= 1
          position.y -= 1
        when left
          position.y -= 1
        when up
          position.x -= 1
        end
      end

      # A move that leaves the value unchanged consumed matching characters.
      next if value == matrix[position.x][position.y]

      value = matrix[position.x][position.y]
      moved_x = previous_position.x == position.x + 1
      moved_y = previous_position.y == position.y + 1

      steps << if moved_x && moved_y
        Event.new(:substitute, { s[position.x] => t[position.y] })
      elsif moved_x
        Event.new(:delete, s[position.x].chr)
      else
        Event.new(:insert, t[position.y].chr)
      end
    end

    steps
  end

  # Prices the steps: longer rule sets are tried first, in declaration order,
  # against every window of consecutive steps. A matched window is consumed
  # (blanked out) so each step is charged at most once. Steps that no rule
  # matches contribute nothing to the score.
  def calculate_score(steps)
    remaining = steps.dup
    score = 0.0

    rule_set_sizes.each do |set_size|
      rule_set.select { |r| r.rules.size == set_size }.each do |rule|
        (0..(remaining.size - set_size)).each do |offset|
          window = remaining[offset, set_size]
          next if window.any? { |step| step.nil? }
          next unless rule.match?(window)

          remaining.fill(nil, offset, set_size)
          score += rule.score
        end
      end
    end

    score
  end

end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
class HintableLevenshtein
  # One edit step: a deletion or insertion of a single character (#chr), or a
  # substitution of one character (#from) by another (#to).
  class Event

    attr_reader :type, :chr, :from, :to

    # type - :substitute, or any other symbol (e.g. :delete / :insert).
    # chr  - for :substitute, a one-pair Hash { from => to } (or nil);
    #        otherwise the affected character (or nil).
    #
    # Characters that are not already Strings are converted with #chr
    # (e.g. an Integer character code). A nil character is stored as nil
    # rather than raising.
    def initialize(type, chr = nil)
      @type = type
      if type == :substitute
        if chr.nil?
          @from = @to = nil
        else
          @from = to_character(chr.keys.first)
          @to = to_character(chr.values.first)
        end
      else
        @chr = to_character(chr)
      end
    end

    def to_s
      if from
        "#{type} -> #{from}..#{to}"
      else
        "#{type} -> #{chr}"
      end
    end

    private

    # Returns nil and Strings untouched; anything else goes through #chr.
    def to_character(value)
      return value if value.nil? || value.is_a?(String)
      value.chr
    end
  end
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
class HintableLevenshtein
  # A score attached to an ordered list of rules. The set matches a list of
  # events when both have the same length and every rule matches the event at
  # the same index.
  class RuleSet

    # NOTE(review): Comparable is mixed in but this class defines no <=>, so the
    # ordering operators are not usable as written. Left untouched because
    # defining <=> would also change what == means for rule sets.
    include Comparable

    attr_reader :rules, :score

    # score - cost charged when the whole set matches.
    # rules - one Rule per consecutive event to match.
    def initialize(score, *rules)
      @rules = rules
      @score = score
    end

    def to_s
      "#{score} -> #{rules.collect{|r| r.to_s} * ', '}"
    end

    # Returns true when +events+ lines up one-to-one with the rules and every
    # rule accepts its event, false otherwise.
    def match?(events)
      return false if events.size != rules.size
      events.each_with_index do |event, idx|
        return false unless rules[idx].match?(event)
      end
      true
    end
  end

  # A pattern for a single edit event: an operation type plus an optional
  # matcher restricting which characters it applies to. Build instances with
  # Rule.delete, Rule.insert or Rule.substitute.
  class Rule
    # matcher - nil (any character), a String or Array of accepted characters,
    #           or any object answering === (e.g. a Regexp).
    def self.delete(matcher = nil)
      new(:delete, matcher)
    end

    # See Rule.delete for the accepted matcher forms.
    def self.insert(matcher = nil)
      new(:insert, matcher)
    end

    # matcher - Hash of { from => to } pairs, matched in either direction;
    #           an empty Hash accepts every substitution.
    def self.substitute(matcher = {})
      new(:substitute, matcher)
    end

    attr_reader :type, :matcher

    # Returns true when +event+ has this rule's type and satisfies the matcher,
    # false otherwise.
    def match?(event)
      return false unless event.type == type

      case type
      when :delete, :insert then character_match?(event.chr)
      when :substitute      then substitution_match?(event)
      else false
      end
    end

    def to_s
      "#{type} #{matcher.inspect}"
    end

    private

    def initialize(type, matcher)
      @type = type
      @matcher = matcher
    end

    # Applies a delete/insert matcher to a single character.
    def character_match?(chr)
      case matcher
      when nil    then true
      when String then !chr.nil? && matcher.include?(chr)
      when Array  then matcher.include?(chr)
      else matcher === chr ? true : false
      end
    end

    # Applies a substitute matcher; pairs are accepted in both directions.
    def substitution_match?(event)
      return true if matcher.nil? || matcher.empty?
      matcher[event.from] == event.to || matcher[event.to] == event.from
    end
  end
end
|
data/spec/hl_spec.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# Resolve the library relative to this file so the spec does not depend on the
# current directory being on the load path.
require File.expand_path('../lib/hintable_levenshtein', File.dirname(__FILE__))

describe HintableLevenshtein do
  it "should do this thing" do
    # Cheap punctuation and spacing edits, cheaper look-alike letter swaps, and
    # unit-cost fallbacks for everything else.
    english_rules = [
      HintableLevenshtein::RuleSet.new(0.3, HintableLevenshtein::Rule.insert(/[\.,!]/)),
      HintableLevenshtein::RuleSet.new(0.3, HintableLevenshtein::Rule.delete(/[\.,!]/)),
      HintableLevenshtein::RuleSet.new(0.4, HintableLevenshtein::Rule.substitute('!' => '.')),
      HintableLevenshtein::RuleSet.new(0.4, HintableLevenshtein::Rule.substitute('!' => ',')),
      HintableLevenshtein::RuleSet.new(0.75, HintableLevenshtein::Rule.insert(' '), HintableLevenshtein::Rule.insert(' ')),
      HintableLevenshtein::RuleSet.new(0.5, HintableLevenshtein::Rule.insert(' ')),
      HintableLevenshtein::RuleSet.new(0.5, HintableLevenshtein::Rule.delete(' ')),
      HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('z' => 's')),
      HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('k' => 'c')),
      HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('u' => 'o')),
      HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('e' => 'a')),
      HintableLevenshtein::RuleSet.new(0.7, HintableLevenshtein::Rule.substitute('i' => 'y')),
      HintableLevenshtein::RuleSet.new(1, HintableLevenshtein::Rule.delete),
      HintableLevenshtein::RuleSet.new(1, HintableLevenshtein::Rule.insert),
      HintableLevenshtein::RuleSet.new(1, HintableLevenshtein::Rule.substitute)
    ]

    a = "hello kitten pizza!!"
    b = "hello cittin pissssa.."

    HintableLevenshtein.new.distance(a, b).should == 11

    # The hinted score is a sum of fractional floats, so compare with a
    # tolerance instead of exact equality.
    hinted = HintableLevenshtein.new(english_rules).distance(a, b)
    (hinted - 7.15).abs.should < 1e-9
  end
end
|
data/spec/spec.opts
ADDED
metadata
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: hintable_levenshtein
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Joshua Hull
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-18 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Levenshtein with hints
|
17
|
+
email: joshbuddy@gmail.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README.rdoc
|
24
|
+
files:
|
25
|
+
- README.rdoc
|
26
|
+
- Rakefile
|
27
|
+
- VERSION
|
28
|
+
- lib/hintable_levenshtein.rb
|
29
|
+
- lib/hintable_levenshtein/event.rb
|
30
|
+
- lib/hintable_levenshtein/rule_set.rb
|
31
|
+
- spec/hl_spec.rb
|
32
|
+
- spec/spec.opts
|
33
|
+
has_rdoc: true
|
34
|
+
homepage: http://github.com/joshbuddy/hintable_levenshtein
|
35
|
+
licenses: []
|
36
|
+
|
37
|
+
post_install_message:
|
38
|
+
rdoc_options:
|
39
|
+
- --charset=UTF-8
|
40
|
+
require_paths:
|
41
|
+
- lib
|
42
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: "0"
|
47
|
+
version:
|
48
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: "0"
|
53
|
+
version:
|
54
|
+
requirements: []
|
55
|
+
|
56
|
+
rubyforge_project: joshbuddy-hintable_levenshtein
|
57
|
+
rubygems_version: 1.3.5
|
58
|
+
signing_key:
|
59
|
+
specification_version: 3
|
60
|
+
summary: Levenshtein with hints
|
61
|
+
test_files:
|
62
|
+
- spec/hl_spec.rb
|