damerau-levenshtein 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc
CHANGED
|
@@ -1,8 +1,64 @@
|
|
|
1
1
|
= damerau-levenshtein
|
|
2
2
|
|
|
3
|
+
The damerau-levenshtein gem finds the edit distance between two UTF-8 or ASCII encoded strings with O(N**2) efficiency.
|
|
3
4
|
|
|
4
|
-
|
|
5
|
+
This gem implements the pure Levenshtein algorithm and the Damerau modification of it (where a transposition of 2 adjacent characters counts as 1 edit distance).
|
|
6
|
+
It also includes the Boehmer & Rees 2008 modification of the Damerau algorithm, where a transposition of blocks bigger than 1 character is taken as 1 edit distance as well (Boehmer & Rees 2008).
|
|
7
|
+
|
|
8
|
+
require 'damerau-levenshtein'
|
|
9
|
+
DamerauLevenshtein.distance('Something', 'Smoething') #returns 1
|
|
5
10
|
|
|
11
|
+
The damerau-levenshtein gem is compatible with Ruby versions 1.8.7 and 1.9.2 and higher.
|
|
12
|
+
|
|
13
|
+
== Installation
|
|
14
|
+
|
|
15
|
+
gem install damerau-levenshtein
|
|
16
|
+
|
|
17
|
+
== Examples
|
|
18
|
+
|
|
19
|
+
require 'rubygems' #not needed for ruby >= 1.9.0
|
|
20
|
+
require 'damerau-levenshtein'
|
|
21
|
+
dl = DamerauLevenshtein
|
|
22
|
+
|
|
23
|
+
* compare using Damerau Levenshtein algorithm
|
|
24
|
+
|
|
25
|
+
dl.distance("Something", "Smoething") #returns 1
|
|
26
|
+
|
|
27
|
+
* compare using Levenshtein algorithm
|
|
28
|
+
|
|
29
|
+
dl.distance("Something", "Smoething", 0) #returns 2
|
|
30
|
+
|
|
31
|
+
* compare using Boehmer & Rees modification
|
|
32
|
+
|
|
33
|
+
dl.distance("Something", "meSothing", 2) #returns 2 instead of 4
|
|
34
|
+
|
|
35
|
+
* comparison of words with utf-8 characters should work fine:
|
|
36
|
+
|
|
37
|
+
dl.distance("Sjöstedt", "Sjostedt") #returns 1
|
|
38
|
+
|
|
39
|
+
== Description
|
|
40
|
+
|
|
41
|
+
DamerauLevenshtein.distance takes 4 arguments:
|
|
42
|
+
|
|
43
|
+
* string1
|
|
44
|
+
* string2
|
|
45
|
+
* block_size (default is 1)
|
|
46
|
+
* max_distance (default is 10)
|
|
47
|
+
|
|
48
|
+
block_size determines the maximum number of characters in a transposition block:
|
|
49
|
+
|
|
50
|
+
block_size = 0 (transposition does not count -- it is a pure Levenshtein algorithm)
|
|
51
|
+
block_size = 1 (transposition between 2 adjacent characters -- it is the pure Damerau-Levenshtein algorithm)
|
|
52
|
+
block_size = 2 (transposition between blocks as big as 2 characters -- so abcd and cdab count as edit distance 2, not 4)
|
|
53
|
+
block_size = 3 (transposition between blocks as big as 3 characters -- so abcdef and defabc count as edit distance 3, not 6)
|
|
54
|
+
etc.
|
|
55
|
+
|
|
56
|
+
max_distance is a threshold after which the algorithm gives up and returns max_distance instead of the real edit distance.
|
|
57
|
+
|
|
58
|
+
The Levenshtein algorithm is expensive, so it makes sense to give up when the edit distance becomes too big. The argument max_distance does just that.
|
|
59
|
+
|
|
60
|
+
DamerauLevenshtein.distance('abcdefg', '1234567', 0, 3) #give up when edit distance exceeds 3
|
|
61
|
+
|
|
6
62
|
== Contributing to damerau-levenshtein
|
|
7
63
|
|
|
8
64
|
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.5.1
|
|
1
|
+
0.5.2
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# Generated by jeweler
|
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
|
4
|
+
# -*- encoding: utf-8 -*-
|
|
5
|
+
|
|
6
|
+
Gem::Specification.new do |s|
|
|
7
|
+
s.name = %q{damerau-levenshtein}
|
|
8
|
+
s.version = "0.5.2"
|
|
9
|
+
|
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
11
|
+
s.authors = ["Dmitry Mozzherin"]
|
|
12
|
+
s.date = %q{2011-07-23}
|
|
13
|
+
s.description = %q{Calculation of editing distance for 2 strings using Levenshtein or Damerau-Levenshtein algorithms}
|
|
14
|
+
s.email = %q{dmozzherin@gmail.com}
|
|
15
|
+
s.extensions = ["ext/damerau_levenshtein_binding/extconf.rb"]
|
|
16
|
+
s.extra_rdoc_files = [
|
|
17
|
+
"LICENSE.txt",
|
|
18
|
+
"README.rdoc"
|
|
19
|
+
]
|
|
20
|
+
s.files = [
|
|
21
|
+
"Gemfile",
|
|
22
|
+
"Gemfile.lock",
|
|
23
|
+
"LICENSE.txt",
|
|
24
|
+
"README.rdoc",
|
|
25
|
+
"Rakefile",
|
|
26
|
+
"VERSION",
|
|
27
|
+
"damerau-levenshtein.gemspec",
|
|
28
|
+
"ext/damerau_levenshtein_binding/damerau_levenshtein_binding.c",
|
|
29
|
+
"lib/damerau-levenshtein.rb",
|
|
30
|
+
"spec/damerau-levenshtein_spec.rb",
|
|
31
|
+
"spec/damerau_levenshtein_test.txt",
|
|
32
|
+
"spec/spec_helper.rb"
|
|
33
|
+
]
|
|
34
|
+
s.homepage = %q{http://github.com/dimus/damerau-levenshtein}
|
|
35
|
+
s.licenses = ["MIT"]
|
|
36
|
+
s.require_paths = ["lib"]
|
|
37
|
+
s.rubygems_version = %q{1.3.7}
|
|
38
|
+
s.summary = %q{Calculation of editing distance for 2 strings using Levenshtein or Damerau-Levenshtein algorithms}
|
|
39
|
+
|
|
40
|
+
if s.respond_to? :specification_version then
|
|
41
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
|
42
|
+
s.specification_version = 3
|
|
43
|
+
|
|
44
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
|
45
|
+
s.add_development_dependency(%q<rake-compiler>, [">= 0"])
|
|
46
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
|
|
47
|
+
s.add_development_dependency(%q<cucumber>, [">= 0"])
|
|
48
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
|
49
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.6.0"])
|
|
50
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
|
51
|
+
s.add_development_dependency(%q<ruby-debug19>, [">= 0"])
|
|
52
|
+
s.add_development_dependency(%q<ruby-prof>, [">= 0"])
|
|
53
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
|
54
|
+
s.add_development_dependency(%q<mocha>, [">= 0"])
|
|
55
|
+
else
|
|
56
|
+
s.add_dependency(%q<rake-compiler>, [">= 0"])
|
|
57
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
|
58
|
+
s.add_dependency(%q<cucumber>, [">= 0"])
|
|
59
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
|
60
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
|
|
61
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
|
62
|
+
s.add_dependency(%q<ruby-debug19>, [">= 0"])
|
|
63
|
+
s.add_dependency(%q<ruby-prof>, [">= 0"])
|
|
64
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
|
65
|
+
s.add_dependency(%q<mocha>, [">= 0"])
|
|
66
|
+
end
|
|
67
|
+
else
|
|
68
|
+
s.add_dependency(%q<rake-compiler>, [">= 0"])
|
|
69
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
|
70
|
+
s.add_dependency(%q<cucumber>, [">= 0"])
|
|
71
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
|
72
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
|
|
73
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
|
74
|
+
s.add_dependency(%q<ruby-debug19>, [">= 0"])
|
|
75
|
+
s.add_dependency(%q<ruby-prof>, [">= 0"])
|
|
76
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
|
77
|
+
s.add_dependency(%q<mocha>, [">= 0"])
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
|
|
@@ -17,9 +17,15 @@ VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VAL
|
|
|
17
17
|
int stop_execution = 0;
|
|
18
18
|
int min = 0;
|
|
19
19
|
int current_distance = 0;
|
|
20
|
+
int pure_levenshtein = 0;
|
|
20
21
|
|
|
21
22
|
int block_size = NUM2INT(_block_size);
|
|
22
23
|
int max_distance = NUM2INT(_max_distance);
|
|
24
|
+
|
|
25
|
+
if (block_size == 0) {
|
|
26
|
+
pure_levenshtein = 1;
|
|
27
|
+
block_size = 1;
|
|
28
|
+
}
|
|
23
29
|
|
|
24
30
|
VALUE *sv = RARRAY_PTR(_s);
|
|
25
31
|
VALUE *tv = RARRAY_PTR(_t);
|
|
@@ -87,7 +93,7 @@ VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VAL
|
|
|
87
93
|
min = del;
|
|
88
94
|
if (ins < min) min = ins;
|
|
89
95
|
//if (i == 2 && j==2) return INT2NUM(swap2+5);
|
|
90
|
-
if (
|
|
96
|
+
if (pure_levenshtein == 0 && i >= block && j >= block && swap1 == 1 && swap2 == 1){
|
|
91
97
|
transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
|
|
92
98
|
if (transp < min) min = transp;
|
|
93
99
|
block = 0;
|
data/lib/damerau-levenshtein.rb
CHANGED
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
require 'damerau_levenshtein_binding'
|
|
4
4
|
|
|
5
|
-
|
|
6
|
-
|
|
5
|
+
module DamerauLevenshtein
|
|
6
|
+
extend DamerauLevenshteinBinding
|
|
7
7
|
|
|
8
|
-
def distance(str1, str2, block_size =
|
|
8
|
+
def self.distance(str1, str2, block_size = 1, max_distance = 10)
|
|
9
9
|
distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
|
|
10
10
|
end
|
|
11
11
|
end
|
|
@@ -3,11 +3,10 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
|
3
3
|
describe "DamerauLevenshtein" do
|
|
4
4
|
it 'should get tests' do
|
|
5
5
|
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_test.txt', 5) do |y|
|
|
6
|
-
dl = DamerauLevenshtein
|
|
6
|
+
dl = DamerauLevenshtein
|
|
7
7
|
if y
|
|
8
8
|
res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
|
|
9
9
|
puts y if res != y[4].to_i
|
|
10
|
-
puts y
|
|
11
10
|
res.should == y[4].to_i
|
|
12
11
|
end
|
|
13
12
|
end
|
|
@@ -48,24 +48,24 @@ Potamomus|Pomatomus|10|1|2
|
|
|
48
48
|
# 1 utf-8 substitution
|
|
49
49
|
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
|
|
50
50
|
# transposition (Levenshtein takes it as 2 substitutions)
|
|
51
|
-
Pomatomus|oPmatomus|10|
|
|
52
|
-
Pomatomus|oPmatomus|10|
|
|
51
|
+
Pomatomus|oPmatomus|10|0|2
|
|
52
|
+
Pomatomus|oPmatomus|10|1|1
|
|
53
53
|
# transposition (Levenshtein takes it as 2 substitutions)
|
|
54
|
-
Pomatomus|Pomatomsu|10|
|
|
55
|
-
Pomatomus|Pomatomsu|10|
|
|
54
|
+
Pomatomus|Pomatomsu|10|0|2
|
|
55
|
+
Pomatomus|Pomatomsu|10|1|1
|
|
56
56
|
# transposition
|
|
57
|
-
Pomtaomus|Pomatomus|10|
|
|
58
|
-
Pomtaomus|Pomatomus|10|
|
|
57
|
+
Pomtaomus|Pomatomus|10|0|2
|
|
58
|
+
Pomtaomus|Pomatomus|10|1|1
|
|
59
59
|
# transposition
|
|
60
|
-
Pomatoums|Pomatomus|10|
|
|
61
|
-
Pomatoums|Pomatomus|10|
|
|
60
|
+
Pomatoums|Pomatomus|10|0|2
|
|
61
|
+
Pomatoums|Pomatomus|10|1|1
|
|
62
62
|
# transposition + substitution
|
|
63
|
-
PoamtosusPomatomus|10|
|
|
64
|
-
PoamtosusPomatomus|10|
|
|
63
|
+
Poamtosus|Pomatomus|10|0|3
|
|
64
|
+
Poamtosus|Pomatomus|10|1|2
|
|
65
65
|
|
|
66
66
|
# transposition with utf-8 char
|
|
67
|
-
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|
|
|
68
|
-
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|
|
|
67
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|0|2
|
|
68
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|1|1
|
|
69
69
|
|
|
70
70
|
#it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size > 2)
|
|
71
71
|
serrulatus|serratulus|10|2|2
|
metadata
CHANGED
|
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
|
5
5
|
segments:
|
|
6
6
|
- 0
|
|
7
7
|
- 5
|
|
8
|
-
-
|
|
9
|
-
version: 0.5.1
|
|
8
|
+
- 2
|
|
9
|
+
version: 0.5.2
|
|
10
10
|
platform: ruby
|
|
11
11
|
authors:
|
|
12
12
|
- Dmitry Mozzherin
|
|
@@ -169,6 +169,7 @@ files:
|
|
|
169
169
|
- README.rdoc
|
|
170
170
|
- Rakefile
|
|
171
171
|
- VERSION
|
|
172
|
+
- damerau-levenshtein.gemspec
|
|
172
173
|
- ext/damerau_levenshtein_binding/damerau_levenshtein_binding.c
|
|
173
174
|
- lib/damerau-levenshtein.rb
|
|
174
175
|
- spec/damerau-levenshtein_spec.rb
|
|
@@ -189,7 +190,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
189
190
|
requirements:
|
|
190
191
|
- - ">="
|
|
191
192
|
- !ruby/object:Gem::Version
|
|
192
|
-
hash:
|
|
193
|
+
hash: 1769781403176282343
|
|
193
194
|
segments:
|
|
194
195
|
- 0
|
|
195
196
|
version: "0"
|