damerau-levenshtein 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc
CHANGED
@@ -1,8 +1,64 @@
|
|
1
1
|
= damerau-levenshtein
|
2
2
|
|
3
|
+
The damerau-levenshtein gem finds the edit distance between two UTF-8 or ASCII encoded strings with O(N**2) efficiency.
|
3
4
|
|
4
|
-
|
5
|
+
This gem implements the pure Levenshtein algorithm and the Damerau modification of it (where a transposition of 2 adjacent characters counts as an edit distance of 1).
|
6
|
+
It also includes the Boehmer & Rees 2008 modification of the Damerau algorithm, where a transposition of blocks bigger than 1 character is recognized as well, so that swapping two adjacent n-character blocks counts as an edit distance of n instead of 2n (Boehmer & Rees 2008).
|
7
|
+
|
8
|
+
require 'damerau-levenshtein'
|
9
|
+
DamerauLevenshtein.distance('Something', 'Smoething') #returns 1
|
5
10
|
|
11
|
+
Gem damerau-levenshtein is compatible with ruby versions 1.8.7 and 1.9.2 and higher
|
12
|
+
|
13
|
+
== Installation
|
14
|
+
|
15
|
+
gem install damerau-levenshtein
|
16
|
+
|
17
|
+
== Examples
|
18
|
+
|
19
|
+
require 'rubygems' #not needed for ruby >= 1.9.0
|
20
|
+
require 'damerau-levenshtein'
|
21
|
+
dl = DamerauLevenshtein
|
22
|
+
|
23
|
+
* compare using Damerau Levenshtein algorithm
|
24
|
+
|
25
|
+
dl.distance("Something", "Smoething") #returns 1
|
26
|
+
|
27
|
+
* compare using Levenshtein algorithm
|
28
|
+
|
29
|
+
dl.distance("Something", "Smoething", 0) #returns 2
|
30
|
+
|
31
|
+
* compare using Boehmer & Rees modification
|
32
|
+
|
33
|
+
dl.distance("Something", "meSothing", 2) #returns 2 instead of 4
|
34
|
+
|
35
|
+
* comparison of words with utf-8 characters should work fine:
|
36
|
+
|
37
|
+
dl.distance("Sjöstedt", "Sjostedt") #returns 1
|
38
|
+
|
39
|
+
== Description
|
40
|
+
|
41
|
+
DamerauLevenshtein.distance takes 4 arguments:
|
42
|
+
|
43
|
+
* string1
|
44
|
+
* string2
|
45
|
+
* block_size (default is 1)
|
46
|
+
* max_distance (default is 10)
|
47
|
+
|
48
|
+
block_size determines maximum number of characters in a transposition block:
|
49
|
+
|
50
|
+
block_size = 0 (transposition does not count -- it is a pure Levenshtein algorithm)
|
51
|
+
block_size = 1 (transposition between 2 adjacent characters -- it is pure Damerau-Levenshtein algorithm)
|
52
|
+
block_size = 2 (transposition between blocks as big as 2 characters -- so abcd and cdab counts as edit distance 2, not 4)
|
53
|
+
block_size = 3 (transposition between blocks as big as 3 characters -- so abcdef and defabc counts as edit distance 3, not 6)
|
54
|
+
etc.
|
55
|
+
|
56
|
+
max_distance is a threshold after which the algorithm gives up and returns max_distance instead of the real edit distance.
|
57
|
+
|
58
|
+
Levenshtein algorithm is expensive, so it makes sense to give up when edit distance is becoming too big. The argument max_distance does just that.
|
59
|
+
|
60
|
+
DamerauLevenshtein.distance('abcdefg', '1234567', 0, 3) #give up when edit distance exceeds 3
|
61
|
+
|
6
62
|
== Contributing to damerau-levenshtein
|
7
63
|
|
8
64
|
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.
|
1
|
+
0.5.2
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{damerau-levenshtein}
|
8
|
+
s.version = "0.5.2"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Dmitry Mozzherin"]
|
12
|
+
s.date = %q{2011-07-23}
|
13
|
+
s.description = %q{Calculation of editing distance for 2 strings using Levenshtein or Damerau-Levenshtein algorithms}
|
14
|
+
s.email = %q{dmozzherin@gmail.com}
|
15
|
+
s.extensions = ["ext/damerau_levenshtein_binding/extconf.rb"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"LICENSE.txt",
|
18
|
+
"README.rdoc"
|
19
|
+
]
|
20
|
+
s.files = [
|
21
|
+
"Gemfile",
|
22
|
+
"Gemfile.lock",
|
23
|
+
"LICENSE.txt",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"damerau-levenshtein.gemspec",
|
28
|
+
"ext/damerau_levenshtein_binding/damerau_levenshtein_binding.c",
|
29
|
+
"lib/damerau-levenshtein.rb",
|
30
|
+
"spec/damerau-levenshtein_spec.rb",
|
31
|
+
"spec/damerau_levenshtein_test.txt",
|
32
|
+
"spec/spec_helper.rb"
|
33
|
+
]
|
34
|
+
s.homepage = %q{http://github.com/dimus/damerau-levenshtein}
|
35
|
+
s.licenses = ["MIT"]
|
36
|
+
s.require_paths = ["lib"]
|
37
|
+
s.rubygems_version = %q{1.3.7}
|
38
|
+
s.summary = %q{Calculation of editing distance for 2 strings using Levenshtein or Damerau-Levenshtein algorithms}
|
39
|
+
|
40
|
+
if s.respond_to? :specification_version then
|
41
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
42
|
+
s.specification_version = 3
|
43
|
+
|
44
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
45
|
+
s.add_development_dependency(%q<rake-compiler>, [">= 0"])
|
46
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.3.0"])
|
47
|
+
s.add_development_dependency(%q<cucumber>, [">= 0"])
|
48
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
49
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.6.0"])
|
50
|
+
s.add_development_dependency(%q<rcov>, [">= 0"])
|
51
|
+
s.add_development_dependency(%q<ruby-debug19>, [">= 0"])
|
52
|
+
s.add_development_dependency(%q<ruby-prof>, [">= 0"])
|
53
|
+
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
54
|
+
s.add_development_dependency(%q<mocha>, [">= 0"])
|
55
|
+
else
|
56
|
+
s.add_dependency(%q<rake-compiler>, [">= 0"])
|
57
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
58
|
+
s.add_dependency(%q<cucumber>, [">= 0"])
|
59
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
60
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
|
61
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
62
|
+
s.add_dependency(%q<ruby-debug19>, [">= 0"])
|
63
|
+
s.add_dependency(%q<ruby-prof>, [">= 0"])
|
64
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
65
|
+
s.add_dependency(%q<mocha>, [">= 0"])
|
66
|
+
end
|
67
|
+
else
|
68
|
+
s.add_dependency(%q<rake-compiler>, [">= 0"])
|
69
|
+
s.add_dependency(%q<rspec>, ["~> 2.3.0"])
|
70
|
+
s.add_dependency(%q<cucumber>, [">= 0"])
|
71
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
72
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.0"])
|
73
|
+
s.add_dependency(%q<rcov>, [">= 0"])
|
74
|
+
s.add_dependency(%q<ruby-debug19>, [">= 0"])
|
75
|
+
s.add_dependency(%q<ruby-prof>, [">= 0"])
|
76
|
+
s.add_dependency(%q<shoulda>, [">= 0"])
|
77
|
+
s.add_dependency(%q<mocha>, [">= 0"])
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
@@ -17,9 +17,15 @@ VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VAL
|
|
17
17
|
int stop_execution = 0;
|
18
18
|
int min = 0;
|
19
19
|
int current_distance = 0;
|
20
|
+
int pure_levenshtein = 0;
|
20
21
|
|
21
22
|
int block_size = NUM2INT(_block_size);
|
22
23
|
int max_distance = NUM2INT(_max_distance);
|
24
|
+
|
25
|
+
if (block_size == 0) {
|
26
|
+
pure_levenshtein = 1;
|
27
|
+
block_size = 1;
|
28
|
+
}
|
23
29
|
|
24
30
|
VALUE *sv = RARRAY_PTR(_s);
|
25
31
|
VALUE *tv = RARRAY_PTR(_t);
|
@@ -87,7 +93,7 @@ VALUE method_distance_utf(VALUE self, VALUE _s, VALUE _t, VALUE _block_size, VAL
|
|
87
93
|
min = del;
|
88
94
|
if (ins < min) min = ins;
|
89
95
|
//if (i == 2 && j==2) return INT2NUM(swap2+5);
|
90
|
-
if (
|
96
|
+
if (pure_levenshtein == 0 && i >= block && j >= block && swap1 == 1 && swap2 == 1){
|
91
97
|
transp = d[(j - block * 2) * sl + i - block * 2] + cost + block -1;
|
92
98
|
if (transp < min) min = transp;
|
93
99
|
block = 0;
|
data/lib/damerau-levenshtein.rb
CHANGED
@@ -2,10 +2,10 @@
|
|
2
2
|
|
3
3
|
require 'damerau_levenshtein_binding'
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
module DamerauLevenshtein
|
6
|
+
extend DamerauLevenshteinBinding
|
7
7
|
|
8
|
-
def distance(str1, str2, block_size =
|
8
|
+
def self.distance(str1, str2, block_size = 1, max_distance = 10)
|
9
9
|
distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
|
10
10
|
end
|
11
11
|
end
|
@@ -3,11 +3,10 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
3
3
|
describe "DamerauLevenshtein" do
|
4
4
|
it 'should get tests' do
|
5
5
|
read_test_file(File.expand_path(File.dirname(__FILE__)) + '/damerau_levenshtein_test.txt', 5) do |y|
|
6
|
-
dl = DamerauLevenshtein
|
6
|
+
dl = DamerauLevenshtein
|
7
7
|
if y
|
8
8
|
res = dl.distance(y[0], y[1], y[3].to_i, y[2].to_i)
|
9
9
|
puts y if res != y[4].to_i
|
10
|
-
puts y
|
11
10
|
res.should == y[4].to_i
|
12
11
|
end
|
13
12
|
end
|
@@ -48,24 +48,24 @@ Potamomus|Pomatomus|10|1|2
|
|
48
48
|
# 1 utf-8 substitution
|
49
49
|
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Sjostedt 1921|10|1|1
|
50
50
|
# transposition (Levenshtein takes it as 2 substitutions)
|
51
|
-
Pomatomus|oPmatomus|10|
|
52
|
-
Pomatomus|oPmatomus|10|
|
51
|
+
Pomatomus|oPmatomus|10|0|2
|
52
|
+
Pomatomus|oPmatomus|10|1|1
|
53
53
|
# transposition (Levenshtein takes it as 2 substitutions)
|
54
|
-
Pomatomus|Pomatomsu|10|
|
55
|
-
Pomatomus|Pomatomsu|10|
|
54
|
+
Pomatomus|Pomatomsu|10|0|2
|
55
|
+
Pomatomus|Pomatomsu|10|1|1
|
56
56
|
# transposition
|
57
|
-
Pomtaomus|Pomatomus|10|
|
58
|
-
Pomtaomus|Pomatomus|10|
|
57
|
+
Pomtaomus|Pomatomus|10|0|2
|
58
|
+
Pomtaomus|Pomatomus|10|1|1
|
59
59
|
# transposition
|
60
|
-
Pomatoums|Pomatomus|10|
|
61
|
-
Pomatoums|Pomatomus|10|
|
60
|
+
Pomatoums|Pomatomus|10|0|2
|
61
|
+
Pomatoums|Pomatomus|10|1|1
|
62
62
|
# transposition + substitution
|
63
|
-
PoamtosusPomatomus|10|
|
64
|
-
PoamtosusPomatomus|10|
|
63
|
+
Poamtosus|Pomatomus|10|0|3
|
64
|
+
Poamtosus|Pomatomus|10|1|2
|
65
65
|
|
66
66
|
# transposition with utf-8 char
|
67
|
-
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|
|
68
|
-
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|
|
67
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|0|2
|
68
|
+
Cedarinia scabra Sjöstedt 1921|Cedarinia scabra Söjstedt 1921|10|1|1
|
69
69
|
|
70
70
|
#it should calculate Modified Damerau Levenshtein distance with 2 or more characters transposition (block size >= 2)
|
71
71
|
serrulatus|serratulus|10|2|2
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 5
|
8
|
-
-
|
9
|
-
version: 0.5.
|
8
|
+
- 2
|
9
|
+
version: 0.5.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Dmitry Mozzherin
|
@@ -169,6 +169,7 @@ files:
|
|
169
169
|
- README.rdoc
|
170
170
|
- Rakefile
|
171
171
|
- VERSION
|
172
|
+
- damerau-levenshtein.gemspec
|
172
173
|
- ext/damerau_levenshtein_binding/damerau_levenshtein_binding.c
|
173
174
|
- lib/damerau-levenshtein.rb
|
174
175
|
- spec/damerau-levenshtein_spec.rb
|
@@ -189,7 +190,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
189
190
|
requirements:
|
190
191
|
- - ">="
|
191
192
|
- !ruby/object:Gem::Version
|
192
|
-
hash:
|
193
|
+
hash: 1769781403176282343
|
193
194
|
segments:
|
194
195
|
- 0
|
195
196
|
version: "0"
|