levenshtein 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +15 -0
- data/README +8 -0
- data/VERSION +1 -0
- data/ext/levenshtein/extconf.rb +5 -0
- data/ext/levenshtein/levenshtein_c.c +122 -0
- data/lib/levenshtein.rb +100 -0
- data/test/test.rb +90 -0
- metadata +65 -0
data/LICENSE
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Copyright Erik Veenstra <levenshtein@erikveen.dds.nl>
|
2
|
+
#
|
3
|
+
# This program is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU General Public License,
|
5
|
+
# version 2, as published by the Free Software Foundation.
|
6
|
+
#
|
7
|
+
# This program is distributed in the hope that it will be
|
8
|
+
# useful, but WITHOUT ANY WARRANTY; without even the implied
|
9
|
+
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
10
|
+
# PURPOSE. See the GNU General Public License for more details.
|
11
|
+
#
|
12
|
+
# You should have received a copy of the GNU General Public
|
13
|
+
# License along with this program; if not, write to the Free
|
14
|
+
# Software Foundation, Inc., 59 Temple Place, Suite 330,
|
15
|
+
# Boston, MA 02111-1307 USA.
|
data/README
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# The Levenshtein distance is a metric for measuring the amount of difference
|
2
|
+
# between two sequences (i.e., the so-called edit distance). The Levenshtein
|
3
|
+
# distance between two strings is given by the minimum number of operations
|
4
|
+
# needed to transform one string into the other, where an operation is an
|
5
|
+
# insertion, deletion, or substitution of a single character.
|
6
|
+
#
|
7
|
+
# More information about the Levenshtein distance algorithm:
|
8
|
+
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/* Very old Ruby headers only expose the string internals through RSTRING(). */
#ifndef RSTRING_PTR
#define RSTRING_PTR(s) (RSTRING(s)->ptr)
#endif
#ifndef RSTRING_LEN
#define RSTRING_LEN(s) (RSTRING(s)->len)
#endif

/*
 * Levenshtein.distance_part2_fast(s1, s2, threshold)
 *
 * Computes the Levenshtein distance between the bytes of s1 and s2, keeping
 * only two rows of the dynamic-programming matrix in memory.
 *
 * rb_s1, rb_s2   Strings (anything else is converted with StringValue).
 * rb_threshold   nil for "no threshold", otherwise an Integer. Negative
 *                values behave like nil.
 *
 * Returns the distance as an Integer, or nil as soon as every value on the
 * row being computed is greater than or equal to the threshold.
 *
 * Raises TypeError when an argument cannot be converted.
 */
static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VALUE rb_threshold) {
  const char *s1, *s2;
  int l1, l2;
  int threshold;
  int *rows, *prev_row, *curr_row, *swap_row;
  int col, row;
  int curr_row_min, result;

  /* Convert both arguments before fetching any pointer: the conversion may
     run Ruby code, and the pointers must be taken after that is done. */

  StringValue(rb_s1);
  StringValue(rb_s2);

  /* Convert Ruby's s1 and s2 to C's s1 and s2. */

  s1 = RSTRING_PTR(rb_s1);
  l1 = (int) RSTRING_LEN(rb_s1);
  s2 = RSTRING_PTR(rb_s2);
  l2 = (int) RSTRING_LEN(rb_s2);

  /* Convert Ruby's threshold to C's threshold. NUM2INT checks the type and
     range; it may raise, so it runs before anything is allocated. */

  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);

  /* The Levenshtein Algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate both rows in one go. ALLOC_N raises NoMemoryError by itself, so
     there is no NULL to check for, and a single allocation cannot leave a
     half-allocated pair behind. */

  rows     = ALLOC_N(int, 2 * ((long) l1 + 1));
  prev_row = rows;
  curr_row = rows + l1 + 1;

  /* Initialize the current row. */

  for (col = 0; col <= l1; col++) {
    curr_row[col] = col;
  }

  for (row = 1; row <= l2; row++) {
    /* The row just finished becomes the previous row; its old storage is
       reused for the new current row (every cell gets overwritten below). */

    swap_row = prev_row;
    prev_row = curr_row;
    curr_row = swap_row;

    /* Calculate the values of the current row. */

    curr_row[0]  = row;
    curr_row_min = row;

    for (col = 1; col <= l1; col++) {
      /* Equal (cost=0) or Substitution (cost=1). */

      curr_row[col] = prev_row[col-1] + ((s1[col-1] == s2[row-1]) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as the whole row has reached the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(rows);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  /* Memory obtained from ALLOC_N has to be released with xfree. */

  xfree(rows);

  /* Return the Ruby version of the result. */

  return INT2NUM(result);
}
|
117
|
+
|
118
|
+
void Init_levenshtein_c() {
|
119
|
+
VALUE mLevenshtein = rb_define_module("Levenshtein");
|
120
|
+
|
121
|
+
rb_define_singleton_method(mLevenshtein, "distance_part2_fast" , levenshtein_distance_part2, 3);
|
122
|
+
}
|
data/lib/levenshtein.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
begin
|
2
|
+
require "levenshtein/levenshtein_c"
|
3
|
+
rescue LoadError
|
4
|
+
begin
|
5
|
+
require "levenshtein_c"
|
6
|
+
rescue LoadError
|
7
|
+
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance_part2. Using the slow Ruby version instead."
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
# The Levenshtein distance is a metric for measuring the amount of difference
|
12
|
+
# between two sequences (i.e., the so-called edit distance). The Levenshtein
|
13
|
+
# distance between two strings is given by the minimum number of operations
|
14
|
+
# needed to transform one string into the other, where an operation is an
|
15
|
+
# insertion, deletion, or substitution of a single character.
|
16
|
+
#
|
17
|
+
# More information about the Levenshtein distance algorithm:
|
18
|
+
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
19
|
+
|
20
|
+
module Levenshtein
  # Returns the Levenshtein distance as a number between 0.0 and 1.0.
  # It's basically the Levenshtein distance divided by the length of the longest string.
  #
  # The optional threshold is a fraction of the length of the longest string.
  # It is converted to an absolute threshold and handed to Levenshtein.distance;
  # nil is returned whenever Levenshtein.distance returns nil.

  def self.normalized_distance(s1, s2, threshold=nil)
    s1, s2 = s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.

    return 0.0	if s2.empty?	# Since s1.length <= s2.length, s1 must be empty as well.

    if threshold
      d	= self.distance(s1, s2, (threshold*s2.length+1).to_i)

      d && d.to_f/s2.length
    else
      self.distance(s1, s2).to_f/s2.length
    end
  end

  # Returns the Levenshtein distance between two byte strings.
  #
  # When a threshold is given, nil is returned as soon as it is clear that
  # the distance is going to be equal to or greater than the threshold.

  def self.distance(s1, s2, threshold=nil)
    s1, s2 = s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.

    # Handle some basic circumstances.

    return 0		if s1 == s2
    return s2.length	if s1.empty?
    return nil		if threshold and (s2.length-s1.length) >= threshold
    return nil		if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
    return nil		if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold

    # Do the expensive calculation on a subset of the strings only, if possible.

    # Skip the common head. It can't be longer than the short string.

    b	= 0

    while b < s1.length and s1[b, 1] == s2[b, 1]
      b += 1
    end

    # Skip the common tail. It must not run into the head that has been
    # skipped already: without this limit, the indexes become negative, wrap
    # around to the end of the strings and both substrings end up empty
    # (e.g. "aba" versus "ababa" used to result in 0 instead of 2).

    e1	= s1.length-1
    e2	= s2.length-1

    while e1 >= b and s1[e1, 1] == s2[e2, 1]
      e1 -= 1
      e2 -= 1
    end

    # Use (start, length) instead of a range: e1 might be b-1 (nothing left
    # of s1), and a range ending at -1 would select everything up to the end.

    distance_part2(s1[b, e1-b+1], s2[b, e2-b+1], threshold)
  end

  # Uses the C implementation if it has been loaded and the Ruby
  # implementation otherwise.

  def self.distance_part2(s1, s2, threshold)	# :nodoc:
    if respond_to?(:distance_part2_fast)
      distance_part2_fast(s1, s2, threshold)	# Implemented in C.
    else
      distance_part2_slow(s1, s2, threshold)	# Implemented in Ruby.
    end
  end

  # The Levenshtein algorithm itself, keeping track of the previous row and
  # the current row only.

  def self.distance_part2_slow(s1, s2, threshold)	# :nodoc:
    row	= (0..s1.length).to_a

    1.upto(s2.length) do |y|
      prow	= row
      row	= [y]

      1.upto(s1.length) do |x|
        row[x]	= [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
      end

      # Stop analysing as soon as the best possible result has reached the threshold.
      # (The minimum value in the next row will be equal to or greater than the minimum value in this row.)

      return nil	if threshold and row.min >= threshold
    end

    row[-1]
  end
end
|
data/test/test.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "levenshtein"
|
3
|
+
|
4
|
+
# Tests of the public interface: Levenshtein.distance and
# Levenshtein.normalized_distance.
class TestLevenshtein < Test::Unit::TestCase
  # The order of the arguments shouldn't matter.
  def test_erik_veenstra
    assert_equal 7, Levenshtein.distance("erik", "veenstra")
    assert_equal 7, Levenshtein.distance("veenstra", "erik")

    assert_in_delta 0.875, Levenshtein.normalized_distance("erik", "veenstra"), 0.01
    assert_in_delta 0.875, Levenshtein.normalized_distance("veenstra", "erik"), 0.01
  end

  # One or both of the strings are empty.
  def test_empty_string
    assert_equal 0, Levenshtein.distance("", "")
    assert_equal 3, Levenshtein.distance("", "foo")
    assert_equal 3, Levenshtein.distance("foo", "")

    assert_in_delta 0.0, Levenshtein.normalized_distance("", ""), 0.01
    assert_in_delta 1.0, Levenshtein.normalized_distance("", "foo"), 0.01
    assert_in_delta 1.0, Levenshtein.normalized_distance("foo", ""), 0.01
  end

  # Equal strings have a distance of zero.
  def test_same_string
    assert_equal 0, Levenshtein.distance("", "")
    assert_equal 0, Levenshtein.distance("foo", "foo")

    assert_in_delta 0.0, Levenshtein.normalized_distance("", ""), 0.01
    assert_in_delta 0.0, Levenshtein.normalized_distance("foo", "foo"), 0.01
  end

  # nil is expected once the threshold has been reached.
  def test_threshold
    assert_equal 3, Levenshtein.distance("foo", "foobar")
    assert_equal nil, Levenshtein.distance("foo", "foobar", 2)

    assert_in_delta 0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01
    assert_equal nil, Levenshtein.normalized_distance("foo", "foobar", 0.30)
  end

  # Strings that share a head, a tail, or both.
  def test_same_head_and_or_tail
    assert_equal 3, Levenshtein.distance("ab123cd", "abxyzcd")
    assert_equal 3, Levenshtein.distance("ab123", "abxyz")
    assert_equal 3, Levenshtein.distance("123cd", "xyzcd")

    assert_in_delta 0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01
    assert_in_delta 0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01
    assert_in_delta 0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01
  end
end
|
49
|
+
|
50
|
+
# Tests of the Ruby implementation of the algorithm, called directly.
class TestLevenshteinPart2Slow < Test::Unit::TestCase
  def test_erik_veenstra
    assert_equal 7, Levenshtein.distance_part2_slow("erik", "veenstra", nil)
  end

  # One or both of the strings are empty.
  def test_empty_string
    assert_equal 0, Levenshtein.distance_part2_slow("", "", nil)
    assert_equal 3, Levenshtein.distance_part2_slow("", "foo", nil)
  end

  # Equal strings have a distance of zero.
  def test_same_string
    assert_equal 0, Levenshtein.distance_part2_slow("", "", nil)
    assert_equal 0, Levenshtein.distance_part2_slow("foo", "foo", nil)
  end

  # nil is expected once the threshold has been reached.
  def test_threshold
    assert_equal 3, Levenshtein.distance_part2_slow("foo", "foobar", nil)
    assert_equal nil, Levenshtein.distance_part2_slow("foo", "foobar", 2)
  end
end
|
70
|
+
|
71
|
+
# Tests of the C implementation of the algorithm, called directly.
class TestLevenshteinPart2Fast < Test::Unit::TestCase
  def test_erik_veenstra
    assert_equal 7, Levenshtein.distance_part2_fast("erik", "veenstra", nil)
  end

  # One or both of the strings are empty.
  def test_empty_string
    assert_equal 0, Levenshtein.distance_part2_fast("", "", nil)
    assert_equal 3, Levenshtein.distance_part2_fast("", "foo", nil)
  end

  # Equal strings have a distance of zero.
  def test_same_string
    assert_equal 0, Levenshtein.distance_part2_fast("", "", nil)
    assert_equal 0, Levenshtein.distance_part2_fast("foo", "foo", nil)
  end

  # nil is expected once the threshold has been reached.
  def test_threshold
    assert_equal 3, Levenshtein.distance_part2_fast("foo", "foobar", nil)
    assert_equal nil, Levenshtein.distance_part2_fast("foo", "foobar", 2)
  end
end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: levenshtein
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Erik Veenstra
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-05-24 00:00:00 +02:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Calculates the Levenshtein distance between two byte strings.
|
17
|
+
email: levenshtein@erikveen.dds.nl
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/levenshtein/extconf.rb
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- lib/levenshtein.rb
|
26
|
+
- ext/levenshtein
|
27
|
+
- ext/levenshtein/extconf.rb
|
28
|
+
- ext/levenshtein/levenshtein_c.c
|
29
|
+
- README
|
30
|
+
- LICENSE
|
31
|
+
- VERSION
|
32
|
+
has_rdoc: true
|
33
|
+
homepage: http://www.erikveen.dds.nl/levenshtein/index.html
|
34
|
+
post_install_message:
|
35
|
+
rdoc_options:
|
36
|
+
- README
|
37
|
+
- LICENSE
|
38
|
+
- VERSION
|
39
|
+
- --title
|
40
|
+
- levenshtein (0.1.0)
|
41
|
+
- --main
|
42
|
+
- README
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "0"
|
50
|
+
version:
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: "0"
|
56
|
+
version:
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project: levenshtein
|
60
|
+
rubygems_version: 1.0.1
|
61
|
+
signing_key:
|
62
|
+
specification_version: 2
|
63
|
+
summary: Calculates the Levenshtein distance between two byte strings.
|
64
|
+
test_files:
|
65
|
+
- test/test.rb
|