levenshtein 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +15 -0
- data/README +8 -0
- data/VERSION +1 -0
- data/ext/levenshtein/extconf.rb +5 -0
- data/ext/levenshtein/levenshtein_c.c +122 -0
- data/lib/levenshtein.rb +100 -0
- data/test/test.rb +90 -0
- metadata +65 -0
data/LICENSE
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Copyright Erik Veenstra <levenshtein@erikveen.dds.nl>
|
2
|
+
#
|
3
|
+
# This program is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU General Public License,
|
5
|
+
# version 2, as published by the Free Software Foundation.
|
6
|
+
#
|
7
|
+
# This program is distributed in the hope that it will be
|
8
|
+
# useful, but WITHOUT ANY WARRANTY; without even the implied
|
9
|
+
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
10
|
+
# PURPOSE. See the GNU General Public License for more details.
|
11
|
+
#
|
12
|
+
# You should have received a copy of the GNU General Public
|
13
|
+
# License along with this program; if not, write to the Free
|
14
|
+
# Software Foundation, Inc., 59 Temple Place, Suite 330,
|
15
|
+
# Boston, MA 02111-1307 USA.
|
data/README
ADDED
@@ -0,0 +1,8 @@
|
|
1
|
+
# The Levenshtein distance is a metric for measuring the amount of difference
|
2
|
+
# between two sequences (i.e., the so-called edit distance). The Levenshtein
|
3
|
+
# distance between two strings is given by the minimum number of operations
|
4
|
+
# needed to transform one string into the other, where an operation is an
|
5
|
+
# insertion, deletion, or substitution of a single character.
|
6
|
+
#
|
7
|
+
# More information about the Levenshtein distance algorithm:
|
8
|
+
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,122 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/* Compatibility shims: fall back to direct struct access only when the
 * Ruby headers in use do not provide the RSTRING_PTR / RSTRING_LEN
 * accessors (newer headers no longer expose the struct fields). */
#ifndef RSTRING_PTR
#define RSTRING_PTR(str) (RSTRING(str)->ptr)
#endif
#ifndef RSTRING_LEN
#define RSTRING_LEN(str) (RSTRING(str)->len)
#endif

/*
 * Levenshtein.distance_part2_fast(s1, s2, threshold)
 *
 * Computes the Levenshtein distance between the bytes of s1 and s2 using
 * the classic two-row dynamic programming scheme.
 *
 *   rb_s1, rb_s2  - strings (anything StringValue accepts).
 *   rb_threshold  - nil for "no threshold", or an integer.  A negative
 *                   value is treated like nil.
 *
 * Returns the distance as a Ruby integer, or nil as soon as the smallest
 * value of a completed row is greater than or equal to the threshold.
 */
static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VALUE rb_threshold) {
  int threshold;
  int l1, l2;
  const char *s1, *s2;
  int *rows, *prev_row, *curr_row, *swap_row;
  int col, row;
  int cell, curr_row_min, result;

  /* Run every conversion that may raise or call back into Ruby before
   * taking raw pointers and before allocating, so nothing can be
   * invalidated or leaked by an exception. */

  StringValue(rb_s1);
  StringValue(rb_s2);
  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);

  s1 = RSTRING_PTR(rb_s1);
  l1 = (int)RSTRING_LEN(rb_s1);
  s2 = RSTRING_PTR(rb_s2);
  l2 = (int)RSTRING_LEN(rb_s2);

  /* The Levenshtein Algorithm itself. */

  /*          s1=                        */
  /*          ERIK                       */
  /*                                     */
  /*         01234                       */
  /*  s2=V   11234                       */
  /*     E   21234                       */
  /*     E   32234                       */
  /*     N   43334   <- prev_row         */
  /*     S   54444   <- curr_row         */
  /*     T   65555                       */
  /*     R   76566                       */
  /*     A   87667                       */

  /* Both rows live in one buffer: a single allocation cannot leak a
   * sibling buffer when it fails.  ALLOC_N raises NoMemoryError itself
   * rather than returning NULL, and its memory must be released with
   * xfree(), not free(). */

  rows = ALLOC_N(int, 2 * ((long)l1 + 1));
  prev_row = rows;
  curr_row = rows + l1 + 1;

  /* Initialize the current row (distance from the empty prefix of s2). */

  for (col = 0; col <= l1; col++) {
    curr_row[col] = col;
  }

  for (row = 1; row <= l2; row++) {
    /* The row just finished becomes the previous row.  Swapping the
     * pointers is enough because every cell of curr_row is rewritten
     * below. */

    swap_row = prev_row;
    prev_row = curr_row;
    curr_row = swap_row;

    curr_row[0] = row;
    curr_row_min = row;

    for (col = 1; col <= l1; col++) {
      /* Equal (cost=0) or Substitution (cost=1). */

      cell = prev_row[col-1] + ((s1[col-1] == s2[row-1]) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col] + 1 < cell) {
        cell = prev_row[col] + 1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1] + 1 < cell) {
        cell = curr_row[col-1] + 1;
      }

      curr_row[col] = cell;

      /* Keep track of the minimum value on this row. */

      if (cell < curr_row_min) {
        curr_row_min = cell;
      }
    }

    /* Return nil as soon as we reach the threshold: values in later
     * rows can never drop below the minimum of this row. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(rows);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(rows);

  /* Return the Ruby version of the result. */

  return INT2NUM(result);
}
|
117
|
+
|
118
|
+
void Init_levenshtein_c() {
|
119
|
+
VALUE mLevenshtein = rb_define_module("Levenshtein");
|
120
|
+
|
121
|
+
rb_define_singleton_method(mLevenshtein, "distance_part2_fast" , levenshtein_distance_part2, 3);
|
122
|
+
}
|
data/lib/levenshtein.rb
ADDED
@@ -0,0 +1,100 @@
|
|
1
|
+
begin
|
2
|
+
require "levenshtein/levenshtein_c"
|
3
|
+
rescue LoadError
|
4
|
+
begin
|
5
|
+
require "levenshtein_c"
|
6
|
+
rescue LoadError
|
7
|
+
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance_part2. Using the slow Ruby version instead."
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
# The Levenshtein distance is a metric for measuring the amount of difference
|
12
|
+
# between two sequences (i.e., the so-called edit distance). The Levenshtein
|
13
|
+
# distance between two strings is given by the minimum number of operations
|
14
|
+
# needed to transform one string into the other, where an operation is an
|
15
|
+
# insertion, deletion, or substitution of a single character.
|
16
|
+
#
|
17
|
+
# More information about the Levenshtein distance algorithm:
|
18
|
+
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
19
|
+
|
20
|
+
module Levenshtein
  # Returns the Levenshtein distance as a number between 0.0 and 1.0.
  # It's basically the Levenshtein distance divided by the length of the longest string.
  #
  # If +threshold+ (a number between 0.0 and 1.0) is given, nil is returned
  # when the normalized distance exceeds it.

  def self.normalized_distance(s1, s2, threshold=nil)
    s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.

    # Since s1.length <= s2.length, s1 must be empty as well.
    return 0.0	if s2.empty?

    if threshold
      # Translate the relative threshold into the smallest absolute
      # distance that is no longer acceptable.
      d	= self.distance(s1, s2, (threshold*s2.length+1).to_i)

      d ? d.to_f/s2.length : nil
    else
      self.distance(s1, s2).to_f/s2.length
    end
  end

  # Returns the Levenshtein distance between two byte strings.
  #
  # If +threshold+ is given, nil is returned as soon as it is known that
  # the distance is greater than or equal to +threshold+. Identical
  # strings always yield 0.

  def self.distance(s1, s2, threshold=nil)
    s1, s2	= s2, s1	if s1.length > s2.length	# s1 is the short one; s2 is the long one.

    # Handle some basic circumstances.

    return 0		if s1 == s2

    # The difference in length is a lower bound of the distance. This
    # check comes before the empty-string shortcut, so the threshold is
    # honoured for empty strings as well.

    return nil		if threshold and (s2.length-s1.length) >= threshold
    return s2.length	if s1.empty?

    # Characters that don't occur in the other string at all have to be
    # inserted, deleted or substituted: another lower bound.

    return nil		if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
    return nil		if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold

    # Do the expensive calculation on a subset of the strings only, if possible.

    # Skip the common head...

    b	= 0

    while b < s1.length and s1[b, 1] == s2[b, 1]
      b += 1
    end

    # ... and the common tail. The tail must not run into the head (or
    # below index 0): negative indexes count from the end of the string
    # in Ruby and would silently select the wrong characters.

    e1	= s1.length-1
    e2	= s2.length-1

    while e1 >= b and s1[e1, 1] == s2[e2, 1]
      e1 -= 1
      e2 -= 1
    end

    # Use [start, length] instead of [start..end]: an end of -1 in a
    # range would mean "up to the last character".

    distance_part2(s1[b, e1-b+1], s2[b, e2-b+1], threshold)
  end

  # Dispatches to the C implementation if it has been loaded and to the
  # Ruby implementation otherwise.

  def self.distance_part2(s1, s2, threshold)	# :nodoc:
    if respond_to?(:distance_part2_fast)
      distance_part2_fast(s1, s2, threshold)	# Implemented in C.
    else
      distance_part2_slow(s1, s2, threshold)	# Implemented in Ruby.
    end
  end

  # Plain Ruby implementation of the algorithm. Only the previous row and
  # the current row of the matrix are kept.

  def self.distance_part2_slow(s1, s2, threshold)	# :nodoc:
    row	= (0..s1.length).to_a

    1.upto(s2.length) do |y|
      prow	= row
      row	= [y]

      1.upto(s1.length) do |x|
        row[x]	= [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
      end

      # Stop analysing this string as soon as the best possible result for this string reaches the threshold.
      # (The minimum value in the next row will be equal to or greater than the minimum value in this row.)

      return nil	if threshold and row.min >= threshold
    end

    row[-1]
  end
end
|
data/test/test.rb
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "levenshtein"
|
3
|
+
|
4
|
+
# Tests for the public API: Levenshtein.distance and
# Levenshtein.normalized_distance.
class TestLevenshtein < Test::Unit::TestCase
  # The distance doesn't depend on the order of the arguments.
  def test_erik_veenstra
    pairs	= [["erik", "veenstra"], ["veenstra", "erik"]]

    pairs.each do |a, b|
      assert_equal(7, Levenshtein.distance(a, b))
    end

    pairs.each do |a, b|
      assert_in_delta(0.875, Levenshtein.normalized_distance(a, b), 0.01)
    end
  end

  # An empty string is as far away as the other string is long.
  def test_empty_string
    [[0, "", ""], [3, "", "foo"], [3, "foo", ""]].each do |expected, a, b|
      assert_equal(expected, Levenshtein.distance(a, b))
    end

    [[0.0, "", ""], [1.0, "", "foo"], [1.0, "foo", ""]].each do |expected, a, b|
      assert_in_delta(expected, Levenshtein.normalized_distance(a, b), 0.01)
    end
  end

  # Identical strings have distance zero.
  def test_same_string
    ["", "foo"].each do |s|
      assert_equal(0, Levenshtein.distance(s, s))
    end

    ["", "foo"].each do |s|
      assert_in_delta(0.0, Levenshtein.normalized_distance(s, s), 0.01)
    end
  end

  # nil is returned once the threshold is reached.
  def test_threshold
    short, long	= "foo", "foobar"

    assert_equal(3, Levenshtein.distance(short, long))
    assert_equal(nil, Levenshtein.distance(short, long, 2))

    assert_in_delta(0.5, Levenshtein.normalized_distance(short, long), 0.01)
    assert_equal(nil, Levenshtein.normalized_distance(short, long, 0.30))
  end

  # A common head and/or tail doesn't influence the distance.
  def test_same_head_and_or_tail
    cases	= [
      ["ab123cd", "abxyzcd", 0.42],
      ["ab123",   "abxyz",   0.6],
      ["123cd",   "xyzcd",   0.6],
    ]

    cases.each do |a, b, _|
      assert_equal(3, Levenshtein.distance(a, b))
    end

    cases.each do |a, b, normalized|
      assert_in_delta(normalized, Levenshtein.normalized_distance(a, b), 0.01)
    end
  end
end
|
49
|
+
|
50
|
+
# Tests for the pure Ruby implementation, called directly.
class TestLevenshteinPart2Slow < Test::Unit::TestCase
  def test_erik_veenstra
    result	= Levenshtein.distance_part2_slow("erik", "veenstra", nil)

    assert_equal(7, result)
  end

  def test_empty_string
    [[0, ""], [3, "foo"]].each do |expected, other|
      assert_equal(expected, Levenshtein.distance_part2_slow("", other, nil))
    end
  end

  def test_same_string
    ["", "foo"].each do |s|
      assert_equal(0, Levenshtein.distance_part2_slow(s, s, nil))
    end
  end

  def test_threshold
    [[3, nil], [nil, 2]].each do |expected, threshold|
      assert_equal(expected, Levenshtein.distance_part2_slow("foo", "foobar", threshold))
    end
  end
end
|
70
|
+
|
71
|
+
# Tests for the C implementation, called directly.
class TestLevenshteinPart2Fast < Test::Unit::TestCase
  def test_erik_veenstra
    result	= Levenshtein.distance_part2_fast("erik", "veenstra", nil)

    assert_equal(7, result)
  end

  def test_empty_string
    [[0, ""], [3, "foo"]].each do |expected, other|
      assert_equal(expected, Levenshtein.distance_part2_fast("", other, nil))
    end
  end

  def test_same_string
    ["", "foo"].each do |s|
      assert_equal(0, Levenshtein.distance_part2_fast(s, s, nil))
    end
  end

  def test_threshold
    [[3, nil], [nil, 2]].each do |expected, threshold|
      assert_equal(expected, Levenshtein.distance_part2_fast("foo", "foobar", threshold))
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,65 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: levenshtein
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Erik Veenstra
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-05-24 00:00:00 +02:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Calculates the Levenshtein distance between two byte strings.
|
17
|
+
email: levenshtein@erikveen.dds.nl
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/levenshtein/extconf.rb
|
22
|
+
extra_rdoc_files: []
|
23
|
+
|
24
|
+
files:
|
25
|
+
- lib/levenshtein.rb
|
26
|
+
- ext/levenshtein
|
27
|
+
- ext/levenshtein/extconf.rb
|
28
|
+
- ext/levenshtein/levenshtein_c.c
|
29
|
+
- README
|
30
|
+
- LICENSE
|
31
|
+
- VERSION
|
32
|
+
has_rdoc: true
|
33
|
+
homepage: http://www.erikveen.dds.nl/levenshtein/index.html
|
34
|
+
post_install_message:
|
35
|
+
rdoc_options:
|
36
|
+
- README
|
37
|
+
- LICENSE
|
38
|
+
- VERSION
|
39
|
+
- --title
|
40
|
+
- levenshtein (0.1.0)
|
41
|
+
- --main
|
42
|
+
- README
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "0"
|
50
|
+
version:
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: "0"
|
56
|
+
version:
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project: levenshtein
|
60
|
+
rubygems_version: 1.0.1
|
61
|
+
signing_key:
|
62
|
+
specification_version: 2
|
63
|
+
summary: Calculates the Levenshtein distance between two byte strings.
|
64
|
+
test_files:
|
65
|
+
- test/test.rb
|