levenshtein 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +5 -1
- data/README +10 -7
- data/VERSION +1 -1
- data/ext/levenshtein/levenshtein_fast.c +113 -9
- data/lib/levenshtein.rb +80 -49
- data/lib/levenshtein/version.rb +3 -1
- data/test/test.rb +56 -39
- metadata +22 -44
- data/ext/levenshtein/levenshtein_array.c +0 -130
- data/ext/levenshtein/levenshtein_array_of_strings.c +0 -130
- data/ext/levenshtein/levenshtein_generic.c +0 -133
- data/ext/levenshtein/levenshtein_string.c +0 -138
- data/lib/levenshtein/exception.rb +0 -4
data/CHANGELOG
CHANGED
data/README
CHANGED
@@ -1,12 +1,15 @@
|
|
1
|
-
The Levenshtein distance is a metric for measuring the amount
|
2
|
-
between two sequences (i.e., the so called edit
|
3
|
-
distance between two sequences is
|
4
|
-
|
1
|
+
The Levenshtein distance is a metric for measuring the amount
|
2
|
+
of difference between two sequences (i.e., the so called edit
|
3
|
+
distance). The Levenshtein distance between two sequences is
|
4
|
+
given by the minimum number of operations needed to transform
|
5
|
+
one sequence into the other, where an operation is an
|
5
6
|
insertion, deletion, or substitution of a single element.
|
6
7
|
|
7
|
-
The two sequences can be two strings, two arrays, or two other
|
8
|
-
|
9
|
-
|
8
|
+
The two sequences can be two strings, two arrays, or two other
|
9
|
+
objects responding to :each. All sequences are by generic
|
10
|
+
(fast) C code.
|
11
|
+
|
12
|
+
All objects in the sequences should respond to :hash and :eql?.
|
10
13
|
|
11
14
|
More information about the Levenshtein distance algorithm:
|
12
15
|
http://en.wikipedia.org/wiki/Levenshtein_distance .
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.2
|
@@ -2,17 +2,121 @@
|
|
2
2
|
#include "levenshtein.h"
|
3
3
|
|
4
4
|
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
5
|
+
VALUE *p1, *p2;
|
6
|
+
long l1, l2;
|
7
|
+
long col, row;
|
8
|
+
int threshold;
|
9
|
+
int *prev_row, *curr_row, *temp_row;
|
10
|
+
int curr_row_min, result;
|
11
|
+
int value1, value2;
|
12
|
+
|
13
|
+
/* Be sure that all equivalent objects in rb_o1 and rb_o2 (a.eql?(b) == true) are taken from a pool (a.equal?(b) == true). */
|
14
|
+
/* This is done in levenshtein.rb by means of Util.pool. */
|
15
|
+
|
16
|
+
/* Get the sizes of both arrays. */
|
17
|
+
|
18
|
+
l1 = RARRAY_LEN(rb_o1);
|
19
|
+
l2 = RARRAY_LEN(rb_o2);
|
20
|
+
|
21
|
+
/* Get the pointers of both arrays. */
|
22
|
+
|
23
|
+
p1 = RARRAY_PTR(rb_o1);
|
24
|
+
p2 = RARRAY_PTR(rb_o2);
|
25
|
+
|
26
|
+
/* Convert Ruby's threshold to C's threshold. */
|
27
|
+
|
28
|
+
if (!NIL_P(rb_threshold)) {
|
29
|
+
threshold = FIX2INT(rb_threshold);
|
13
30
|
} else {
|
14
|
-
|
31
|
+
threshold = -1;
|
32
|
+
}
|
33
|
+
|
34
|
+
/* The Levenshtein algorithm itself. */
|
35
|
+
|
36
|
+
/* s1= */
|
37
|
+
/* ERIK */
|
38
|
+
/* */
|
39
|
+
/* 01234 */
|
40
|
+
/* s2=V 11234 */
|
41
|
+
/* E 21234 */
|
42
|
+
/* E 32234 */
|
43
|
+
/* N 43334 <- prev_row */
|
44
|
+
/* S 54444 <- curr_row */
|
45
|
+
/* T 65555 */
|
46
|
+
/* R 76566 */
|
47
|
+
/* A 87667 */
|
48
|
+
|
49
|
+
/* Allocate memory for both rows */
|
50
|
+
|
51
|
+
prev_row = (int*) ALLOC_N(int, (l1+1));
|
52
|
+
curr_row = (int*) ALLOC_N(int, (l1+1));
|
53
|
+
|
54
|
+
/* Initialize the current row. */
|
55
|
+
|
56
|
+
for (col=0; col<=l1; col++) {
|
57
|
+
curr_row[col] = col;
|
15
58
|
}
|
59
|
+
|
60
|
+
for (row=1; row<=l2; row++) {
|
61
|
+
/* Copy the current row to the previous row. */
|
62
|
+
|
63
|
+
temp_row = prev_row;
|
64
|
+
prev_row = curr_row;
|
65
|
+
curr_row = temp_row;
|
66
|
+
|
67
|
+
/* Calculate the values of the current row. */
|
68
|
+
|
69
|
+
curr_row[0] = row;
|
70
|
+
curr_row_min = row;
|
71
|
+
|
72
|
+
for (col=1; col<=l1; col++) {
|
73
|
+
/* Equal (cost=0) or substitution (cost=1). */
|
74
|
+
|
75
|
+
value1 = prev_row[col-1] + ((p1[col-1] == p2[row-1]) ? 0 : 1);
|
76
|
+
|
77
|
+
/* Insertion if it's cheaper than substitution. */
|
78
|
+
|
79
|
+
value2 = prev_row[col]+1;
|
80
|
+
if (value2 < value1) {
|
81
|
+
value1 = value2;
|
82
|
+
}
|
83
|
+
|
84
|
+
/* Deletion if it's cheaper than substitution. */
|
85
|
+
|
86
|
+
value2 = curr_row[col-1]+1;
|
87
|
+
if (value2 < value1) {
|
88
|
+
value1 = value2;
|
89
|
+
}
|
90
|
+
|
91
|
+
/* Keep track of the minimum value on this row. */
|
92
|
+
|
93
|
+
if (value1 < curr_row_min) {
|
94
|
+
curr_row_min = value1;
|
95
|
+
}
|
96
|
+
|
97
|
+
curr_row[col] = value1;
|
98
|
+
}
|
99
|
+
|
100
|
+
/* Return nil as soon as we exceed the threshold. */
|
101
|
+
|
102
|
+
if (threshold > -1 && curr_row_min >= threshold) {
|
103
|
+
free(prev_row);
|
104
|
+
free(curr_row);
|
105
|
+
|
106
|
+
return Qnil;
|
107
|
+
}
|
108
|
+
}
|
109
|
+
|
110
|
+
/* The result is the last value on the last row. */
|
111
|
+
|
112
|
+
result = curr_row[l1];
|
113
|
+
|
114
|
+
free(prev_row);
|
115
|
+
free(curr_row);
|
116
|
+
|
117
|
+
/* Return the Ruby version of the result. */
|
118
|
+
|
119
|
+
return INT2FIX(result);
|
16
120
|
}
|
17
121
|
|
18
122
|
void Init_levenshtein_fast() {
|
data/lib/levenshtein.rb
CHANGED
@@ -1,25 +1,30 @@
|
|
1
|
-
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
2
3
|
require "levenshtein/version"
|
3
4
|
|
4
5
|
module Levenshtein
|
5
6
|
# Returns the Levenshtein distance as a number between 0.0 and
|
6
7
|
# 1.0. It's basically the Levenshtein distance divided by the
|
7
|
-
#
|
8
|
+
# size of the longest sequence.
|
8
9
|
|
9
|
-
def self.normalized_distance(a1, a2, threshold=nil)
|
10
|
-
|
10
|
+
def self.normalized_distance(a1, a2, threshold=nil, options={})
|
11
|
+
size = [a1.size, a2.size].max
|
11
12
|
|
12
|
-
if a2.
|
13
|
-
0.0
|
13
|
+
if a1.size == 0 and a2.size == 0
|
14
|
+
0.0
|
15
|
+
elsif a1.size == 0
|
16
|
+
a2.size.to_f/size
|
17
|
+
elsif a2.size == 0
|
18
|
+
a1.size.to_f/size
|
14
19
|
else
|
15
20
|
if threshold
|
16
|
-
if d = self.distance(a1, a2, (threshold*
|
17
|
-
d.to_f/
|
21
|
+
if d = self.distance(a1, a2, (threshold*size).to_i+1)
|
22
|
+
d.to_f/size
|
18
23
|
else
|
19
24
|
nil
|
20
25
|
end
|
21
26
|
else
|
22
|
-
self.distance(a1, a2).to_f/
|
27
|
+
self.distance(a1, a2).to_f/size
|
23
28
|
end
|
24
29
|
end
|
25
30
|
end
|
@@ -27,71 +32,79 @@ module Levenshtein
|
|
27
32
|
# Returns the Levenshtein distance between two sequences.
|
28
33
|
#
|
29
34
|
# The two sequences can be two strings, two arrays, or two other
|
30
|
-
# objects
|
31
|
-
#
|
32
|
-
# with generic (fast) C code.
|
35
|
+
# objects responding to :each. All sequences are by generic
|
36
|
+
# (fast) C code.
|
33
37
|
#
|
34
|
-
#
|
35
|
-
# in the sequences (as returned by []) should response to :==.
|
38
|
+
# All objects in the sequences should respond to :hash and :eql?.
|
36
39
|
|
37
|
-
def self.distance(a1, a2, threshold=nil)
|
38
|
-
a1, a2 =
|
40
|
+
def self.distance(a1, a2, threshold=nil, options={})
|
41
|
+
a1, a2 = a1.scan(/./), a2.scan(/./) if String === a1 and String === a2
|
42
|
+
a1, a2 = Util.pool(a1, a2)
|
39
43
|
|
40
44
|
# Handle some basic circumstances.
|
41
45
|
|
42
46
|
return 0 if a1 == a2
|
43
|
-
return a2.
|
47
|
+
return a2.size if a1.empty?
|
48
|
+
return a1.size if a2.empty?
|
44
49
|
|
45
50
|
if threshold
|
46
|
-
return nil if (
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan)
|
51
|
-
|
52
|
-
if a3 and a4
|
53
|
-
return nil if (a3-a4).length >= threshold
|
54
|
-
return nil if (a4-a3).length >= threshold
|
55
|
-
end
|
51
|
+
return nil if (a1.size-a2.size) >= threshold
|
52
|
+
return nil if (a2.size-a1.size) >= threshold
|
53
|
+
return nil if (a1-a2).size >= threshold
|
54
|
+
return nil if (a2-a1).size >= threshold
|
56
55
|
end
|
57
56
|
|
58
|
-
|
59
|
-
end
|
60
|
-
|
61
|
-
def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc:
|
62
|
-
if respond_to?(:distance_fast)
|
63
|
-
distance_fast(a1, a2, threshold) # Implemented in C.
|
64
|
-
else
|
65
|
-
distance_slow(a1, a2, threshold) # Implemented in Ruby.
|
66
|
-
end
|
67
|
-
end
|
57
|
+
# Remove the common prefix and the common postfix.
|
68
58
|
|
69
|
-
|
70
|
-
|
71
|
-
l2 = a2.length
|
59
|
+
l1 = a1.size
|
60
|
+
l2 = a2.size
|
72
61
|
|
73
|
-
offset
|
62
|
+
offset = 0
|
63
|
+
no_more_optimizations = true
|
74
64
|
|
75
|
-
while offset < l1 and offset < l2 and a1[offset]
|
65
|
+
while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset])
|
76
66
|
offset += 1
|
67
|
+
|
68
|
+
no_more_optimizations = false
|
77
69
|
end
|
78
70
|
|
79
|
-
while offset < l1 and offset < l2 and a1[l1-1]
|
71
|
+
while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1])
|
80
72
|
l1 -= 1
|
81
73
|
l2 -= 1
|
74
|
+
|
75
|
+
no_more_optimizations = false
|
82
76
|
end
|
83
77
|
|
84
|
-
|
85
|
-
|
78
|
+
if no_more_optimizations
|
79
|
+
distance_fast_or_slow(a1, a2, threshold, options)
|
80
|
+
else
|
81
|
+
l1 -= offset
|
82
|
+
l2 -= offset
|
83
|
+
|
84
|
+
a1 = a1[offset, l1]
|
85
|
+
a2 = a2[offset, l2]
|
86
86
|
|
87
|
-
|
87
|
+
distance(a1, a2, threshold, options)
|
88
|
+
end
|
89
|
+
end
|
88
90
|
|
89
|
-
|
91
|
+
def self.distance_fast_or_slow(a1, a2, threshold, options) # :nodoc:
|
92
|
+
if respond_to?(:distance_fast) and options[:force_slow]
|
93
|
+
distance_fast(a1, a2, threshold) # Implemented in C.
|
94
|
+
else
|
95
|
+
distance_slow(a1, a2, threshold) # Implemented in Ruby.
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
def self.distance_slow(a1, a2, threshold) # :nodoc:
|
100
|
+
crow = (0..a1.size).to_a
|
101
|
+
|
102
|
+
1.upto(a2.size) do |y|
|
90
103
|
prow = crow
|
91
104
|
crow = [y]
|
92
105
|
|
93
|
-
1.upto(
|
94
|
-
crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[
|
106
|
+
1.upto(a1.size) do |x|
|
107
|
+
crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min
|
95
108
|
end
|
96
109
|
|
97
110
|
# Stop analysing this sequence as soon as the best possible
|
@@ -104,6 +117,24 @@ module Levenshtein
|
|
104
117
|
|
105
118
|
crow[-1]
|
106
119
|
end
|
120
|
+
|
121
|
+
module Util # :nodoc:
|
122
|
+
def self.pool(*args)
|
123
|
+
# So we can compare pointers instead of objects (equal?() instead of ==()).
|
124
|
+
|
125
|
+
pool = {}
|
126
|
+
|
127
|
+
args.collect do |arg|
|
128
|
+
a = []
|
129
|
+
|
130
|
+
arg.each do |o|
|
131
|
+
a << pool[o] ||= o
|
132
|
+
end
|
133
|
+
|
134
|
+
a
|
135
|
+
end
|
136
|
+
end
|
137
|
+
end
|
107
138
|
end
|
108
139
|
|
109
140
|
begin
|
data/lib/levenshtein/version.rb
CHANGED
data/test/test.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
|
1
4
|
require "test/unit"
|
2
5
|
require "levenshtein"
|
3
6
|
|
@@ -7,14 +10,10 @@ module Levenshtein
|
|
7
10
|
@sequence = o
|
8
11
|
end
|
9
12
|
|
10
|
-
def
|
11
|
-
@sequence.length
|
12
|
-
|
13
|
-
|
14
|
-
def [](pos)
|
15
|
-
raise "type not allowed [#{pos.inspect}]" unless pos.kind_of?(Fixnum)
|
16
|
-
|
17
|
-
@sequence[pos]
|
13
|
+
def each
|
14
|
+
@sequence.length.times do |pos|
|
15
|
+
yield(@sequence[pos])
|
16
|
+
end
|
18
17
|
end
|
19
18
|
end
|
20
19
|
|
@@ -24,14 +23,18 @@ module Levenshtein
|
|
24
23
|
def initialize(o)
|
25
24
|
@object = o
|
26
25
|
end
|
27
|
-
|
28
|
-
def
|
29
|
-
@object
|
26
|
+
|
27
|
+
def hash
|
28
|
+
@object.hash
|
29
|
+
end
|
30
|
+
|
31
|
+
def eql?(other)
|
32
|
+
@object.eql?(other.object)
|
30
33
|
end
|
31
34
|
end
|
32
35
|
end
|
33
36
|
|
34
|
-
class
|
37
|
+
class TestLevenshtein < Test::Unit::TestCase
|
35
38
|
def test_erik_veenstra
|
36
39
|
assert_equal(7, Levenshtein.distance("erik", "veenstra"))
|
37
40
|
assert_equal(7, Levenshtein.distance("veenstra", "erik"))
|
@@ -79,59 +82,73 @@ class TestLevenshteinString < Test::Unit::TestCase
|
|
79
82
|
assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
|
80
83
|
assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
|
81
84
|
end
|
82
|
-
end
|
83
85
|
|
84
|
-
|
85
|
-
|
86
|
-
|
86
|
+
def test_interface
|
87
|
+
seq1 = Levenshtein::TestSequence.new("erik".scan(/./).collect{|e| Levenshtein::TestElement.new(e)})
|
88
|
+
seq2 = Levenshtein::TestSequence.new("veenstra".scan(/./).collect{|e| Levenshtein::TestElement.new(e)})
|
87
89
|
|
88
|
-
assert_equal(7, Levenshtein.distance(
|
90
|
+
assert_equal(7, Levenshtein.distance(seq1, seq2))
|
89
91
|
end
|
90
92
|
end
|
91
93
|
|
92
|
-
class
|
94
|
+
class TestLevenshteinFast < Test::Unit::TestCase
|
93
95
|
def test_erik_veenstra
|
94
|
-
|
96
|
+
assert_equal(7, Levenshtein.distance("erik", "veenstra", nil, :force_slow=>false))
|
97
|
+
assert_equal(7, Levenshtein.distance("veenstra", "erik", nil, :force_slow=>false))
|
98
|
+
end
|
95
99
|
|
96
|
-
|
100
|
+
def test_empty_string
|
101
|
+
assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>false))
|
102
|
+
assert_equal(3, Levenshtein.distance("", "foo", nil, :force_slow=>false))
|
103
|
+
assert_equal(3, Levenshtein.distance("foo", "", nil, :force_slow=>false))
|
97
104
|
end
|
98
|
-
end
|
99
105
|
|
100
|
-
|
101
|
-
|
102
|
-
|
106
|
+
def test_same_string
|
107
|
+
assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>false))
|
108
|
+
assert_equal(0, Levenshtein.distance("foo", "foo", nil, :force_slow=>false))
|
109
|
+
end
|
103
110
|
|
104
|
-
|
111
|
+
def test_threshold
|
112
|
+
assert_equal(3, Levenshtein.distance("foo", "foobar", nil, :force_slow=>false))
|
113
|
+
assert_equal(3, Levenshtein.distance("foo", "foobar", 4, :force_slow=>false))
|
114
|
+
assert_equal(nil, Levenshtein.distance("foo", "foobar", 2, :force_slow=>false))
|
115
|
+
end
|
116
|
+
|
117
|
+
def test_same_head_and_or_tail
|
118
|
+
assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd", nil, :force_slow=>false))
|
119
|
+
assert_equal(3, Levenshtein.distance("ab123", "abxyz", nil, :force_slow=>false))
|
120
|
+
assert_equal(3, Levenshtein.distance("123cd", "xyzcd", nil, :force_slow=>false))
|
121
|
+
assert_equal(5, Levenshtein.distance("123cd123", "123", nil, :force_slow=>false))
|
105
122
|
end
|
106
123
|
end
|
107
124
|
|
108
125
|
class TestLevenshteinSlow < Test::Unit::TestCase
|
109
126
|
def test_erik_veenstra
|
110
|
-
assert_equal(7, Levenshtein.
|
111
|
-
assert_equal(7, Levenshtein.
|
127
|
+
assert_equal(7, Levenshtein.distance("erik", "veenstra", nil, :force_slow=>true))
|
128
|
+
assert_equal(7, Levenshtein.distance("veenstra", "erik", nil, :force_slow=>true))
|
112
129
|
end
|
113
130
|
|
114
131
|
def test_empty_string
|
115
|
-
assert_equal(0, Levenshtein.
|
116
|
-
assert_equal(3, Levenshtein.
|
117
|
-
assert_equal(3, Levenshtein.
|
132
|
+
assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>true))
|
133
|
+
assert_equal(3, Levenshtein.distance("", "foo", nil, :force_slow=>true))
|
134
|
+
assert_equal(3, Levenshtein.distance("foo", "", nil, :force_slow=>true))
|
118
135
|
end
|
119
136
|
|
120
137
|
def test_same_string
|
121
|
-
assert_equal(0, Levenshtein.
|
122
|
-
assert_equal(0, Levenshtein.
|
138
|
+
assert_equal(0, Levenshtein.distance("", "", nil, :force_slow=>true))
|
139
|
+
assert_equal(0, Levenshtein.distance("foo", "foo", nil, :force_slow=>true))
|
123
140
|
end
|
124
141
|
|
125
142
|
def test_threshold
|
126
|
-
assert_equal(3, Levenshtein.
|
127
|
-
assert_equal(3, Levenshtein.
|
128
|
-
assert_equal(nil, Levenshtein.
|
143
|
+
assert_equal(3, Levenshtein.distance("foo", "foobar", nil, :force_slow=>true))
|
144
|
+
assert_equal(3, Levenshtein.distance("foo", "foobar", 4, :force_slow=>true))
|
145
|
+
assert_equal(nil, Levenshtein.distance("foo", "foobar", 2, :force_slow=>true))
|
129
146
|
end
|
130
147
|
|
131
148
|
def test_same_head_and_or_tail
|
132
|
-
assert_equal(3, Levenshtein.
|
133
|
-
assert_equal(3, Levenshtein.
|
134
|
-
assert_equal(3, Levenshtein.
|
135
|
-
assert_equal(5, Levenshtein.
|
149
|
+
assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd", nil, :force_slow=>true))
|
150
|
+
assert_equal(3, Levenshtein.distance("ab123", "abxyz", nil, :force_slow=>true))
|
151
|
+
assert_equal(3, Levenshtein.distance("123cd", "xyzcd", nil, :force_slow=>true))
|
152
|
+
assert_equal(5, Levenshtein.distance("123cd123", "123", nil, :force_slow=>true))
|
136
153
|
end
|
137
154
|
end
|
metadata
CHANGED
@@ -1,41 +1,27 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: levenshtein
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.2
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 2
|
9
|
-
- 1
|
10
|
-
version: 0.2.1
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Erik Veenstra
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
date: 2012-02-11 00:00:00 Z
|
12
|
+
date: 2012-03-16 00:00:00.000000000 Z
|
19
13
|
dependencies: []
|
20
|
-
|
21
14
|
description: Calculates the Levenshtein distance between two byte strings.
|
22
15
|
email: levenshtein@erikveen.dds.nl
|
23
16
|
executables: []
|
24
|
-
|
25
|
-
extensions:
|
17
|
+
extensions:
|
26
18
|
- ext/levenshtein/extconf.rb
|
27
19
|
extra_rdoc_files: []
|
28
|
-
|
29
|
-
files:
|
30
|
-
- lib/levenshtein/exception.rb
|
20
|
+
files:
|
31
21
|
- lib/levenshtein/version.rb
|
32
22
|
- lib/levenshtein.rb
|
33
|
-
- ext/levenshtein/levenshtein_string.c
|
34
|
-
- ext/levenshtein/levenshtein_generic.c
|
35
23
|
- ext/levenshtein/levenshtein.h
|
36
24
|
- ext/levenshtein/levenshtein_fast.c
|
37
|
-
- ext/levenshtein/levenshtein_array_of_strings.c
|
38
|
-
- ext/levenshtein/levenshtein_array.c
|
39
25
|
- ext/levenshtein/extconf.rb
|
40
26
|
- README
|
41
27
|
- LICENSE
|
@@ -44,43 +30,35 @@ files:
|
|
44
30
|
- test/test.rb
|
45
31
|
homepage: http://www.erikveen.dds.nl/levenshtein/index.html
|
46
32
|
licenses: []
|
47
|
-
|
48
33
|
post_install_message:
|
49
|
-
rdoc_options:
|
34
|
+
rdoc_options:
|
50
35
|
- README
|
51
36
|
- LICENSE
|
52
37
|
- VERSION
|
53
38
|
- CHANGELOG
|
54
39
|
- --title
|
55
|
-
- levenshtein (0.2.
|
40
|
+
- levenshtein (0.2.2)
|
56
41
|
- --main
|
57
42
|
- README
|
58
|
-
require_paths:
|
43
|
+
require_paths:
|
59
44
|
- lib
|
60
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
46
|
none: false
|
62
|
-
requirements:
|
63
|
-
- -
|
64
|
-
- !ruby/object:Gem::Version
|
65
|
-
|
66
|
-
|
67
|
-
- 0
|
68
|
-
version: "0"
|
69
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
52
|
none: false
|
71
|
-
requirements:
|
72
|
-
- -
|
73
|
-
- !ruby/object:Gem::Version
|
74
|
-
|
75
|
-
segments:
|
76
|
-
- 0
|
77
|
-
version: "0"
|
53
|
+
requirements:
|
54
|
+
- - ! '>='
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0'
|
78
57
|
requirements: []
|
79
|
-
|
80
58
|
rubyforge_project: levenshtein
|
81
|
-
rubygems_version: 1.8.
|
59
|
+
rubygems_version: 1.8.18
|
82
60
|
signing_key:
|
83
61
|
specification_version: 3
|
84
62
|
summary: Calculates the Levenshtein distance between two byte strings.
|
85
|
-
test_files:
|
63
|
+
test_files:
|
86
64
|
- test/test.rb
|
@@ -1,130 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "levenshtein.h"
|
3
|
-
|
4
|
-
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
|
-
int threshold;
|
6
|
-
int l1, l2;
|
7
|
-
int *prev_row, *curr_row, *temp_row;
|
8
|
-
int col, row;
|
9
|
-
int curr_row_min, result;
|
10
|
-
int offset;
|
11
|
-
int value1, value2;
|
12
|
-
|
13
|
-
/* Get the sizes of both arrays. */
|
14
|
-
|
15
|
-
l1 = RARRAY_LEN(rb_o1);
|
16
|
-
l2 = RARRAY_LEN(rb_o2);
|
17
|
-
|
18
|
-
/* Convert Ruby's threshold to C's threshold. */
|
19
|
-
|
20
|
-
if (!NIL_P(rb_threshold)) {
|
21
|
-
threshold = FIX2INT(rb_threshold);
|
22
|
-
} else {
|
23
|
-
threshold = -1;
|
24
|
-
}
|
25
|
-
|
26
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
27
|
-
|
28
|
-
offset = 0;
|
29
|
-
|
30
|
-
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
|
31
|
-
offset++;
|
32
|
-
}
|
33
|
-
|
34
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
35
|
-
|
36
|
-
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
|
37
|
-
l1--;
|
38
|
-
l2--;
|
39
|
-
}
|
40
|
-
|
41
|
-
l1 -= offset;
|
42
|
-
l2 -= offset;
|
43
|
-
|
44
|
-
/* The Levenshtein algorithm itself. */
|
45
|
-
|
46
|
-
/* s1= */
|
47
|
-
/* ERIK */
|
48
|
-
/* */
|
49
|
-
/* 01234 */
|
50
|
-
/* s2=V 11234 */
|
51
|
-
/* E 21234 */
|
52
|
-
/* E 32234 */
|
53
|
-
/* N 43334 <- prev_row */
|
54
|
-
/* S 54444 <- curr_row */
|
55
|
-
/* T 65555 */
|
56
|
-
/* R 76566 */
|
57
|
-
/* A 87667 */
|
58
|
-
|
59
|
-
/* Allocate memory for both rows */
|
60
|
-
|
61
|
-
prev_row = (int*) ALLOC_N(int, (l1+1));
|
62
|
-
curr_row = (int*) ALLOC_N(int, (l1+1));
|
63
|
-
|
64
|
-
/* Initialize the current row. */
|
65
|
-
|
66
|
-
for (col=0; col<=l1; col++) {
|
67
|
-
curr_row[col] = col;
|
68
|
-
}
|
69
|
-
|
70
|
-
for (row=1; row<=l2; row++) {
|
71
|
-
/* Copy the current row to the previous row. */
|
72
|
-
|
73
|
-
temp_row = prev_row;
|
74
|
-
prev_row = curr_row;
|
75
|
-
curr_row = temp_row;
|
76
|
-
|
77
|
-
/* Calculate the values of the current row. */
|
78
|
-
|
79
|
-
curr_row[0] = row;
|
80
|
-
curr_row_min = row;
|
81
|
-
|
82
|
-
for (col=1; col<=l1; col++) {
|
83
|
-
/* Equal (cost=0) or substitution (cost=1). */
|
84
|
-
|
85
|
-
value1 = prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
|
86
|
-
|
87
|
-
/* Insertion if it's cheaper than substitution. */
|
88
|
-
|
89
|
-
value2 = prev_row[col]+1;
|
90
|
-
if (value2 < value1) {
|
91
|
-
value1 = value2;
|
92
|
-
}
|
93
|
-
|
94
|
-
/* Deletion if it's cheaper than substitution. */
|
95
|
-
|
96
|
-
value2 = curr_row[col-1]+1;
|
97
|
-
if (value2 < value1) {
|
98
|
-
value1 = value2;
|
99
|
-
}
|
100
|
-
|
101
|
-
/* Keep track of the minimum value on this row. */
|
102
|
-
|
103
|
-
if (value1 < curr_row_min) {
|
104
|
-
curr_row_min = value1;
|
105
|
-
}
|
106
|
-
|
107
|
-
curr_row[col] = value1;
|
108
|
-
}
|
109
|
-
|
110
|
-
/* Return nil as soon as we exceed the threshold. */
|
111
|
-
|
112
|
-
if (threshold > -1 && curr_row_min >= threshold) {
|
113
|
-
free(prev_row);
|
114
|
-
free(curr_row);
|
115
|
-
|
116
|
-
return Qnil;
|
117
|
-
}
|
118
|
-
}
|
119
|
-
|
120
|
-
/* The result is the last value on the last row. */
|
121
|
-
|
122
|
-
result = curr_row[l1];
|
123
|
-
|
124
|
-
free(prev_row);
|
125
|
-
free(curr_row);
|
126
|
-
|
127
|
-
/* Return the Ruby version of the result. */
|
128
|
-
|
129
|
-
return INT2FIX(result);
|
130
|
-
}
|
@@ -1,130 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "levenshtein.h"
|
3
|
-
|
4
|
-
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
|
-
int threshold;
|
6
|
-
int l1, l2;
|
7
|
-
int *prev_row, *curr_row, *temp_row;
|
8
|
-
int col, row;
|
9
|
-
int curr_row_min, result;
|
10
|
-
int offset;
|
11
|
-
int value1, value2;
|
12
|
-
|
13
|
-
/* Get the sizes of both arrays. */
|
14
|
-
|
15
|
-
l1 = RARRAY_LEN(rb_o1);
|
16
|
-
l2 = RARRAY_LEN(rb_o2);
|
17
|
-
|
18
|
-
/* Convert Ruby's threshold to C's threshold. */
|
19
|
-
|
20
|
-
if (!NIL_P(rb_threshold)) {
|
21
|
-
threshold = FIX2INT(rb_threshold);
|
22
|
-
} else {
|
23
|
-
threshold = -1;
|
24
|
-
}
|
25
|
-
|
26
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
27
|
-
|
28
|
-
offset = 0;
|
29
|
-
|
30
|
-
while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
|
31
|
-
offset++;
|
32
|
-
}
|
33
|
-
|
34
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
35
|
-
|
36
|
-
while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
|
37
|
-
l1--;
|
38
|
-
l2--;
|
39
|
-
}
|
40
|
-
|
41
|
-
l1 -= offset;
|
42
|
-
l2 -= offset;
|
43
|
-
|
44
|
-
/* The Levenshtein algorithm itself. */
|
45
|
-
|
46
|
-
/* s1= */
|
47
|
-
/* ERIK */
|
48
|
-
/* */
|
49
|
-
/* 01234 */
|
50
|
-
/* s2=V 11234 */
|
51
|
-
/* E 21234 */
|
52
|
-
/* E 32234 */
|
53
|
-
/* N 43334 <- prev_row */
|
54
|
-
/* S 54444 <- curr_row */
|
55
|
-
/* T 65555 */
|
56
|
-
/* R 76566 */
|
57
|
-
/* A 87667 */
|
58
|
-
|
59
|
-
/* Allocate memory for both rows */
|
60
|
-
|
61
|
-
prev_row = (int*) ALLOC_N(int, (l1+1));
|
62
|
-
curr_row = (int*) ALLOC_N(int, (l1+1));
|
63
|
-
|
64
|
-
/* Initialize the current row. */
|
65
|
-
|
66
|
-
for (col=0; col<=l1; col++) {
|
67
|
-
curr_row[col] = col;
|
68
|
-
}
|
69
|
-
|
70
|
-
for (row=1; row<=l2; row++) {
|
71
|
-
/* Copy the current row to the previous row. */
|
72
|
-
|
73
|
-
temp_row = prev_row;
|
74
|
-
prev_row = curr_row;
|
75
|
-
curr_row = temp_row;
|
76
|
-
|
77
|
-
/* Calculate the values of the current row. */
|
78
|
-
|
79
|
-
curr_row[0] = row;
|
80
|
-
curr_row_min = row;
|
81
|
-
|
82
|
-
for (col=1; col<=l1; col++) {
|
83
|
-
/* Equal (cost=0) or substitution (cost=1). */
|
84
|
-
|
85
|
-
value1 = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
|
86
|
-
|
87
|
-
/* Insertion if it's cheaper than substitution. */
|
88
|
-
|
89
|
-
value2 = prev_row[col]+1;
|
90
|
-
if (value2 < value1) {
|
91
|
-
value1 = value2;
|
92
|
-
}
|
93
|
-
|
94
|
-
/* Deletion if it's cheaper than substitution. */
|
95
|
-
|
96
|
-
value2 = curr_row[col-1]+1;
|
97
|
-
if (value2 < value1) {
|
98
|
-
value1 = value2;
|
99
|
-
}
|
100
|
-
|
101
|
-
/* Keep track of the minimum value on this row. */
|
102
|
-
|
103
|
-
if (value1 < curr_row_min) {
|
104
|
-
curr_row_min = value1;
|
105
|
-
}
|
106
|
-
|
107
|
-
curr_row[col] = value1;
|
108
|
-
}
|
109
|
-
|
110
|
-
/* Return nil as soon as we exceed the threshold. */
|
111
|
-
|
112
|
-
if (threshold > -1 && curr_row_min >= threshold) {
|
113
|
-
free(prev_row);
|
114
|
-
free(curr_row);
|
115
|
-
|
116
|
-
return Qnil;
|
117
|
-
}
|
118
|
-
}
|
119
|
-
|
120
|
-
/* The result is the last value on the last row. */
|
121
|
-
|
122
|
-
result = curr_row[l1];
|
123
|
-
|
124
|
-
free(prev_row);
|
125
|
-
free(curr_row);
|
126
|
-
|
127
|
-
/* Return the Ruby version of the result. */
|
128
|
-
|
129
|
-
return INT2FIX(result);
|
130
|
-
}
|
@@ -1,133 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "levenshtein.h"
|
3
|
-
|
4
|
-
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
|
-
int threshold;
|
6
|
-
int l1, l2;
|
7
|
-
int *prev_row, *curr_row, *temp_row;
|
8
|
-
int col, row;
|
9
|
-
int curr_row_min, result;
|
10
|
-
int offset;
|
11
|
-
int value1, value2;
|
12
|
-
|
13
|
-
ID id_length = rb_intern("length");
|
14
|
-
ID id_get = rb_intern("[]");
|
15
|
-
|
16
|
-
/* Get the sizes of both sequences. */
|
17
|
-
|
18
|
-
l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
|
19
|
-
l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
|
20
|
-
|
21
|
-
/* Convert Ruby's threshold to C's threshold. */
|
22
|
-
|
23
|
-
if (!NIL_P(rb_threshold)) {
|
24
|
-
threshold = FIX2INT(rb_threshold);
|
25
|
-
} else {
|
26
|
-
threshold = -1;
|
27
|
-
}
|
28
|
-
|
29
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
30
|
-
|
31
|
-
offset = 0;
|
32
|
-
|
33
|
-
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
|
34
|
-
offset++;
|
35
|
-
}
|
36
|
-
|
37
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
38
|
-
|
39
|
-
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
|
40
|
-
l1--;
|
41
|
-
l2--;
|
42
|
-
}
|
43
|
-
|
44
|
-
l1 -= offset;
|
45
|
-
l2 -= offset;
|
46
|
-
|
47
|
-
/* The Levenshtein algorithm itself. */
|
48
|
-
|
49
|
-
/* s1= */
|
50
|
-
/* ERIK */
|
51
|
-
/* */
|
52
|
-
/* 01234 */
|
53
|
-
/* s2=V 11234 */
|
54
|
-
/* E 21234 */
|
55
|
-
/* E 32234 */
|
56
|
-
/* N 43334 <- prev_row */
|
57
|
-
/* S 54444 <- curr_row */
|
58
|
-
/* T 65555 */
|
59
|
-
/* R 76566 */
|
60
|
-
/* A 87667 */
|
61
|
-
|
62
|
-
/* Allocate memory for both rows */
|
63
|
-
|
64
|
-
prev_row = (int*) ALLOC_N(int, (l1+1));
|
65
|
-
curr_row = (int*) ALLOC_N(int, (l1+1));
|
66
|
-
|
67
|
-
/* Initialize the current row. */
|
68
|
-
|
69
|
-
for (col=0; col<=l1; col++) {
|
70
|
-
curr_row[col] = col;
|
71
|
-
}
|
72
|
-
|
73
|
-
for (row=1; row<=l2; row++) {
|
74
|
-
/* Copy the current row to the previous row. */
|
75
|
-
|
76
|
-
temp_row = prev_row;
|
77
|
-
prev_row = curr_row;
|
78
|
-
curr_row = temp_row;
|
79
|
-
|
80
|
-
/* Calculate the values of the current row. */
|
81
|
-
|
82
|
-
curr_row[0] = row;
|
83
|
-
curr_row_min = row;
|
84
|
-
|
85
|
-
for (col=1; col<=l1; col++) {
|
86
|
-
/* Equal (cost=0) or substitution (cost=1). */
|
87
|
-
|
88
|
-
value1 = prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
|
89
|
-
|
90
|
-
/* Insertion if it's cheaper than substitution. */
|
91
|
-
|
92
|
-
value2 = prev_row[col]+1;
|
93
|
-
if (value2 < value1) {
|
94
|
-
value1 = value2;
|
95
|
-
}
|
96
|
-
|
97
|
-
/* Deletion if it's cheaper than substitution. */
|
98
|
-
|
99
|
-
value2 = curr_row[col-1]+1;
|
100
|
-
if (value2 < value1) {
|
101
|
-
value1 = value2;
|
102
|
-
}
|
103
|
-
|
104
|
-
/* Keep track of the minimum value on this row. */
|
105
|
-
|
106
|
-
if (value1 < curr_row_min) {
|
107
|
-
curr_row_min = value1;
|
108
|
-
}
|
109
|
-
|
110
|
-
curr_row[col] = value1;
|
111
|
-
}
|
112
|
-
|
113
|
-
/* Return nil as soon as we exceed the threshold. */
|
114
|
-
|
115
|
-
if (threshold > -1 && curr_row_min >= threshold) {
|
116
|
-
free(prev_row);
|
117
|
-
free(curr_row);
|
118
|
-
|
119
|
-
return Qnil;
|
120
|
-
}
|
121
|
-
}
|
122
|
-
|
123
|
-
/* The result is the last value on the last row. */
|
124
|
-
|
125
|
-
result = curr_row[l1];
|
126
|
-
|
127
|
-
free(prev_row);
|
128
|
-
free(curr_row);
|
129
|
-
|
130
|
-
/* Return the Ruby version of the result. */
|
131
|
-
|
132
|
-
return INT2FIX(result);
|
133
|
-
}
|
@@ -1,138 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "levenshtein.h"
|
3
|
-
|
4
|
-
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
|
-
int threshold;
|
6
|
-
int l1, l2;
|
7
|
-
int *prev_row, *curr_row, *temp_row;
|
8
|
-
int col, row;
|
9
|
-
int curr_row_min, result;
|
10
|
-
int offset;
|
11
|
-
int value1, value2;
|
12
|
-
char *s1, *s2;
|
13
|
-
|
14
|
-
/* Convert Ruby's s1 to C's s1. */
|
15
|
-
|
16
|
-
rb_o1 = StringValue(rb_o1);
|
17
|
-
s1 = RSTRING_PTR(rb_o1);
|
18
|
-
l1 = RSTRING_LEN(rb_o1);
|
19
|
-
|
20
|
-
/* Convert Ruby's s2 to C's s2. */
|
21
|
-
|
22
|
-
rb_o2 = StringValue(rb_o2);
|
23
|
-
s2 = RSTRING_PTR(rb_o2);
|
24
|
-
l2 = RSTRING_LEN(rb_o2);
|
25
|
-
|
26
|
-
/* Convert Ruby's threshold to C's threshold. */
|
27
|
-
|
28
|
-
if (!NIL_P(rb_threshold)) {
|
29
|
-
threshold = FIX2INT(rb_threshold);
|
30
|
-
} else {
|
31
|
-
threshold = -1;
|
32
|
-
}
|
33
|
-
|
34
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
35
|
-
|
36
|
-
offset = 0;
|
37
|
-
|
38
|
-
while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
|
39
|
-
offset++;
|
40
|
-
}
|
41
|
-
|
42
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
43
|
-
|
44
|
-
while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
|
45
|
-
l1--;
|
46
|
-
l2--;
|
47
|
-
}
|
48
|
-
|
49
|
-
l1 -= offset;
|
50
|
-
l2 -= offset;
|
51
|
-
|
52
|
-
/* The Levenshtein algorithm itself. */
|
53
|
-
|
54
|
-
/* s1= */
|
55
|
-
/* ERIK */
|
56
|
-
/* */
|
57
|
-
/* 01234 */
|
58
|
-
/* s2=V 11234 */
|
59
|
-
/* E 21234 */
|
60
|
-
/* E 32234 */
|
61
|
-
/* N 43334 <- prev_row */
|
62
|
-
/* S 54444 <- curr_row */
|
63
|
-
/* T 65555 */
|
64
|
-
/* R 76566 */
|
65
|
-
/* A 87667 */
|
66
|
-
|
67
|
-
/* Allocate memory for both rows */
|
68
|
-
|
69
|
-
prev_row = (int*) ALLOC_N(int, (l1+1));
|
70
|
-
curr_row = (int*) ALLOC_N(int, (l1+1));
|
71
|
-
|
72
|
-
/* Initialize the current row. */
|
73
|
-
|
74
|
-
for (col=0; col<=l1; col++) {
|
75
|
-
curr_row[col] = col;
|
76
|
-
}
|
77
|
-
|
78
|
-
for (row=1; row<=l2; row++) {
|
79
|
-
/* Copy the current row to the previous row. */
|
80
|
-
|
81
|
-
temp_row = prev_row;
|
82
|
-
prev_row = curr_row;
|
83
|
-
curr_row = temp_row;
|
84
|
-
|
85
|
-
/* Calculate the values of the current row. */
|
86
|
-
|
87
|
-
curr_row[0] = row;
|
88
|
-
curr_row_min = row;
|
89
|
-
|
90
|
-
for (col=1; col<=l1; col++) {
|
91
|
-
/* Equal (cost=0) or substitution (cost=1). */
|
92
|
-
|
93
|
-
value1 = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
|
94
|
-
|
95
|
-
/* Insertion if it's cheaper than substitution. */
|
96
|
-
|
97
|
-
value2 = prev_row[col]+1;
|
98
|
-
if (value2 < value1) {
|
99
|
-
value1 = value2;
|
100
|
-
}
|
101
|
-
|
102
|
-
/* Deletion if it's cheaper than substitution. */
|
103
|
-
|
104
|
-
value2 = curr_row[col-1]+1;
|
105
|
-
if (value2 < value1) {
|
106
|
-
value1 = value2;
|
107
|
-
}
|
108
|
-
|
109
|
-
/* Keep track of the minimum value on this row. */
|
110
|
-
|
111
|
-
if (value1 < curr_row_min) {
|
112
|
-
curr_row_min = value1;
|
113
|
-
}
|
114
|
-
|
115
|
-
curr_row[col] = value1;
|
116
|
-
}
|
117
|
-
|
118
|
-
/* Return nil as soon as we exceed the threshold. */
|
119
|
-
|
120
|
-
if (threshold > -1 && curr_row_min >= threshold) {
|
121
|
-
free(prev_row);
|
122
|
-
free(curr_row);
|
123
|
-
|
124
|
-
return Qnil;
|
125
|
-
}
|
126
|
-
}
|
127
|
-
|
128
|
-
/* The result is the last value on the last row. */
|
129
|
-
|
130
|
-
result = curr_row[l1];
|
131
|
-
|
132
|
-
free(prev_row);
|
133
|
-
free(curr_row);
|
134
|
-
|
135
|
-
/* Return the Ruby version of the result. */
|
136
|
-
|
137
|
-
return INT2FIX(result);
|
138
|
-
}
|