levenshtein 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +5 -1
- data/README +10 -7
- data/VERSION +1 -1
- data/ext/levenshtein/levenshtein_fast.c +113 -9
- data/lib/levenshtein.rb +80 -49
- data/lib/levenshtein/version.rb +3 -1
- data/test/test.rb +56 -39
- metadata +22 -44
- data/ext/levenshtein/levenshtein_array.c +0 -130
- data/ext/levenshtein/levenshtein_array_of_strings.c +0 -130
- data/ext/levenshtein/levenshtein_generic.c +0 -133
- data/ext/levenshtein/levenshtein_string.c +0 -138
- data/lib/levenshtein/exception.rb +0 -4
data/CHANGELOG
CHANGED
data/README
CHANGED
@@ -1,12 +1,15 @@
|
|
1
|
-
The Levenshtein distance is a metric for measuring the amount
|
2
|
-
between two sequences (i.e., the so called edit
|
3
|
-
distance between two sequences is
|
4
|
-
|
1
|
+
The Levenshtein distance is a metric for measuring the amount
|
2
|
+
of difference between two sequences (i.e., the so called edit
|
3
|
+
distance). The Levenshtein distance between two sequences is
|
4
|
+
given by the minimum number of operations needed to transform
|
5
|
+
one sequence into the other, where an operation is an
|
5
6
|
insertion, deletion, or substitution of a single element.
|
6
7
|
|
7
|
-
The two sequences can be two strings, two arrays, or two other
|
8
|
-
|
9
|
-
|
8
|
+
The two sequences can be two strings, two arrays, or two other
|
9
|
+
objects responding to :each. All sequences are handled by generic
|
10
|
+
(fast) C code.
|
11
|
+
|
12
|
+
All objects in the sequences should respond to :hash and :eql?.
|
10
13
|
|
11
14
|
More information about the Levenshtein distance algorithm:
|
12
15
|
http://en.wikipedia.org/wiki/Levenshtein_distance .
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.2
|
@@ -2,17 +2,121 @@
|
|
2
2
|
#include "levenshtein.h"
|
3
3
|
|
4
4
|
/*
 * Computes the Levenshtein (edit) distance between two Ruby Arrays.
 *
 * rb_o1, rb_o2  Ruby Arrays. Elements are compared by identity (VALUE
 *               equality), so all equivalent objects (a.eql?(b) == true)
 *               must already be taken from a pool (a.equal?(b) == true).
 *               This is done in levenshtein.rb by means of Util.pool.
 * rb_threshold  nil for "no limit", otherwise an Integer. The calculation is
 *               abandoned as soon as the smallest value on a row is greater
 *               than or equal to the threshold.
 *
 * Returns the distance as a Ruby Integer, or nil when the threshold is
 * reached. Raises TypeError when rb_o1 or rb_o2 is not an Array, or when
 * rb_threshold cannot be converted to an int.
 */
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  VALUE *p1, *p2;
  long l1, l2;
  long col, row;
  int threshold;
  int *rows, *prev_row, *curr_row, *temp_row;
  int curr_row_min, result;
  int value1, value2;

  /* Convert Ruby's threshold to C's threshold. */
  /* Done first: the conversion may raise, and nothing is allocated yet. */

  if (!NIL_P(rb_threshold)) {
    threshold = NUM2INT(rb_threshold);
  } else {
    threshold = -1;
  }

  /* RARRAY_LEN/RARRAY_PTR are only valid on real Arrays. */

  Check_Type(rb_o1, T_ARRAY);
  Check_Type(rb_o2, T_ARRAY);

  /* Get the sizes of both arrays. */

  l1 = RARRAY_LEN(rb_o1);
  l2 = RARRAY_LEN(rb_o2);

  /* The Levenshtein algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate memory for both rows in one chunk. A single allocation means */
  /* there is nothing to leak if the allocator raises NoMemoryError.       */

  rows = ALLOC_N(int, 2*(l1+1));
  prev_row = rows;
  curr_row = rows + (l1+1);

  /* Get the pointers of both arrays (after the last call that can run Ruby code or the GC). */

  p1 = RARRAY_PTR(rb_o1);
  p2 = RARRAY_PTR(rb_o2);

  /* Initialize the current row. */
  /* NOTE: cell values are stored as int; sequences are assumed to be shorter than INT_MAX. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = (int) col;
  }

  for (row=1; row<=l2; row++) {
    /* Swap the rows: the current row becomes the previous row. */

    temp_row = prev_row;
    prev_row = curr_row;
    curr_row = temp_row;

    /* Calculate the values of the current row. */

    curr_row[0] = (int) row;
    curr_row_min = (int) row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      value1 = prev_row[col-1] + ((p1[col-1] == p2[row-1]) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      value2 = prev_row[col]+1;
      if (value2 < value1) {
        value1 = value2;
      }

      /* Deletion if it's cheaper than substitution. */

      value2 = curr_row[col-1]+1;
      if (value2 < value1) {
        value1 = value2;
      }

      /* Keep track of the minimum value on this row. */

      if (value1 < curr_row_min) {
        curr_row_min = value1;
      }

      curr_row[col] = value1;
    }

    /* Return nil as soon as we reach the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      /* Memory obtained with ALLOC_N must be released with xfree, not free. */
      xfree(rows);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(rows);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
17
121
|
|
18
122
|
void Init_levenshtein_fast() {
|
data/lib/levenshtein.rb
CHANGED
@@ -1,25 +1,30 @@
|
|
1
|
-
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
2
3
|
require "levenshtein/version"
|
3
4
|
|
4
5
|
module Levenshtein
|
5
6
|
# Returns the Levenshtein distance as a number between 0.0 and
|
6
7
|
# 1.0. It's basically the Levenshtein distance divided by the
|
7
|
-
#
|
8
|
+
# size of the longest sequence.
|
8
9
|
|
9
|
-
def self.normalized_distance(a1, a2, threshold=nil)
|
10
|
-
|
10
|
+
def self.normalized_distance(a1, a2, threshold=nil, options={})
  # a1, a2    : the two sequences; both must respond to :size here.
  # threshold : optional normalized limit; it is scaled by the size of the
  #             longest sequence before being handed to distance, and nil is
  #             returned when distance gives up.
  # options   : passed through to distance unchanged (e.g. :force_slow), so
  #             the caller's choice of implementation is honoured.

  size = [a1.size, a2.size].max

  # Two empty sequences are identical.
  return 0.0 if size == 0

  # Exactly one sequence is empty: the distance equals the size of the
  # other one, which is also the size of the longest sequence.
  return a2.size.to_f/size if a1.size == 0
  return a1.size.to_f/size if a2.size == 0

  if threshold
    d = self.distance(a1, a2, (threshold*size).to_i+1, options)

    d ? d.to_f/size : nil
  else
    self.distance(a1, a2, nil, options).to_f/size
  end
end
|
@@ -27,71 +32,79 @@ module Levenshtein
|
|
27
32
|
# Returns the Levenshtein distance between two sequences.
|
28
33
|
#
|
29
34
|
# The two sequences can be two strings, two arrays, or two other
|
30
|
-
# objects
|
31
|
-
#
|
32
|
-
# with generic (fast) C code.
|
35
|
+
# objects responding to :each. All sequences are handled by generic
|
36
|
+
# (fast) C code.
|
33
37
|
#
|
34
|
-
#
|
35
|
-
# in the sequences (as returned by []) should response to :==.
|
38
|
+
# All objects in the sequences should respond to :hash and :eql?.
|
36
39
|
|
37
|
-
def self.distance(a1, a2, threshold=nil)
|
38
|
-
a1, a2 =
|
40
|
+
def self.distance(a1, a2, threshold=nil, options={})
  # a1, a2    : two strings, or two sequences responding to :each.
  # threshold : optional limit; nil is returned as soon as it is certain that
  #             the distance is greater than or equal to the threshold.
  # options   : handed to distance_fast_or_slow (e.g. :force_slow).

  # Split strings into characters. The /m flag is required: without it "."
  # does not match "\n", so newlines would silently vanish from the input.
  a1, a2 = a1.scan(/./m), a2.scan(/./m) if String === a1 and String === a2

  # Make equivalent elements identical objects, so they can be compared
  # with equal?() from here on.
  a1, a2 = Util.pool(a1, a2)

  # Handle some basic circumstances.

  return 0 if a1 == a2
  return a2.size if a1.empty?
  return a1.size if a2.empty?

  if threshold
    # Cheap lower bounds on the distance: the difference in size, and the
    # number of elements that occur in only one of the two sequences.
    return nil if (a1.size-a2.size) >= threshold
    return nil if (a2.size-a1.size) >= threshold
    return nil if (a1-a2).size >= threshold
    return nil if (a2-a1).size >= threshold
  end

  # Remove the common prefix and the common postfix.

  l1 = a1.size
  l2 = a2.size

  offset = 0
  no_more_optimizations = true

  while offset < l1 and offset < l2 and a1[offset].equal?(a2[offset])
    offset += 1

    no_more_optimizations = false
  end

  while offset < l1 and offset < l2 and a1[l1-1].equal?(a2[l2-1])
    l1 -= 1
    l2 -= 1

    no_more_optimizations = false
  end

  if no_more_optimizations
    distance_fast_or_slow(a1, a2, threshold, options)
  else
    l1 -= offset
    l2 -= offset

    a1 = a1[offset, l1]
    a2 = a2[offset, l2]

    # Something was trimmed: start over, so the shortcuts above get another
    # chance on the shorter sequences.
    distance(a1, a2, threshold, options)
  end
end
|
88
90
|
|
89
|
-
|
91
|
+
def self.distance_fast_or_slow(a1, a2, threshold, options) # :nodoc:
  # Use the C implementation when it is available, unless the caller
  # explicitly asked for the Ruby implementation with :force_slow.
  if respond_to?(:distance_fast) and not options[:force_slow]
    distance_fast(a1, a2, threshold) # Implemented in C.
  else
    distance_slow(a1, a2, threshold) # Implemented in Ruby.
  end
end
|
98
|
+
|
99
|
+
def self.distance_slow(a1, a2, threshold) # :nodoc:
|
100
|
+
crow = (0..a1.size).to_a
|
101
|
+
|
102
|
+
1.upto(a2.size) do |y|
|
90
103
|
prow = crow
|
91
104
|
crow = [y]
|
92
105
|
|
93
|
-
1.upto(
|
94
|
-
crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[
|
106
|
+
1.upto(a1.size) do |x|
|
107
|
+
crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[x-1].equal?(a2[y-1]) ? 0 : 1)].min
|
95
108
|
end
|
96
109
|
|
97
110
|
# Stop analysing this sequence as soon as the best possible
|
@@ -104,6 +117,24 @@ module Levenshtein
|
|
104
117
|
|
105
118
|
crow[-1]
|
106
119
|
end
|
120
|
+
|
121
|
+
module Util # :nodoc:
  # Rebuilds every given sequence as an Array in which equivalent elements
  # (same hash, eql?) are replaced by one shared object: the first one seen
  # across all sequences. Afterwards elements can be compared by identity
  # (equal?() instead of ==()).
  def self.pool(*args)
    canonical = {}

    args.map do |sequence|
      pooled = []

      sequence.each do |element|
        canonical[element] = element unless canonical[element]
        pooled.push(canonical[element])
      end

      pooled
    end
  end
end
|
107
138
|
end
|
108
139
|
|
109
140
|
begin
|
data/lib/levenshtein/version.rb
CHANGED
data/test/test.rb
CHANGED
@@ -1,3 +1,6 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# encoding: UTF-8
|
3
|
+
|
1
4
|
require "test/unit"
|
2
5
|
require "levenshtein"
|
3
6
|
|
@@ -7,14 +10,10 @@ module Levenshtein
|
|
7
10
|
@sequence = o
|
8
11
|
end
|
9
12
|
|
10
|
-
def
|
11
|
-
@sequence.length
|
12
|
-
|
13
|
-
|
14
|
-
def [](pos)
|
15
|
-
raise "type not allowed [#{pos.inspect}]" unless pos.kind_of?(Fixnum)
|
16
|
-
|
17
|
-
@sequence[pos]
|
13
|
+
# Yields the elements of the wrapped sequence one by one, in index order.
def each
  count = @sequence.length

  count.times { |index| yield(@sequence[index]) }
end
|
19
18
|
end
|
20
19
|
|
@@ -24,14 +23,18 @@ module Levenshtein
|
|
24
23
|
# Wraps +o+; the wrapped object is what hash and eql? delegate to.
def initialize(o)
  @object = o
end
|
27
|
-
|
28
|
-
def
|
29
|
-
@object
|
26
|
+
|
27
|
+
# Elements hash like the object they wrap, so equivalent elements end up
# under the same Hash key.
def hash
  wrapped = @object

  wrapped.hash
end
|
30
|
+
|
31
|
+
# Two elements are eql? when the objects they wrap are eql?.
# +other+ must expose its wrapped object through #object.
def eql?(other)
  theirs = other.object

  @object.eql?(theirs)
end
|
31
34
|
end
|
32
35
|
end
|
33
36
|
|
34
|
-
class
|
37
|
+
class TestLevenshtein < Test::Unit::TestCase
|
35
38
|
def test_erik_veenstra
|
36
39
|
assert_equal(7, Levenshtein.distance("erik", "veenstra"))
|
37
40
|
assert_equal(7, Levenshtein.distance("veenstra", "erik"))
|
@@ -79,59 +82,73 @@ class TestLevenshteinString < Test::Unit::TestCase
|
|
79
82
|
assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
|
80
83
|
assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
|
81
84
|
end
|
82
|
-
end
|
83
85
|
|
84
|
-
|
85
|
-
|
86
|
-
|
86
|
+
# Sequences only need to respond to :each, and their elements only to :hash
# and :eql?. Wrap plain strings in the minimal test classes to prove it.
def test_interface
  wrap = lambda do |string|
    elements = string.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}

    Levenshtein::TestSequence.new(elements)
  end

  seq1 = wrap.call("erik")
  seq2 = wrap.call("veenstra")

  assert_equal(7, Levenshtein.distance(seq1, seq2))
end
|
90
92
|
end
|
91
93
|
|
92
|
-
class
|
94
|
+
# Runs the distance checks with :force_slow=>false, i.e. without forcing the
# Ruby implementation.
class TestLevenshteinFast < Test::Unit::TestCase
  def test_erik_veenstra
    assert_equal(7, fast("erik", "veenstra"))
    assert_equal(7, fast("veenstra", "erik"))
  end

  def test_empty_string
    assert_equal(0, fast("", ""))
    assert_equal(3, fast("", "foo"))
    assert_equal(3, fast("foo", ""))
  end

  def test_same_string
    assert_equal(0, fast("", ""))
    assert_equal(0, fast("foo", "foo"))
  end

  def test_threshold
    assert_equal(3, fast("foo", "foobar"))
    assert_equal(3, fast("foo", "foobar", 4))
    assert_equal(nil, fast("foo", "foobar", 2))
  end

  def test_same_head_and_or_tail
    assert_equal(3, fast("ab123cd", "abxyzcd"))
    assert_equal(3, fast("ab123", "abxyz"))
    assert_equal(3, fast("123cd", "xyzcd"))
    assert_equal(5, fast("123cd123", "123"))
  end

  private

  # Levenshtein.distance with the slow implementation not forced.
  def fast(s1, s2, threshold=nil)
    Levenshtein.distance(s1, s2, threshold, :force_slow=>false)
  end
end
|
107
124
|
|
108
125
|
# Runs the distance checks with :force_slow=>true, i.e. with the Ruby
# implementation requested explicitly.
class TestLevenshteinSlow < Test::Unit::TestCase
  def test_erik_veenstra
    assert_equal(7, slow("erik", "veenstra"))
    assert_equal(7, slow("veenstra", "erik"))
  end

  def test_empty_string
    assert_equal(0, slow("", ""))
    assert_equal(3, slow("", "foo"))
    assert_equal(3, slow("foo", ""))
  end

  def test_same_string
    assert_equal(0, slow("", ""))
    assert_equal(0, slow("foo", "foo"))
  end

  def test_threshold
    assert_equal(3, slow("foo", "foobar"))
    assert_equal(3, slow("foo", "foobar", 4))
    assert_equal(nil, slow("foo", "foobar", 2))
  end

  def test_same_head_and_or_tail
    assert_equal(3, slow("ab123cd", "abxyzcd"))
    assert_equal(3, slow("ab123", "abxyz"))
    assert_equal(3, slow("123cd", "xyzcd"))
    assert_equal(5, slow("123cd123", "123"))
  end

  private

  # Levenshtein.distance with the slow implementation forced.
  def slow(s1, s2, threshold=nil)
    Levenshtein.distance(s1, s2, threshold, :force_slow=>true)
  end
end
|
metadata
CHANGED
@@ -1,41 +1,27 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: levenshtein
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.2
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 2
|
9
|
-
- 1
|
10
|
-
version: 0.2.1
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Erik Veenstra
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
date: 2012-02-11 00:00:00 Z
|
12
|
+
date: 2012-03-16 00:00:00.000000000 Z
|
19
13
|
dependencies: []
|
20
|
-
|
21
14
|
description: Calculates the Levenshtein distance between two byte strings.
|
22
15
|
email: levenshtein@erikveen.dds.nl
|
23
16
|
executables: []
|
24
|
-
|
25
|
-
extensions:
|
17
|
+
extensions:
|
26
18
|
- ext/levenshtein/extconf.rb
|
27
19
|
extra_rdoc_files: []
|
28
|
-
|
29
|
-
files:
|
30
|
-
- lib/levenshtein/exception.rb
|
20
|
+
files:
|
31
21
|
- lib/levenshtein/version.rb
|
32
22
|
- lib/levenshtein.rb
|
33
|
-
- ext/levenshtein/levenshtein_string.c
|
34
|
-
- ext/levenshtein/levenshtein_generic.c
|
35
23
|
- ext/levenshtein/levenshtein.h
|
36
24
|
- ext/levenshtein/levenshtein_fast.c
|
37
|
-
- ext/levenshtein/levenshtein_array_of_strings.c
|
38
|
-
- ext/levenshtein/levenshtein_array.c
|
39
25
|
- ext/levenshtein/extconf.rb
|
40
26
|
- README
|
41
27
|
- LICENSE
|
@@ -44,43 +30,35 @@ files:
|
|
44
30
|
- test/test.rb
|
45
31
|
homepage: http://www.erikveen.dds.nl/levenshtein/index.html
|
46
32
|
licenses: []
|
47
|
-
|
48
33
|
post_install_message:
|
49
|
-
rdoc_options:
|
34
|
+
rdoc_options:
|
50
35
|
- README
|
51
36
|
- LICENSE
|
52
37
|
- VERSION
|
53
38
|
- CHANGELOG
|
54
39
|
- --title
|
55
|
-
- levenshtein (0.2.
|
40
|
+
- levenshtein (0.2.2)
|
56
41
|
- --main
|
57
42
|
- README
|
58
|
-
require_paths:
|
43
|
+
require_paths:
|
59
44
|
- lib
|
60
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
61
46
|
none: false
|
62
|
-
requirements:
|
63
|
-
- -
|
64
|
-
- !ruby/object:Gem::Version
|
65
|
-
|
66
|
-
|
67
|
-
- 0
|
68
|
-
version: "0"
|
69
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
52
|
none: false
|
71
|
-
requirements:
|
72
|
-
- -
|
73
|
-
- !ruby/object:Gem::Version
|
74
|
-
|
75
|
-
segments:
|
76
|
-
- 0
|
77
|
-
version: "0"
|
53
|
+
requirements:
|
54
|
+
- - ! '>='
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0'
|
78
57
|
requirements: []
|
79
|
-
|
80
58
|
rubyforge_project: levenshtein
|
81
|
-
rubygems_version: 1.8.
|
59
|
+
rubygems_version: 1.8.18
|
82
60
|
signing_key:
|
83
61
|
specification_version: 3
|
84
62
|
summary: Calculates the Levenshtein distance between two byte strings.
|
85
|
-
test_files:
|
63
|
+
test_files:
|
86
64
|
- test/test.rb
|
@@ -1,130 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "levenshtein.h"
|
3
|
-
|
4
|
-
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
|
-
int threshold;
|
6
|
-
int l1, l2;
|
7
|
-
int *prev_row, *curr_row, *temp_row;
|
8
|
-
int col, row;
|
9
|
-
int curr_row_min, result;
|
10
|
-
int offset;
|
11
|
-
int value1, value2;
|
12
|
-
|
13
|
-
/* Get the sizes of both arrays. */
|
14
|
-
|
15
|
-
l1 = RARRAY_LEN(rb_o1);
|
16
|
-
l2 = RARRAY_LEN(rb_o2);
|
17
|
-
|
18
|
-
/* Convert Ruby's threshold to C's threshold. */
|
19
|
-
|
20
|
-
if (!NIL_P(rb_threshold)) {
|
21
|
-
threshold = FIX2INT(rb_threshold);
|
22
|
-
} else {
|
23
|
-
threshold = -1;
|
24
|
-
}
|
25
|
-
|
26
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
27
|
-
|
28
|
-
offset = 0;
|
29
|
-
|
30
|
-
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
|
31
|
-
offset++;
|
32
|
-
}
|
33
|
-
|
34
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
35
|
-
|
36
|
-
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
|
37
|
-
l1--;
|
38
|
-
l2--;
|
39
|
-
}
|
40
|
-
|
41
|
-
l1 -= offset;
|
42
|
-
l2 -= offset;
|
43
|
-
|
44
|
-
/* The Levenshtein algorithm itself. */
|
45
|
-
|
46
|
-
/* s1= */
|
47
|
-
/* ERIK */
|
48
|
-
/* */
|
49
|
-
/* 01234 */
|
50
|
-
/* s2=V 11234 */
|
51
|
-
/* E 21234 */
|
52
|
-
/* E 32234 */
|
53
|
-
/* N 43334 <- prev_row */
|
54
|
-
/* S 54444 <- curr_row */
|
55
|
-
/* T 65555 */
|
56
|
-
/* R 76566 */
|
57
|
-
/* A 87667 */
|
58
|
-
|
59
|
-
/* Allocate memory for both rows */
|
60
|
-
|
61
|
-
prev_row = (int*) ALLOC_N(int, (l1+1));
|
62
|
-
curr_row = (int*) ALLOC_N(int, (l1+1));
|
63
|
-
|
64
|
-
/* Initialize the current row. */
|
65
|
-
|
66
|
-
for (col=0; col<=l1; col++) {
|
67
|
-
curr_row[col] = col;
|
68
|
-
}
|
69
|
-
|
70
|
-
for (row=1; row<=l2; row++) {
|
71
|
-
/* Copy the current row to the previous row. */
|
72
|
-
|
73
|
-
temp_row = prev_row;
|
74
|
-
prev_row = curr_row;
|
75
|
-
curr_row = temp_row;
|
76
|
-
|
77
|
-
/* Calculate the values of the current row. */
|
78
|
-
|
79
|
-
curr_row[0] = row;
|
80
|
-
curr_row_min = row;
|
81
|
-
|
82
|
-
for (col=1; col<=l1; col++) {
|
83
|
-
/* Equal (cost=0) or substitution (cost=1). */
|
84
|
-
|
85
|
-
value1 = prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
|
86
|
-
|
87
|
-
/* Insertion if it's cheaper than substitution. */
|
88
|
-
|
89
|
-
value2 = prev_row[col]+1;
|
90
|
-
if (value2 < value1) {
|
91
|
-
value1 = value2;
|
92
|
-
}
|
93
|
-
|
94
|
-
/* Deletion if it's cheaper than substitution. */
|
95
|
-
|
96
|
-
value2 = curr_row[col-1]+1;
|
97
|
-
if (value2 < value1) {
|
98
|
-
value1 = value2;
|
99
|
-
}
|
100
|
-
|
101
|
-
/* Keep track of the minimum value on this row. */
|
102
|
-
|
103
|
-
if (value1 < curr_row_min) {
|
104
|
-
curr_row_min = value1;
|
105
|
-
}
|
106
|
-
|
107
|
-
curr_row[col] = value1;
|
108
|
-
}
|
109
|
-
|
110
|
-
/* Return nil as soon as we exceed the threshold. */
|
111
|
-
|
112
|
-
if (threshold > -1 && curr_row_min >= threshold) {
|
113
|
-
free(prev_row);
|
114
|
-
free(curr_row);
|
115
|
-
|
116
|
-
return Qnil;
|
117
|
-
}
|
118
|
-
}
|
119
|
-
|
120
|
-
/* The result is the last value on the last row. */
|
121
|
-
|
122
|
-
result = curr_row[l1];
|
123
|
-
|
124
|
-
free(prev_row);
|
125
|
-
free(curr_row);
|
126
|
-
|
127
|
-
/* Return the Ruby version of the result. */
|
128
|
-
|
129
|
-
return INT2FIX(result);
|
130
|
-
}
|
@@ -1,130 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "levenshtein.h"
|
3
|
-
|
4
|
-
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
|
-
int threshold;
|
6
|
-
int l1, l2;
|
7
|
-
int *prev_row, *curr_row, *temp_row;
|
8
|
-
int col, row;
|
9
|
-
int curr_row_min, result;
|
10
|
-
int offset;
|
11
|
-
int value1, value2;
|
12
|
-
|
13
|
-
/* Get the sizes of both arrays. */
|
14
|
-
|
15
|
-
l1 = RARRAY_LEN(rb_o1);
|
16
|
-
l2 = RARRAY_LEN(rb_o2);
|
17
|
-
|
18
|
-
/* Convert Ruby's threshold to C's threshold. */
|
19
|
-
|
20
|
-
if (!NIL_P(rb_threshold)) {
|
21
|
-
threshold = FIX2INT(rb_threshold);
|
22
|
-
} else {
|
23
|
-
threshold = -1;
|
24
|
-
}
|
25
|
-
|
26
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
27
|
-
|
28
|
-
offset = 0;
|
29
|
-
|
30
|
-
while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
|
31
|
-
offset++;
|
32
|
-
}
|
33
|
-
|
34
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
35
|
-
|
36
|
-
while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
|
37
|
-
l1--;
|
38
|
-
l2--;
|
39
|
-
}
|
40
|
-
|
41
|
-
l1 -= offset;
|
42
|
-
l2 -= offset;
|
43
|
-
|
44
|
-
/* The Levenshtein algorithm itself. */
|
45
|
-
|
46
|
-
/* s1= */
|
47
|
-
/* ERIK */
|
48
|
-
/* */
|
49
|
-
/* 01234 */
|
50
|
-
/* s2=V 11234 */
|
51
|
-
/* E 21234 */
|
52
|
-
/* E 32234 */
|
53
|
-
/* N 43334 <- prev_row */
|
54
|
-
/* S 54444 <- curr_row */
|
55
|
-
/* T 65555 */
|
56
|
-
/* R 76566 */
|
57
|
-
/* A 87667 */
|
58
|
-
|
59
|
-
/* Allocate memory for both rows */
|
60
|
-
|
61
|
-
prev_row = (int*) ALLOC_N(int, (l1+1));
|
62
|
-
curr_row = (int*) ALLOC_N(int, (l1+1));
|
63
|
-
|
64
|
-
/* Initialize the current row. */
|
65
|
-
|
66
|
-
for (col=0; col<=l1; col++) {
|
67
|
-
curr_row[col] = col;
|
68
|
-
}
|
69
|
-
|
70
|
-
for (row=1; row<=l2; row++) {
|
71
|
-
/* Copy the current row to the previous row. */
|
72
|
-
|
73
|
-
temp_row = prev_row;
|
74
|
-
prev_row = curr_row;
|
75
|
-
curr_row = temp_row;
|
76
|
-
|
77
|
-
/* Calculate the values of the current row. */
|
78
|
-
|
79
|
-
curr_row[0] = row;
|
80
|
-
curr_row_min = row;
|
81
|
-
|
82
|
-
for (col=1; col<=l1; col++) {
|
83
|
-
/* Equal (cost=0) or substitution (cost=1). */
|
84
|
-
|
85
|
-
value1 = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
|
86
|
-
|
87
|
-
/* Insertion if it's cheaper than substitution. */
|
88
|
-
|
89
|
-
value2 = prev_row[col]+1;
|
90
|
-
if (value2 < value1) {
|
91
|
-
value1 = value2;
|
92
|
-
}
|
93
|
-
|
94
|
-
/* Deletion if it's cheaper than substitution. */
|
95
|
-
|
96
|
-
value2 = curr_row[col-1]+1;
|
97
|
-
if (value2 < value1) {
|
98
|
-
value1 = value2;
|
99
|
-
}
|
100
|
-
|
101
|
-
/* Keep track of the minimum value on this row. */
|
102
|
-
|
103
|
-
if (value1 < curr_row_min) {
|
104
|
-
curr_row_min = value1;
|
105
|
-
}
|
106
|
-
|
107
|
-
curr_row[col] = value1;
|
108
|
-
}
|
109
|
-
|
110
|
-
/* Return nil as soon as we exceed the threshold. */
|
111
|
-
|
112
|
-
if (threshold > -1 && curr_row_min >= threshold) {
|
113
|
-
free(prev_row);
|
114
|
-
free(curr_row);
|
115
|
-
|
116
|
-
return Qnil;
|
117
|
-
}
|
118
|
-
}
|
119
|
-
|
120
|
-
/* The result is the last value on the last row. */
|
121
|
-
|
122
|
-
result = curr_row[l1];
|
123
|
-
|
124
|
-
free(prev_row);
|
125
|
-
free(curr_row);
|
126
|
-
|
127
|
-
/* Return the Ruby version of the result. */
|
128
|
-
|
129
|
-
return INT2FIX(result);
|
130
|
-
}
|
@@ -1,133 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "levenshtein.h"
|
3
|
-
|
4
|
-
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
|
-
int threshold;
|
6
|
-
int l1, l2;
|
7
|
-
int *prev_row, *curr_row, *temp_row;
|
8
|
-
int col, row;
|
9
|
-
int curr_row_min, result;
|
10
|
-
int offset;
|
11
|
-
int value1, value2;
|
12
|
-
|
13
|
-
ID id_length = rb_intern("length");
|
14
|
-
ID id_get = rb_intern("[]");
|
15
|
-
|
16
|
-
/* Get the sizes of both sequences. */
|
17
|
-
|
18
|
-
l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
|
19
|
-
l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
|
20
|
-
|
21
|
-
/* Convert Ruby's threshold to C's threshold. */
|
22
|
-
|
23
|
-
if (!NIL_P(rb_threshold)) {
|
24
|
-
threshold = FIX2INT(rb_threshold);
|
25
|
-
} else {
|
26
|
-
threshold = -1;
|
27
|
-
}
|
28
|
-
|
29
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
30
|
-
|
31
|
-
offset = 0;
|
32
|
-
|
33
|
-
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
|
34
|
-
offset++;
|
35
|
-
}
|
36
|
-
|
37
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
38
|
-
|
39
|
-
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
|
40
|
-
l1--;
|
41
|
-
l2--;
|
42
|
-
}
|
43
|
-
|
44
|
-
l1 -= offset;
|
45
|
-
l2 -= offset;
|
46
|
-
|
47
|
-
/* The Levenshtein algorithm itself. */
|
48
|
-
|
49
|
-
/* s1= */
|
50
|
-
/* ERIK */
|
51
|
-
/* */
|
52
|
-
/* 01234 */
|
53
|
-
/* s2=V 11234 */
|
54
|
-
/* E 21234 */
|
55
|
-
/* E 32234 */
|
56
|
-
/* N 43334 <- prev_row */
|
57
|
-
/* S 54444 <- curr_row */
|
58
|
-
/* T 65555 */
|
59
|
-
/* R 76566 */
|
60
|
-
/* A 87667 */
|
61
|
-
|
62
|
-
/* Allocate memory for both rows */
|
63
|
-
|
64
|
-
prev_row = (int*) ALLOC_N(int, (l1+1));
|
65
|
-
curr_row = (int*) ALLOC_N(int, (l1+1));
|
66
|
-
|
67
|
-
/* Initialize the current row. */
|
68
|
-
|
69
|
-
for (col=0; col<=l1; col++) {
|
70
|
-
curr_row[col] = col;
|
71
|
-
}
|
72
|
-
|
73
|
-
for (row=1; row<=l2; row++) {
|
74
|
-
/* Copy the current row to the previous row. */
|
75
|
-
|
76
|
-
temp_row = prev_row;
|
77
|
-
prev_row = curr_row;
|
78
|
-
curr_row = temp_row;
|
79
|
-
|
80
|
-
/* Calculate the values of the current row. */
|
81
|
-
|
82
|
-
curr_row[0] = row;
|
83
|
-
curr_row_min = row;
|
84
|
-
|
85
|
-
for (col=1; col<=l1; col++) {
|
86
|
-
/* Equal (cost=0) or substitution (cost=1). */
|
87
|
-
|
88
|
-
value1 = prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
|
89
|
-
|
90
|
-
/* Insertion if it's cheaper than substitution. */
|
91
|
-
|
92
|
-
value2 = prev_row[col]+1;
|
93
|
-
if (value2 < value1) {
|
94
|
-
value1 = value2;
|
95
|
-
}
|
96
|
-
|
97
|
-
/* Deletion if it's cheaper than substitution. */
|
98
|
-
|
99
|
-
value2 = curr_row[col-1]+1;
|
100
|
-
if (value2 < value1) {
|
101
|
-
value1 = value2;
|
102
|
-
}
|
103
|
-
|
104
|
-
/* Keep track of the minimum value on this row. */
|
105
|
-
|
106
|
-
if (value1 < curr_row_min) {
|
107
|
-
curr_row_min = value1;
|
108
|
-
}
|
109
|
-
|
110
|
-
curr_row[col] = value1;
|
111
|
-
}
|
112
|
-
|
113
|
-
/* Return nil as soon as we exceed the threshold. */
|
114
|
-
|
115
|
-
if (threshold > -1 && curr_row_min >= threshold) {
|
116
|
-
free(prev_row);
|
117
|
-
free(curr_row);
|
118
|
-
|
119
|
-
return Qnil;
|
120
|
-
}
|
121
|
-
}
|
122
|
-
|
123
|
-
/* The result is the last value on the last row. */
|
124
|
-
|
125
|
-
result = curr_row[l1];
|
126
|
-
|
127
|
-
free(prev_row);
|
128
|
-
free(curr_row);
|
129
|
-
|
130
|
-
/* Return the Ruby version of the result. */
|
131
|
-
|
132
|
-
return INT2FIX(result);
|
133
|
-
}
|
@@ -1,138 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "levenshtein.h"
|
3
|
-
|
4
|
-
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
|
-
int threshold;
|
6
|
-
int l1, l2;
|
7
|
-
int *prev_row, *curr_row, *temp_row;
|
8
|
-
int col, row;
|
9
|
-
int curr_row_min, result;
|
10
|
-
int offset;
|
11
|
-
int value1, value2;
|
12
|
-
char *s1, *s2;
|
13
|
-
|
14
|
-
/* Convert Ruby's s1 to C's s1. */
|
15
|
-
|
16
|
-
rb_o1 = StringValue(rb_o1);
|
17
|
-
s1 = RSTRING_PTR(rb_o1);
|
18
|
-
l1 = RSTRING_LEN(rb_o1);
|
19
|
-
|
20
|
-
/* Convert Ruby's s2 to C's s2. */
|
21
|
-
|
22
|
-
rb_o2 = StringValue(rb_o2);
|
23
|
-
s2 = RSTRING_PTR(rb_o2);
|
24
|
-
l2 = RSTRING_LEN(rb_o2);
|
25
|
-
|
26
|
-
/* Convert Ruby's threshold to C's threshold. */
|
27
|
-
|
28
|
-
if (!NIL_P(rb_threshold)) {
|
29
|
-
threshold = FIX2INT(rb_threshold);
|
30
|
-
} else {
|
31
|
-
threshold = -1;
|
32
|
-
}
|
33
|
-
|
34
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
35
|
-
|
36
|
-
offset = 0;
|
37
|
-
|
38
|
-
while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
|
39
|
-
offset++;
|
40
|
-
}
|
41
|
-
|
42
|
-
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
43
|
-
|
44
|
-
while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
|
45
|
-
l1--;
|
46
|
-
l2--;
|
47
|
-
}
|
48
|
-
|
49
|
-
l1 -= offset;
|
50
|
-
l2 -= offset;
|
51
|
-
|
52
|
-
/* The Levenshtein algorithm itself. */
|
53
|
-
|
54
|
-
/* s1= */
|
55
|
-
/* ERIK */
|
56
|
-
/* */
|
57
|
-
/* 01234 */
|
58
|
-
/* s2=V 11234 */
|
59
|
-
/* E 21234 */
|
60
|
-
/* E 32234 */
|
61
|
-
/* N 43334 <- prev_row */
|
62
|
-
/* S 54444 <- curr_row */
|
63
|
-
/* T 65555 */
|
64
|
-
/* R 76566 */
|
65
|
-
/* A 87667 */
|
66
|
-
|
67
|
-
/* Allocate memory for both rows */
|
68
|
-
|
69
|
-
prev_row = (int*) ALLOC_N(int, (l1+1));
|
70
|
-
curr_row = (int*) ALLOC_N(int, (l1+1));
|
71
|
-
|
72
|
-
/* Initialize the current row. */
|
73
|
-
|
74
|
-
for (col=0; col<=l1; col++) {
|
75
|
-
curr_row[col] = col;
|
76
|
-
}
|
77
|
-
|
78
|
-
for (row=1; row<=l2; row++) {
|
79
|
-
/* Copy the current row to the previous row. */
|
80
|
-
|
81
|
-
temp_row = prev_row;
|
82
|
-
prev_row = curr_row;
|
83
|
-
curr_row = temp_row;
|
84
|
-
|
85
|
-
/* Calculate the values of the current row. */
|
86
|
-
|
87
|
-
curr_row[0] = row;
|
88
|
-
curr_row_min = row;
|
89
|
-
|
90
|
-
for (col=1; col<=l1; col++) {
|
91
|
-
/* Equal (cost=0) or substitution (cost=1). */
|
92
|
-
|
93
|
-
value1 = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
|
94
|
-
|
95
|
-
/* Insertion if it's cheaper than substitution. */
|
96
|
-
|
97
|
-
value2 = prev_row[col]+1;
|
98
|
-
if (value2 < value1) {
|
99
|
-
value1 = value2;
|
100
|
-
}
|
101
|
-
|
102
|
-
/* Deletion if it's cheaper than substitution. */
|
103
|
-
|
104
|
-
value2 = curr_row[col-1]+1;
|
105
|
-
if (value2 < value1) {
|
106
|
-
value1 = value2;
|
107
|
-
}
|
108
|
-
|
109
|
-
/* Keep track of the minimum value on this row. */
|
110
|
-
|
111
|
-
if (value1 < curr_row_min) {
|
112
|
-
curr_row_min = value1;
|
113
|
-
}
|
114
|
-
|
115
|
-
curr_row[col] = value1;
|
116
|
-
}
|
117
|
-
|
118
|
-
/* Return nil as soon as we exceed the threshold. */
|
119
|
-
|
120
|
-
if (threshold > -1 && curr_row_min >= threshold) {
|
121
|
-
free(prev_row);
|
122
|
-
free(curr_row);
|
123
|
-
|
124
|
-
return Qnil;
|
125
|
-
}
|
126
|
-
}
|
127
|
-
|
128
|
-
/* The result is the last value on the last row. */
|
129
|
-
|
130
|
-
result = curr_row[l1];
|
131
|
-
|
132
|
-
free(prev_row);
|
133
|
-
free(curr_row);
|
134
|
-
|
135
|
-
/* Return the Ruby version of the result. */
|
136
|
-
|
137
|
-
return INT2FIX(result);
|
138
|
-
}
|