levenshtein 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -0
- data/README +7 -3
- data/VERSION +1 -1
- data/ext/levenshtein/extconf.rb +6 -1
- data/ext/levenshtein/levenshtein_array.c +127 -0
- data/ext/levenshtein/levenshtein_array_of_strings.c +125 -0
- data/ext/levenshtein/levenshtein_fast.c +21 -0
- data/ext/levenshtein/levenshtein_generic.c +129 -0
- data/ext/levenshtein/{levenshtein_c.c → levenshtein_string.c} +31 -20
- data/lib/levenshtein.rb +45 -36
- data/test/test.rb +57 -23
- metadata +12 -7
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
0.2.0 (11-07-2009)
|
2
|
+
|
3
|
+
* Return 0 instead of 0.0 in case of empty strings.
|
4
|
+
|
5
|
+
* Added specific support for arrays.
|
6
|
+
|
7
|
+
* Added specific support for arrays of strings.
|
8
|
+
|
9
|
+
* Added generic support for all (?) kinds of sequences.
|
10
|
+
|
11
|
+
* Moved a lot of code to the C world.
|
12
|
+
|
1
13
|
0.1.1 (06-10-2008)
|
2
14
|
|
3
15
|
* If one of the strings was both the begin and the end of the
|
data/README
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
The Levenshtein distance is a metric for measuring the amount of difference
|
2
2
|
between two sequences (i.e., the so called edit distance). The Levenshtein
|
3
|
-
distance between two
|
4
|
-
needed to transform one
|
5
|
-
insertion, deletion, or substitution of a single
|
3
|
+
distance between two sequences is given by the minimum number of operations
|
4
|
+
needed to transform one sequence into the other, where an operation is an
|
5
|
+
insertion, deletion, or substitution of a single element.
|
6
|
+
|
7
|
+
The two sequences can be two strings, two arrays, or two other objects.
|
8
|
+
Strings, arrays and arrays of strings are handled with optimized (very fast) C
|
9
|
+
code. All other sequences are handled with generic (fast) C code.
|
6
10
|
|
7
11
|
More information about the Levenshtein distance algorithm:
|
8
12
|
http://en.wikipedia.org/wiki/Levenshtein_distance .
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.1.1
|
1
|
+
0.2.0
|
data/ext/levenshtein/extconf.rb
CHANGED
@@ -2,4 +2,9 @@ require "mkmf"
|
|
2
2
|
|
3
3
|
dir_config("levenshtein")
|
4
4
|
|
5
|
-
|
5
|
+
have_library("levenshtein_array")
|
6
|
+
have_library("levenshtein_array_of_strings")
|
7
|
+
have_library("levenshtein_generic")
|
8
|
+
have_library("levenshtein_string")
|
9
|
+
|
10
|
+
create_makefile("levenshtein/levenshtein_fast")
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
|
+
int threshold;
|
5
|
+
int l1, l2;
|
6
|
+
int *prev_row, *curr_row;
|
7
|
+
int col, row;
|
8
|
+
int curr_row_min, result;
|
9
|
+
int offset;
|
10
|
+
|
11
|
+
ID id_eql = rb_intern("==");
|
12
|
+
|
13
|
+
/* Get the sizes of both arrays. */
|
14
|
+
|
15
|
+
l1 = RARRAY(rb_o1)->len;
|
16
|
+
l2 = RARRAY(rb_o2)->len;
|
17
|
+
|
18
|
+
/* Convert Ruby's threshold to C's threshold. */
|
19
|
+
|
20
|
+
if (!NIL_P(rb_threshold)) {
|
21
|
+
threshold = FIX2INT(rb_threshold);
|
22
|
+
} else {
|
23
|
+
threshold = -1;
|
24
|
+
}
|
25
|
+
|
26
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
27
|
+
|
28
|
+
offset = 0;
|
29
|
+
while RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset))) {
|
30
|
+
offset++;
|
31
|
+
}
|
32
|
+
|
33
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
34
|
+
|
35
|
+
while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
|
36
|
+
l1--;
|
37
|
+
l2--;
|
38
|
+
}
|
39
|
+
|
40
|
+
l1 -= offset;
|
41
|
+
l2 -= offset;
|
42
|
+
|
43
|
+
/* The Levenshtein algorithm itself. */
|
44
|
+
|
45
|
+
/* s1= */
|
46
|
+
/* ERIK */
|
47
|
+
/* */
|
48
|
+
/* 01234 */
|
49
|
+
/* s2=V 11234 */
|
50
|
+
/* E 21234 */
|
51
|
+
/* E 32234 */
|
52
|
+
/* N 43334 <- prev_row */
|
53
|
+
/* S 54444 <- curr_row */
|
54
|
+
/* T 65555 */
|
55
|
+
/* R 76566 */
|
56
|
+
/* A 87667 */
|
57
|
+
|
58
|
+
/* Allocate memory for both rows */
|
59
|
+
|
60
|
+
prev_row = ALLOC_N(int, l1+1);
|
61
|
+
curr_row = ALLOC_N(int, l1+1);
|
62
|
+
|
63
|
+
if ((prev_row == NULL) || (curr_row == NULL)) {
|
64
|
+
rb_raise(rb_eNoMemError, "out of memory");
|
65
|
+
}
|
66
|
+
|
67
|
+
/* Initialize the current row. */
|
68
|
+
|
69
|
+
for (col=0; col<=l1; col++) {
|
70
|
+
curr_row[col] = col;
|
71
|
+
}
|
72
|
+
|
73
|
+
for (row=1; row<=l2; row++) {
|
74
|
+
/* Copy the current row to the previous row. */
|
75
|
+
|
76
|
+
memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
|
77
|
+
|
78
|
+
/* Calculate the values of the current row. */
|
79
|
+
|
80
|
+
curr_row[0] = row;
|
81
|
+
curr_row_min = row;
|
82
|
+
|
83
|
+
for (col=1; col<=l1; col++) {
|
84
|
+
/* Equal (cost=0) or substitution (cost=1). */
|
85
|
+
|
86
|
+
curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
|
87
|
+
|
88
|
+
/* Insertion if it's cheaper than substitution. */
|
89
|
+
|
90
|
+
if (prev_row[col]+1 < curr_row[col]) {
|
91
|
+
curr_row[col] = prev_row[col]+1;
|
92
|
+
}
|
93
|
+
|
94
|
+
/* Deletion if it's cheaper than substitution. */
|
95
|
+
|
96
|
+
if (curr_row[col-1]+1 < curr_row[col]) {
|
97
|
+
curr_row[col] = curr_row[col-1]+1;
|
98
|
+
}
|
99
|
+
|
100
|
+
/* Keep track of the minimum value on this row. */
|
101
|
+
|
102
|
+
if (curr_row[col] < curr_row_min) {
|
103
|
+
curr_row_min = curr_row[col];
|
104
|
+
}
|
105
|
+
}
|
106
|
+
|
107
|
+
/* Return nil as soon as we exceed the threshold. */
|
108
|
+
|
109
|
+
if (threshold > -1 && curr_row_min >= threshold) {
|
110
|
+
free(prev_row);
|
111
|
+
free(curr_row);
|
112
|
+
|
113
|
+
return Qnil;
|
114
|
+
}
|
115
|
+
}
|
116
|
+
|
117
|
+
/* The result is the last value on the last row. */
|
118
|
+
|
119
|
+
result = curr_row[l1];
|
120
|
+
|
121
|
+
free(prev_row);
|
122
|
+
free(curr_row);
|
123
|
+
|
124
|
+
/* Return the Ruby version of the result. */
|
125
|
+
|
126
|
+
return INT2FIX(result);
|
127
|
+
}
|
@@ -0,0 +1,125 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
|
+
int threshold;
|
5
|
+
int l1, l2;
|
6
|
+
int *prev_row, *curr_row;
|
7
|
+
int col, row;
|
8
|
+
int curr_row_min, result;
|
9
|
+
int offset;
|
10
|
+
|
11
|
+
/* Get the sizes of both arrays. */
|
12
|
+
|
13
|
+
l1 = RARRAY(rb_o1)->len;
|
14
|
+
l2 = RARRAY(rb_o2)->len;
|
15
|
+
|
16
|
+
/* Convert Ruby's threshold to C's threshold. */
|
17
|
+
|
18
|
+
if (!NIL_P(rb_threshold)) {
|
19
|
+
threshold = FIX2INT(rb_threshold);
|
20
|
+
} else {
|
21
|
+
threshold = -1;
|
22
|
+
}
|
23
|
+
|
24
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
25
|
+
|
26
|
+
offset = 0;
|
27
|
+
while (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0) {
|
28
|
+
offset++;
|
29
|
+
}
|
30
|
+
|
31
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
32
|
+
|
33
|
+
while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
|
34
|
+
l1--;
|
35
|
+
l2--;
|
36
|
+
}
|
37
|
+
|
38
|
+
l1 -= offset;
|
39
|
+
l2 -= offset;
|
40
|
+
|
41
|
+
/* The Levenshtein algorithm itself. */
|
42
|
+
|
43
|
+
/* s1= */
|
44
|
+
/* ERIK */
|
45
|
+
/* */
|
46
|
+
/* 01234 */
|
47
|
+
/* s2=V 11234 */
|
48
|
+
/* E 21234 */
|
49
|
+
/* E 32234 */
|
50
|
+
/* N 43334 <- prev_row */
|
51
|
+
/* S 54444 <- curr_row */
|
52
|
+
/* T 65555 */
|
53
|
+
/* R 76566 */
|
54
|
+
/* A 87667 */
|
55
|
+
|
56
|
+
/* Allocate memory for both rows */
|
57
|
+
|
58
|
+
prev_row = ALLOC_N(int, l1+1);
|
59
|
+
curr_row = ALLOC_N(int, l1+1);
|
60
|
+
|
61
|
+
if ((prev_row == NULL) || (curr_row == NULL)) {
|
62
|
+
rb_raise(rb_eNoMemError, "out of memory");
|
63
|
+
}
|
64
|
+
|
65
|
+
/* Initialize the current row. */
|
66
|
+
|
67
|
+
for (col=0; col<=l1; col++) {
|
68
|
+
curr_row[col] = col;
|
69
|
+
}
|
70
|
+
|
71
|
+
for (row=1; row<=l2; row++) {
|
72
|
+
/* Copy the current row to the previous row. */
|
73
|
+
|
74
|
+
memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
|
75
|
+
|
76
|
+
/* Calculate the values of the current row. */
|
77
|
+
|
78
|
+
curr_row[0] = row;
|
79
|
+
curr_row_min = row;
|
80
|
+
|
81
|
+
for (col=1; col<=l1; col++) {
|
82
|
+
/* Equal (cost=0) or substitution (cost=1). */
|
83
|
+
|
84
|
+
curr_row[col] = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
|
85
|
+
|
86
|
+
/* Insertion if it's cheaper than substitution. */
|
87
|
+
|
88
|
+
if (prev_row[col]+1 < curr_row[col]) {
|
89
|
+
curr_row[col] = prev_row[col]+1;
|
90
|
+
}
|
91
|
+
|
92
|
+
/* Deletion if it's cheaper than substitution. */
|
93
|
+
|
94
|
+
if (curr_row[col-1]+1 < curr_row[col]) {
|
95
|
+
curr_row[col] = curr_row[col-1]+1;
|
96
|
+
}
|
97
|
+
|
98
|
+
/* Keep track of the minimum value on this row. */
|
99
|
+
|
100
|
+
if (curr_row[col] < curr_row_min) {
|
101
|
+
curr_row_min = curr_row[col];
|
102
|
+
}
|
103
|
+
}
|
104
|
+
|
105
|
+
/* Return nil as soon as we exceed the threshold. */
|
106
|
+
|
107
|
+
if (threshold > -1 && curr_row_min >= threshold) {
|
108
|
+
free(prev_row);
|
109
|
+
free(curr_row);
|
110
|
+
|
111
|
+
return Qnil;
|
112
|
+
}
|
113
|
+
}
|
114
|
+
|
115
|
+
/* The result is the last value on the last row. */
|
116
|
+
|
117
|
+
result = curr_row[l1];
|
118
|
+
|
119
|
+
free(prev_row);
|
120
|
+
free(curr_row);
|
121
|
+
|
122
|
+
/* Return the Ruby version of the result. */
|
123
|
+
|
124
|
+
return INT2FIX(result);
|
125
|
+
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
|
+
if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
|
5
|
+
return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
|
6
|
+
} else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2)) == T_ARRAY) {
|
7
|
+
if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0))) == T_STRING) {
|
8
|
+
return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
|
9
|
+
} else {
|
10
|
+
return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
|
11
|
+
}
|
12
|
+
} else {
|
13
|
+
return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
|
14
|
+
}
|
15
|
+
}
|
16
|
+
|
17
|
+
void Init_levenshtein_fast() {
|
18
|
+
VALUE mLevenshtein = rb_define_module("Levenshtein");
|
19
|
+
|
20
|
+
rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
|
21
|
+
}
|
@@ -0,0 +1,129 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
|
+
int threshold;
|
5
|
+
int l1, l2;
|
6
|
+
int *prev_row, *curr_row;
|
7
|
+
int col, row;
|
8
|
+
int curr_row_min, result;
|
9
|
+
int offset;
|
10
|
+
|
11
|
+
ID id_length = rb_intern("length");
|
12
|
+
ID id_get = rb_intern("[]");
|
13
|
+
ID id_equal = rb_intern("==");
|
14
|
+
|
15
|
+
/* Get the sizes of both sequences. */
|
16
|
+
|
17
|
+
l1 = FIX2INT(rb_funcall(rb_o1, id_length, 0));
|
18
|
+
l2 = FIX2INT(rb_funcall(rb_o2, id_length, 0));
|
19
|
+
|
20
|
+
/* Convert Ruby's threshold to C's threshold. */
|
21
|
+
|
22
|
+
if (!NIL_P(rb_threshold)) {
|
23
|
+
threshold = FIX2INT(rb_threshold);
|
24
|
+
} else {
|
25
|
+
threshold = -1;
|
26
|
+
}
|
27
|
+
|
28
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
29
|
+
|
30
|
+
offset = 0;
|
31
|
+
while RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset)))) {
|
32
|
+
offset++;
|
33
|
+
}
|
34
|
+
|
35
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
36
|
+
|
37
|
+
while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
|
38
|
+
l1--;
|
39
|
+
l2--;
|
40
|
+
}
|
41
|
+
|
42
|
+
l1 -= offset;
|
43
|
+
l2 -= offset;
|
44
|
+
|
45
|
+
/* The Levenshtein algorithm itself. */
|
46
|
+
|
47
|
+
/* s1= */
|
48
|
+
/* ERIK */
|
49
|
+
/* */
|
50
|
+
/* 01234 */
|
51
|
+
/* s2=V 11234 */
|
52
|
+
/* E 21234 */
|
53
|
+
/* E 32234 */
|
54
|
+
/* N 43334 <- prev_row */
|
55
|
+
/* S 54444 <- curr_row */
|
56
|
+
/* T 65555 */
|
57
|
+
/* R 76566 */
|
58
|
+
/* A 87667 */
|
59
|
+
|
60
|
+
/* Allocate memory for both rows */
|
61
|
+
|
62
|
+
prev_row = ALLOC_N(int, l1+1);
|
63
|
+
curr_row = ALLOC_N(int, l1+1);
|
64
|
+
|
65
|
+
if ((prev_row == NULL) || (curr_row == NULL)) {
|
66
|
+
rb_raise(rb_eNoMemError, "out of memory");
|
67
|
+
}
|
68
|
+
|
69
|
+
/* Initialize the current row. */
|
70
|
+
|
71
|
+
for (col=0; col<=l1; col++) {
|
72
|
+
curr_row[col] = col;
|
73
|
+
}
|
74
|
+
|
75
|
+
for (row=1; row<=l2; row++) {
|
76
|
+
/* Copy the current row to the previous row. */
|
77
|
+
|
78
|
+
memcpy(prev_row, curr_row, sizeof(int)*(l1+1));
|
79
|
+
|
80
|
+
/* Calculate the values of the current row. */
|
81
|
+
|
82
|
+
curr_row[0] = row;
|
83
|
+
curr_row_min = row;
|
84
|
+
|
85
|
+
for (col=1; col<=l1; col++) {
|
86
|
+
/* Equal (cost=0) or substitution (cost=1). */
|
87
|
+
|
88
|
+
curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
|
89
|
+
|
90
|
+
/* Insertion if it's cheaper than substitution. */
|
91
|
+
|
92
|
+
if (prev_row[col]+1 < curr_row[col]) {
|
93
|
+
curr_row[col] = prev_row[col]+1;
|
94
|
+
}
|
95
|
+
|
96
|
+
/* Deletion if it's cheaper than substitution. */
|
97
|
+
|
98
|
+
if (curr_row[col-1]+1 < curr_row[col]) {
|
99
|
+
curr_row[col] = curr_row[col-1]+1;
|
100
|
+
}
|
101
|
+
|
102
|
+
/* Keep track of the minimum value on this row. */
|
103
|
+
|
104
|
+
if (curr_row[col] < curr_row_min) {
|
105
|
+
curr_row_min = curr_row[col];
|
106
|
+
}
|
107
|
+
}
|
108
|
+
|
109
|
+
/* Return nil as soon as we exceed the threshold. */
|
110
|
+
|
111
|
+
if (threshold > -1 && curr_row_min >= threshold) {
|
112
|
+
free(prev_row);
|
113
|
+
free(curr_row);
|
114
|
+
|
115
|
+
return Qnil;
|
116
|
+
}
|
117
|
+
}
|
118
|
+
|
119
|
+
/* The result is the last value on the last row. */
|
120
|
+
|
121
|
+
result = curr_row[l1];
|
122
|
+
|
123
|
+
free(prev_row);
|
124
|
+
free(curr_row);
|
125
|
+
|
126
|
+
/* Return the Ruby version of the result. */
|
127
|
+
|
128
|
+
return INT2FIX(result);
|
129
|
+
}
|
@@ -1,25 +1,25 @@
|
|
1
1
|
#include "ruby.h"
|
2
2
|
|
3
|
-
|
4
|
-
VALUE rb_s3;
|
3
|
+
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
4
|
int threshold;
|
6
|
-
int l1, l2
|
7
|
-
char *s1, *s2, *s3;
|
5
|
+
int l1, l2;
|
8
6
|
int *prev_row, *curr_row;
|
9
7
|
int col, row;
|
10
8
|
int curr_row_min, result;
|
9
|
+
int offset;
|
10
|
+
char *s1, *s2;
|
11
11
|
|
12
12
|
/* Convert Ruby's s1 to C's s1. */
|
13
13
|
|
14
|
-
|
15
|
-
s1 = RSTRING(
|
16
|
-
l1 = RSTRING(
|
14
|
+
rb_o1 = StringValue(rb_o1);
|
15
|
+
s1 = RSTRING(rb_o1)->ptr;
|
16
|
+
l1 = RSTRING(rb_o1)->len;
|
17
17
|
|
18
18
|
/* Convert Ruby's s2 to C's s2. */
|
19
19
|
|
20
|
-
|
21
|
-
s2 = RSTRING(
|
22
|
-
l2 = RSTRING(
|
20
|
+
rb_o2 = StringValue(rb_o2);
|
21
|
+
s2 = RSTRING(rb_o2)->ptr;
|
22
|
+
l2 = RSTRING(rb_o2)->len;
|
23
23
|
|
24
24
|
/* Convert Ruby's threshold to C's threshold. */
|
25
25
|
|
@@ -29,7 +29,24 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
|
|
29
29
|
threshold = -1;
|
30
30
|
}
|
31
31
|
|
32
|
-
/*
|
32
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
33
|
+
|
34
|
+
offset = 0;
|
35
|
+
while (s1[offset] == s2[offset]) {
|
36
|
+
offset++;
|
37
|
+
}
|
38
|
+
|
39
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
40
|
+
|
41
|
+
while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
|
42
|
+
l1--;
|
43
|
+
l2--;
|
44
|
+
}
|
45
|
+
|
46
|
+
l1 -= offset;
|
47
|
+
l2 -= offset;
|
48
|
+
|
49
|
+
/* The Levenshtein algorithm itself. */
|
33
50
|
|
34
51
|
/* s1= */
|
35
52
|
/* ERIK */
|
@@ -43,7 +60,7 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
|
|
43
60
|
/* T 65555 */
|
44
61
|
/* R 76566 */
|
45
62
|
/* A 87667 */
|
46
|
-
|
63
|
+
|
47
64
|
/* Allocate memory for both rows */
|
48
65
|
|
49
66
|
prev_row = ALLOC_N(int, l1+1);
|
@@ -70,9 +87,9 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
|
|
70
87
|
curr_row_min = row;
|
71
88
|
|
72
89
|
for (col=1; col<=l1; col++) {
|
73
|
-
/* Equal (cost=0) or
|
90
|
+
/* Equal (cost=0) or substitution (cost=1). */
|
74
91
|
|
75
|
-
curr_row[col] = prev_row[col-1] + ((s1[col-1] == s2[row-1]) ? 0 : 1);
|
92
|
+
curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
|
76
93
|
|
77
94
|
/* Insertion if it's cheaper than substitution. */
|
78
95
|
|
@@ -114,9 +131,3 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
|
|
114
131
|
|
115
132
|
return INT2FIX(result);
|
116
133
|
}
|
117
|
-
|
118
|
-
void Init_levenshtein_c() {
|
119
|
-
VALUE mLevenshtein = rb_define_module("Levenshtein");
|
120
|
-
|
121
|
-
rb_define_singleton_method(mLevenshtein, "distance_part2_fast" , levenshtein_distance_part2, 3);
|
122
|
-
}
|
data/lib/levenshtein.rb
CHANGED
@@ -1,30 +1,34 @@
|
|
1
1
|
begin
|
2
|
-
require "levenshtein/
|
2
|
+
require "levenshtein/levenshtein_fast" # If compiled by RubyGems.
|
3
3
|
rescue LoadError
|
4
4
|
begin
|
5
|
-
require "
|
5
|
+
require "levenshtein_fast" # If compiled by the build script.
|
6
6
|
rescue LoadError
|
7
|
-
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.
|
7
|
+
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
|
8
8
|
end
|
9
9
|
end
|
10
10
|
|
11
|
-
# The Levenshtein distance is a metric for measuring the amount
|
12
|
-
# between two sequences (i.e., the so called edit
|
13
|
-
# distance between two
|
14
|
-
#
|
15
|
-
#
|
11
|
+
# The Levenshtein distance is a metric for measuring the amount
|
12
|
+
# of difference between two sequences (i.e., the so called edit
|
13
|
+
# distance). The Levenshtein distance between two sequences is
|
14
|
+
# given by the minimum number of operations needed to transform
|
15
|
+
# one sequence into the other, where an operation is an
|
16
|
+
# insertion, deletion, or substitution of a single element.
|
16
17
|
#
|
17
18
|
# More information about the Levenshtein distance algorithm:
|
18
19
|
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
19
20
|
|
20
21
|
module Levenshtein
|
21
|
-
|
22
|
-
|
22
|
+
VERSION = "0.2.0"
|
23
|
+
|
24
|
+
# Returns the Levenshtein distance as a number between 0.0 and
|
25
|
+
# 1.0. It's basically the Levenshtein distance divided by the
|
26
|
+
# length of the longest sequence.
|
23
27
|
|
24
28
|
def self.normalized_distance(s1, s2, threshold=nil)
|
25
29
|
s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
|
26
30
|
|
27
|
-
if s2.
|
31
|
+
if s2.length == 0
|
28
32
|
0.0 # Since s1.length < s2.length, s1 must be empty as well.
|
29
33
|
else
|
30
34
|
if threshold
|
@@ -39,46 +43,49 @@ module Levenshtein
|
|
39
43
|
end
|
40
44
|
end
|
41
45
|
|
42
|
-
# Returns the Levenshtein distance between two
|
46
|
+
# Returns the Levenshtein distance between two sequences.
|
47
|
+
#
|
48
|
+
# The two sequences can be two strings, two arrays, or two other
|
49
|
+
# objects. Strings, arrays and arrays of strings are handled with
|
50
|
+
# optimized (very fast) C code. All other sequences are handled
|
51
|
+
# with generic (fast) C code.
|
52
|
+
#
|
53
|
+
# The sequences should respond to :length and :[] and all objects
|
54
|
+
# in the sequences (as returned by []) should respond to :==.
|
43
55
|
|
44
56
|
def self.distance(s1, s2, threshold=nil)
|
45
57
|
s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
|
46
58
|
|
47
59
|
# Handle some basic circumstances.
|
48
60
|
|
49
|
-
return 0
|
50
|
-
return s2.length if s1.
|
51
|
-
return nil if threshold and (s2.length-s1.length) >= threshold
|
52
|
-
return nil if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
|
53
|
-
return nil if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold
|
54
|
-
|
55
|
-
# Do the expensive calculation on a subset of the strings only, if possible.
|
61
|
+
return 0 if s1 == s2
|
62
|
+
return s2.length if s1.length == 0
|
56
63
|
|
57
|
-
|
58
|
-
|
59
|
-
e2 = s2.length-1
|
64
|
+
if threshold
|
65
|
+
return nil if (s2.length-s1.length) >= threshold
|
60
66
|
|
61
|
-
|
62
|
-
|
63
|
-
|
67
|
+
a1, a2 = nil, nil
|
68
|
+
a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
|
69
|
+
a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
|
64
70
|
|
65
|
-
|
66
|
-
|
67
|
-
|
71
|
+
if a1 and a2
|
72
|
+
return nil if (a1-a2).length >= threshold
|
73
|
+
return nil if (a2-a1).length >= threshold
|
74
|
+
end
|
68
75
|
end
|
69
76
|
|
70
|
-
|
77
|
+
distance_fast_or_slow(s1, s2, threshold)
|
71
78
|
end
|
72
79
|
|
73
|
-
def self.
|
74
|
-
if respond_to?(:
|
75
|
-
|
80
|
+
def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
|
81
|
+
if respond_to?(:levenshtein_distance_fast)
|
82
|
+
levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
|
76
83
|
else
|
77
|
-
|
84
|
+
levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
|
78
85
|
end
|
79
86
|
end
|
80
87
|
|
81
|
-
def self.
|
88
|
+
def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
|
82
89
|
row = (0..s1.length).to_a
|
83
90
|
|
84
91
|
1.upto(s2.length) do |y|
|
@@ -89,8 +96,10 @@ module Levenshtein
|
|
89
96
|
row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
|
90
97
|
end
|
91
98
|
|
92
|
-
# Stop analysing this
|
93
|
-
#
|
99
|
+
# Stop analysing this sequence as soon as the best possible
|
100
|
+
# result for this sequence is bigger than the best result so far.
|
101
|
+
# (The minimum value in the next row will be equal to or greater
|
102
|
+
# than the minimum value in this row.)
|
94
103
|
|
95
104
|
return nil if threshold and row.min >= threshold
|
96
105
|
end
|
data/test/test.rb
CHANGED
@@ -1,7 +1,35 @@
|
|
1
1
|
require "test/unit"
|
2
2
|
require "levenshtein"
|
3
3
|
|
4
|
-
|
4
|
+
module Levenshtein
|
5
|
+
class TestSequence
|
6
|
+
def initialize(o)
|
7
|
+
@sequence = o
|
8
|
+
end
|
9
|
+
|
10
|
+
def length
|
11
|
+
@sequence.length
|
12
|
+
end
|
13
|
+
|
14
|
+
def [](pos)
|
15
|
+
@sequence[pos]
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
class TestElement
|
20
|
+
attr_reader :object
|
21
|
+
|
22
|
+
def initialize(o)
|
23
|
+
@object = o
|
24
|
+
end
|
25
|
+
|
26
|
+
def ==(other)
|
27
|
+
@object == other.object
|
28
|
+
end
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
class TestLevenshteinString < Test::Unit::TestCase
|
5
33
|
def test_erik_veenstra
|
6
34
|
assert_equal(7, Levenshtein.distance("erik", "veenstra"))
|
7
35
|
assert_equal(7, Levenshtein.distance("veenstra", "erik"))
|
@@ -30,9 +58,11 @@ class TestLevenshtein < Test::Unit::TestCase
|
|
30
58
|
|
31
59
|
def test_threshold
|
32
60
|
assert_equal(3, Levenshtein.distance("foo", "foobar"))
|
61
|
+
assert_equal(3, Levenshtein.distance("foo", "foobar", 4))
|
33
62
|
assert_equal(nil, Levenshtein.distance("foo", "foobar", 2))
|
34
63
|
|
35
64
|
assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01)
|
65
|
+
assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar", 0.66), 0.01)
|
36
66
|
assert_equal(nil, Levenshtein.normalized_distance("foo", "foobar", 0.30))
|
37
67
|
end
|
38
68
|
|
@@ -45,47 +75,51 @@ class TestLevenshtein < Test::Unit::TestCase
|
|
45
75
|
assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01)
|
46
76
|
assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01)
|
47
77
|
assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
|
78
|
+
assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
|
48
79
|
end
|
49
80
|
end
|
50
81
|
|
51
|
-
class
|
82
|
+
class TestLevenshteinArray < Test::Unit::TestCase
|
52
83
|
def test_erik_veenstra
|
53
|
-
|
54
|
-
end
|
84
|
+
x = lambda{|s| s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}}
|
55
85
|
|
56
|
-
|
57
|
-
assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
|
58
|
-
assert_equal(3, Levenshtein.distance_part2_slow("", "foo", nil))
|
86
|
+
assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
|
59
87
|
end
|
88
|
+
end
|
60
89
|
|
61
|
-
|
62
|
-
|
63
|
-
|
90
|
+
class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
|
91
|
+
def test_erik_veenstra
|
92
|
+
x = lambda{|s| s.scan(/./)}
|
93
|
+
|
94
|
+
assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
|
64
95
|
end
|
96
|
+
end
|
65
97
|
|
66
|
-
|
67
|
-
|
68
|
-
|
98
|
+
class TestLevenshteinGeneric < Test::Unit::TestCase
|
99
|
+
def test_erik_veenstra
|
100
|
+
x = lambda{|s| Levenshtein::TestSequence.new(s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)})}
|
101
|
+
|
102
|
+
assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
|
69
103
|
end
|
70
104
|
end
|
71
105
|
|
72
|
-
class
|
106
|
+
class TestLevenshteinSlow < Test::Unit::TestCase
|
73
107
|
def test_erik_veenstra
|
74
|
-
assert_equal(7, Levenshtein.
|
108
|
+
assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
|
75
109
|
end
|
76
110
|
|
77
|
-
def
|
78
|
-
assert_equal(0, Levenshtein.
|
79
|
-
assert_equal(3, Levenshtein.
|
111
|
+
def test_empty_sequence
|
112
|
+
assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
|
113
|
+
assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
|
80
114
|
end
|
81
115
|
|
82
|
-
def
|
83
|
-
assert_equal(0, Levenshtein.
|
84
|
-
assert_equal(0, Levenshtein.
|
116
|
+
def test_same_sequence
|
117
|
+
assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
|
118
|
+
assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
|
85
119
|
end
|
86
120
|
|
87
121
|
def test_threshold
|
88
|
-
assert_equal(3, Levenshtein.
|
89
|
-
assert_equal(nil, Levenshtein.
|
122
|
+
assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
|
123
|
+
assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
|
90
124
|
end
|
91
125
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: levenshtein
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erik Veenstra
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-07-11 00:00:00 +02:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -23,15 +23,20 @@ extra_rdoc_files: []
|
|
23
23
|
|
24
24
|
files:
|
25
25
|
- lib/levenshtein.rb
|
26
|
-
- ext/levenshtein
|
27
26
|
- ext/levenshtein/extconf.rb
|
28
|
-
- ext/levenshtein/
|
27
|
+
- ext/levenshtein/levenshtein_array_of_strings.c
|
28
|
+
- ext/levenshtein/levenshtein_fast.c
|
29
|
+
- ext/levenshtein/levenshtein_string.c
|
30
|
+
- ext/levenshtein/levenshtein_generic.c
|
31
|
+
- ext/levenshtein/levenshtein_array.c
|
29
32
|
- README
|
30
33
|
- LICENSE
|
31
34
|
- VERSION
|
32
35
|
- CHANGELOG
|
33
36
|
has_rdoc: true
|
34
37
|
homepage: http://www.erikveen.dds.nl/levenshtein/index.html
|
38
|
+
licenses: []
|
39
|
+
|
35
40
|
post_install_message:
|
36
41
|
rdoc_options:
|
37
42
|
- README
|
@@ -39,7 +44,7 @@ rdoc_options:
|
|
39
44
|
- VERSION
|
40
45
|
- CHANGELOG
|
41
46
|
- --title
|
42
|
-
- levenshtein (0.1.1)
|
47
|
+
- levenshtein (0.2.0)
|
43
48
|
- --main
|
44
49
|
- README
|
45
50
|
require_paths:
|
@@ -59,9 +64,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
59
64
|
requirements: []
|
60
65
|
|
61
66
|
rubyforge_project: levenshtein
|
62
|
-
rubygems_version: 1.
|
67
|
+
rubygems_version: 1.3.4
|
63
68
|
signing_key:
|
64
|
-
specification_version:
|
69
|
+
specification_version: 3
|
65
70
|
summary: Calculates the Levenshtein distance between two byte strings.
|
66
71
|
test_files:
|
67
72
|
- test/test.rb
|