levenshtein 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -0
- data/README +7 -3
- data/VERSION +1 -1
- data/ext/levenshtein/extconf.rb +6 -1
- data/ext/levenshtein/levenshtein_array.c +127 -0
- data/ext/levenshtein/levenshtein_array_of_strings.c +125 -0
- data/ext/levenshtein/levenshtein_fast.c +21 -0
- data/ext/levenshtein/levenshtein_generic.c +129 -0
- data/ext/levenshtein/{levenshtein_c.c → levenshtein_string.c} +31 -20
- data/lib/levenshtein.rb +45 -36
- data/test/test.rb +57 -23
- metadata +12 -7
data/CHANGELOG
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
0.2.0 (11-07-2009)
|
2
|
+
|
3
|
+
* Return 0 instead of 0.0 in case of empty strings.
|
4
|
+
|
5
|
+
* Added specific support for arrays.
|
6
|
+
|
7
|
+
* Added specific support for arrays of strings.
|
8
|
+
|
9
|
+
* Added generic support for all (?) kinds of sequences.
|
10
|
+
|
11
|
+
* Moved a lot of code to the C world.
|
12
|
+
|
1
13
|
0.1.1 (06-10-2008)
|
2
14
|
|
3
15
|
* If one of the strings was both the begin and the end of the
|
data/README
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
The Levenshtein distance is a metric for measuring the amount of difference
|
2
2
|
between two sequences (i.e., the so-called edit distance). The Levenshtein
|
3
|
-
distance between two
|
4
|
-
needed to transform one
|
5
|
-
insertion, deletion, or substitution of a single
|
3
|
+
distance between two sequences is given by the minimum number of operations
|
4
|
+
needed to transform one sequence into the other, where an operation is an
|
5
|
+
insertion, deletion, or substitution of a single element.
|
6
|
+
|
7
|
+
The two sequences can be two strings, two arrays, or two other objects.
|
8
|
+
Strings, arrays and arrays of strings are handled with optimized (very fast) C
|
9
|
+
code. All other sequences are handled with generic (fast) C code.
|
6
10
|
|
7
11
|
More information about the Levenshtein distance algorithm:
|
8
12
|
http://en.wikipedia.org/wiki/Levenshtein_distance .
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.2.0
|
data/ext/levenshtein/extconf.rb
CHANGED
@@ -2,4 +2,9 @@ require "mkmf"
|
|
2
2
|
|
3
3
|
dir_config("levenshtein")
|
4
4
|
|
5
|
-
|
5
|
+
have_library("levenshtein_array")
|
6
|
+
have_library("levenshtein_array_of_strings")
|
7
|
+
have_library("levenshtein_generic")
|
8
|
+
have_library("levenshtein_string")
|
9
|
+
|
10
|
+
create_makefile("levenshtein/levenshtein_fast")
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Levenshtein distance between two Ruby arrays whose elements are compared
 * with "==".
 *
 * self         - receiver (unused).
 * rb_o1, rb_o2 - the two arrays (read through RARRAY and rb_ary_entry).
 * rb_threshold - nil for "no limit", otherwise a Fixnum.
 *
 * Returns the distance as a Fixnum, or nil as soon as the smallest value in
 * a row reaches the threshold.
 *
 * NOTE: if a user-defined "==" raises, the exception unwinds past the xfree
 * calls below and both rows leak.
 */
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  int l1, l2;
  int *prev_row, *curr_row, *swap_row;
  int col, row;
  int curr_row_min, result;
  int offset;
  VALUE row_element;

  ID id_eql = rb_intern("==");

  /* Get the sizes of both arrays. */

  l1 = RARRAY(rb_o1)->len;
  l2 = RARRAY(rb_o2)->len;

  /* Convert Ruby's threshold to C's threshold. */

  if (!NIL_P(rb_threshold)) {
    threshold = FIX2INT(rb_threshold);
  } else {
    threshold = -1;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
  /* The bounds checks are required: rb_ary_entry returns nil past the end, */
  /* and nil == nil would keep this loop running forever. */

  offset = 0;
  while ((offset < l1) && (offset < l2) && RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset)))) {
    offset++;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */

  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
    l1--;
    l2--;
  }

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate memory for both rows. ALLOC_N raises NoMemError itself on */
  /* failure, so no NULL check is needed. */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = col;
  }

  for (row=1; row<=l2; row++) {
    /* The current row becomes the previous row. Swapping the pointers */
    /* avoids a copy; every cell of the new current row is written below */
    /* before it is read. */

    swap_row = prev_row;
    prev_row = curr_row;
    curr_row = swap_row;

    /* The element of the second array is the same for the whole row. */

    row_element = rb_ary_entry(rb_o2, offset+row-1);

    /* Calculate the values of the current row. */

    curr_row[0] = row;
    curr_row_min = row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, row_element)) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as we exceed the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      /* Memory obtained from ALLOC_N must be released with xfree. */
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -0,0 +1,125 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Equality of two array elements: rb_str_cmp when both are Strings, "=="
 * otherwise. The fallback keeps a stray non-String element from being
 * handed to rb_str_cmp.
 */
static int levenshtein_string_elements_equal(VALUE rb_e1, VALUE rb_e2) {
  if ((TYPE(rb_e1) == T_STRING) && (TYPE(rb_e2) == T_STRING)) {
    return rb_str_cmp(rb_e1, rb_e2) == 0;
  }

  return RTEST(rb_funcall(rb_e1, rb_intern("=="), 1, rb_e2)) ? 1 : 0;
}

/*
 * Levenshtein distance between two Ruby arrays of Strings.
 *
 * self         - receiver (unused).
 * rb_o1, rb_o2 - the two arrays (read through RARRAY and rb_ary_entry).
 * rb_threshold - nil for "no limit", otherwise a Fixnum.
 *
 * Returns the distance as a Fixnum, or nil as soon as the smallest value in
 * a row reaches the threshold.
 */
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  int l1, l2;
  int *prev_row, *curr_row, *swap_row;
  int col, row;
  int curr_row_min, result;
  int offset;
  VALUE row_element;

  /* Get the sizes of both arrays. */

  l1 = RARRAY(rb_o1)->len;
  l2 = RARRAY(rb_o2)->len;

  /* Convert Ruby's threshold to C's threshold. */

  if (!NIL_P(rb_threshold)) {
    threshold = FIX2INT(rb_threshold);
  } else {
    threshold = -1;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
  /* The bounds checks are required: rb_ary_entry returns nil past the end */
  /* of an array, and nil must never reach the comparison. */

  offset = 0;
  while ((offset < l1) && (offset < l2) && levenshtein_string_elements_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset))) {
    offset++;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */

  while ((l1-1 > offset) && (l2-1 > offset) && levenshtein_string_elements_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1))) {
    l1--;
    l2--;
  }

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate memory for both rows. ALLOC_N raises NoMemError itself on */
  /* failure, so no NULL check is needed. */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = col;
  }

  for (row=1; row<=l2; row++) {
    /* The current row becomes the previous row. Swapping the pointers */
    /* avoids a copy; every cell of the new current row is written below */
    /* before it is read. */

    swap_row = prev_row;
    prev_row = curr_row;
    curr_row = swap_row;

    /* The element of the second array is the same for the whole row. */

    row_element = rb_ary_entry(rb_o2, offset+row-1);

    /* Calculate the values of the current row. */

    curr_row[0] = row;
    curr_row_min = row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      curr_row[col] = prev_row[col-1] + (levenshtein_string_elements_equal(rb_ary_entry(rb_o1, offset+col-1), row_element) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as we exceed the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      /* Memory obtained from ALLOC_N must be released with xfree. */
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Prototypes of the specialised implementations, which live in the other
 * source files of this extension. Without them the calls below would be
 * implicit declarations returning int, which truncates VALUE where it is
 * wider than int.
 */
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);

/*
 * Returns 1 when rb_ary is a non-empty array in which every element is a
 * String, 0 otherwise. An empty array yields 0 so that it keeps taking the
 * plain array path.
 */
static int levenshtein_all_strings(VALUE rb_ary) {
  long i;
  long len = RARRAY(rb_ary)->len;

  if (len == 0) {
    return 0;
  }

  for (i = 0; i < len; i++) {
    if (TYPE(rb_ary_entry(rb_ary, i)) != T_STRING) {
      return 0;
    }
  }

  return 1;
}

/*
 * Dispatches to the implementation that matches the type of both sequences:
 * two Strings, two Arrays holding only Strings, two other Arrays, or
 * anything else (generic).
 *
 * Returns whatever the selected implementation returns.
 */
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2) == T_STRING)) {
    return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
  }

  if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2) == T_ARRAY)) {
    /* The String-specific code is only safe when every element really is */
    /* a String, so checking the first element alone is not enough. */
    if (levenshtein_all_strings(rb_o1) && levenshtein_all_strings(rb_o2)) {
      return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
    }

    return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
  }

  return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
}
|
16
|
+
|
17
|
+
void Init_levenshtein_fast() {
|
18
|
+
VALUE mLevenshtein = rb_define_module("Levenshtein");
|
19
|
+
|
20
|
+
rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
|
21
|
+
}
|
@@ -0,0 +1,129 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Levenshtein distance between two arbitrary Ruby sequences.
 *
 * Both sequences are accessed only through "length" and "[]"; their elements
 * are compared with "==".
 *
 * self         - receiver (unused).
 * rb_o1, rb_o2 - the two sequences.
 * rb_threshold - nil for "no limit", otherwise a Fixnum.
 *
 * Returns the distance as a Fixnum, or nil as soon as the smallest value in
 * a row reaches the threshold. Raises ArgumentError when a sequence reports
 * a negative length.
 *
 * NOTE: if one of the called Ruby methods raises after the rows have been
 * allocated, the exception unwinds past the xfree calls below and both rows
 * leak.
 */
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  int l1, l2;
  int *prev_row, *curr_row, *swap_row;
  int col, row;
  int curr_row_min, result;
  int offset;
  VALUE row_element;

  ID id_length = rb_intern("length");
  ID id_get = rb_intern("[]");
  ID id_equal = rb_intern("==");

  /* Get the sizes of both sequences. NUM2INT (unlike FIX2INT) checks that */
  /* "length" really returned an integer that fits in an int. */

  l1 = NUM2INT(rb_funcall(rb_o1, id_length, 0));
  l2 = NUM2INT(rb_funcall(rb_o2, id_length, 0));

  if ((l1 < 0) || (l2 < 0)) {
    rb_raise(rb_eArgError, "negative sequence length");
  }

  /* Convert Ruby's threshold to C's threshold. */

  if (!NIL_P(rb_threshold)) {
    threshold = FIX2INT(rb_threshold);
  } else {
    threshold = -1;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
  /* The bounds checks keep "[]" from being called past the end of either */
  /* sequence and guarantee that the loop terminates. */

  offset = 0;
  while ((offset < l1) && (offset < l2) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
    offset++;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */

  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
    l1--;
    l2--;
  }

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate memory for both rows. ALLOC_N raises NoMemError itself on */
  /* failure, so no NULL check is needed. */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = col;
  }

  for (row=1; row<=l2; row++) {
    /* The current row becomes the previous row. Swapping the pointers */
    /* avoids a copy; every cell of the new current row is written below */
    /* before it is read. */

    swap_row = prev_row;
    prev_row = curr_row;
    curr_row = swap_row;

    /* The element of the second sequence is the same for the whole row, */
    /* so it is fetched once per row. */

    row_element = rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1));

    /* Calculate the values of the current row. */

    curr_row[0] = row;
    curr_row_min = row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, row_element)) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as we exceed the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      /* Memory obtained from ALLOC_N must be released with xfree. */
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -1,25 +1,25 @@
|
|
1
1
|
#include "ruby.h"
|
2
2
|
|
3
|
-
|
4
|
-
VALUE rb_s3;
|
3
|
+
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
5
4
|
int threshold;
|
6
|
-
int l1, l2
|
7
|
-
char *s1, *s2, *s3;
|
5
|
+
int l1, l2;
|
8
6
|
int *prev_row, *curr_row;
|
9
7
|
int col, row;
|
10
8
|
int curr_row_min, result;
|
9
|
+
int offset;
|
10
|
+
char *s1, *s2;
|
11
11
|
|
12
12
|
/* Convert Ruby's s1 to C's s1. */
|
13
13
|
|
14
|
-
|
15
|
-
s1 = RSTRING(
|
16
|
-
l1 = RSTRING(
|
14
|
+
rb_o1 = StringValue(rb_o1);
|
15
|
+
s1 = RSTRING(rb_o1)->ptr;
|
16
|
+
l1 = RSTRING(rb_o1)->len;
|
17
17
|
|
18
18
|
/* Convert Ruby's s2 to C's s2. */
|
19
19
|
|
20
|
-
|
21
|
-
s2 = RSTRING(
|
22
|
-
l2 = RSTRING(
|
20
|
+
rb_o2 = StringValue(rb_o2);
|
21
|
+
s2 = RSTRING(rb_o2)->ptr;
|
22
|
+
l2 = RSTRING(rb_o2)->len;
|
23
23
|
|
24
24
|
/* Convert Ruby's threshold to C's threshold. */
|
25
25
|
|
@@ -29,7 +29,24 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
|
|
29
29
|
threshold = -1;
|
30
30
|
}
|
31
31
|
|
32
|
-
/*
|
32
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
33
|
+
|
34
|
+
offset = 0;
|
35
|
+
while (s1[offset] == s2[offset]) {
|
36
|
+
offset++;
|
37
|
+
}
|
38
|
+
|
39
|
+
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
40
|
+
|
41
|
+
while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
|
42
|
+
l1--;
|
43
|
+
l2--;
|
44
|
+
}
|
45
|
+
|
46
|
+
l1 -= offset;
|
47
|
+
l2 -= offset;
|
48
|
+
|
49
|
+
/* The Levenshtein algorithm itself. */
|
33
50
|
|
34
51
|
/* s1= */
|
35
52
|
/* ERIK */
|
@@ -43,7 +60,7 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
|
|
43
60
|
/* T 65555 */
|
44
61
|
/* R 76566 */
|
45
62
|
/* A 87667 */
|
46
|
-
|
63
|
+
|
47
64
|
/* Allocate memory for both rows */
|
48
65
|
|
49
66
|
prev_row = ALLOC_N(int, l1+1);
|
@@ -70,9 +87,9 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
|
|
70
87
|
curr_row_min = row;
|
71
88
|
|
72
89
|
for (col=1; col<=l1; col++) {
|
73
|
-
/* Equal (cost=0) or
|
90
|
+
/* Equal (cost=0) or substitution (cost=1). */
|
74
91
|
|
75
|
-
curr_row[col] = prev_row[col-1] + ((s1[col-1] == s2[row-1]) ? 0 : 1);
|
92
|
+
curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
|
76
93
|
|
77
94
|
/* Insertion if it's cheaper than substitution. */
|
78
95
|
|
@@ -114,9 +131,3 @@ static VALUE levenshtein_distance_part2(VALUE self, VALUE rb_s1, VALUE rb_s2, VA
|
|
114
131
|
|
115
132
|
return INT2FIX(result);
|
116
133
|
}
|
117
|
-
|
118
|
-
void Init_levenshtein_c() {
|
119
|
-
VALUE mLevenshtein = rb_define_module("Levenshtein");
|
120
|
-
|
121
|
-
rb_define_singleton_method(mLevenshtein, "distance_part2_fast" , levenshtein_distance_part2, 3);
|
122
|
-
}
|
data/lib/levenshtein.rb
CHANGED
@@ -1,30 +1,34 @@
|
|
1
1
|
begin
|
2
|
-
require "levenshtein/
|
2
|
+
require "levenshtein/levenshtein_fast" # If compiled by RubyGems.
|
3
3
|
rescue LoadError
|
4
4
|
begin
|
5
|
-
require "
|
5
|
+
require "levenshtein_fast" # If compiled by the build script.
|
6
6
|
rescue LoadError
|
7
|
-
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.
|
7
|
+
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
|
8
8
|
end
|
9
9
|
end
|
10
10
|
|
11
|
-
# The Levenshtein distance is a metric for measuring the amount
|
12
|
-
# between two sequences (i.e., the so called edit
|
13
|
-
# distance between two
|
14
|
-
#
|
15
|
-
#
|
11
|
+
# The Levenshtein distance is a metric for measuring the amount
|
12
|
+
# of difference between two sequences (i.e., the so-called edit
|
13
|
+
# distance). The Levenshtein distance between two sequences is
|
14
|
+
# given by the minimum number of operations needed to transform
|
15
|
+
# one sequence into the other, where an operation is an
|
16
|
+
# insertion, deletion, or substitution of a single element.
|
16
17
|
#
|
17
18
|
# More information about the Levenshtein distance algorithm:
|
18
19
|
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
19
20
|
|
20
21
|
module Levenshtein
|
21
|
-
|
22
|
-
|
22
|
+
VERSION = "0.2.0"
|
23
|
+
|
24
|
+
# Returns the Levenshtein distance as a number between 0.0 and
|
25
|
+
# 1.0. It's basically the Levenshtein distance divided by the
|
26
|
+
# length of the longest sequence.
|
23
27
|
|
24
28
|
def self.normalized_distance(s1, s2, threshold=nil)
|
25
29
|
s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
|
26
30
|
|
27
|
-
if s2.
|
31
|
+
if s2.length == 0
|
28
32
|
0.0 # Since s1.length < s2.length, s1 must be empty as well.
|
29
33
|
else
|
30
34
|
if threshold
|
@@ -39,46 +43,49 @@ module Levenshtein
|
|
39
43
|
end
|
40
44
|
end
|
41
45
|
|
42
|
-
# Returns the Levenshtein distance between two
|
46
|
+
# Returns the Levenshtein distance between two sequences.
|
47
|
+
#
|
48
|
+
# The two sequences can be two strings, two arrays, or two other
|
49
|
+
# objects. Strings, arrays and arrays of strings are handled with
|
50
|
+
# optimized (very fast) C code. All other sequences are handled
|
51
|
+
# with generic (fast) C code.
|
52
|
+
#
|
53
|
+
# The sequences should respond to :length and :[] and all objects
|
54
|
+
# in the sequences (as returned by []) should respond to :==.
|
43
55
|
|
44
56
|
def self.distance(s1, s2, threshold=nil)
|
45
57
|
s1, s2 = s2, s1 if s1.length > s2.length # s1 is the short one; s2 is the long one.
|
46
58
|
|
47
59
|
# Handle some basic circumstances.
|
48
60
|
|
49
|
-
return 0
|
50
|
-
return s2.length if s1.
|
51
|
-
return nil if threshold and (s2.length-s1.length) >= threshold
|
52
|
-
return nil if threshold and (s1.scan(/./) - s2.scan(/./)).length >= threshold
|
53
|
-
return nil if threshold and (s2.scan(/./) - s1.scan(/./)).length >= threshold
|
54
|
-
|
55
|
-
# Do the expensive calculation on a subset of the strings only, if possible.
|
61
|
+
return 0 if s1 == s2
|
62
|
+
return s2.length if s1.length == 0
|
56
63
|
|
57
|
-
|
58
|
-
|
59
|
-
e2 = s2.length-1
|
64
|
+
if threshold
|
65
|
+
return nil if (s2.length-s1.length) >= threshold
|
60
66
|
|
61
|
-
|
62
|
-
|
63
|
-
|
67
|
+
a1, a2 = nil, nil
|
68
|
+
a1, a2 = s1, s2 if s1.respond_to?(:-) and s2.respond_to?(:-)
|
69
|
+
a1, a2 = s1.scan(/./), s2.scan(/./) if s1.respond_to?(:scan) and s2.respond_to?(:scan)
|
64
70
|
|
65
|
-
|
66
|
-
|
67
|
-
|
71
|
+
if a1 and a2
|
72
|
+
return nil if (a1-a2).length >= threshold
|
73
|
+
return nil if (a2-a1).length >= threshold
|
74
|
+
end
|
68
75
|
end
|
69
76
|
|
70
|
-
|
77
|
+
distance_fast_or_slow(s1, s2, threshold)
|
71
78
|
end
|
72
79
|
|
73
|
-
def self.
|
74
|
-
if respond_to?(:
|
75
|
-
|
80
|
+
def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
|
81
|
+
if respond_to?(:levenshtein_distance_fast)
|
82
|
+
levenshtein_distance_fast(s1, s2, threshold) # Implemented in C.
|
76
83
|
else
|
77
|
-
|
84
|
+
levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
|
78
85
|
end
|
79
86
|
end
|
80
87
|
|
81
|
-
def self.
|
88
|
+
def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
|
82
89
|
row = (0..s1.length).to_a
|
83
90
|
|
84
91
|
1.upto(s2.length) do |y|
|
@@ -89,8 +96,10 @@ module Levenshtein
|
|
89
96
|
row[x] = [prow[x]+1, row[x-1]+1, prow[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)].min
|
90
97
|
end
|
91
98
|
|
92
|
-
# Stop analysing this
|
93
|
-
#
|
99
|
+
# Stop analysing this sequence as soon as the best possible
|
100
|
+
# result for this sequence is bigger than the best result so far.
|
101
|
+
# (The minimum value in the next row will be equal to or greater
|
102
|
+
# than the minimum value in this row.)
|
94
103
|
|
95
104
|
return nil if threshold and row.min >= threshold
|
96
105
|
end
|
data/test/test.rb
CHANGED
@@ -1,7 +1,35 @@
|
|
1
1
|
require "test/unit"
|
2
2
|
require "levenshtein"
|
3
3
|
|
4
|
-
|
4
|
+
module Levenshtein
  # Minimal sequence used by the tests: it exposes only #length and #[],
  # both delegated to the wrapped object.
  class TestSequence
    def initialize(o)
      @sequence = o
    end

    # Number of elements in the wrapped object.
    def length
      @sequence.length
    end

    # Element at +pos+, as returned by the wrapped object's #[].
    def [](pos)
      @sequence[pos]
    end
  end

  # Minimal element used by the tests: two elements are equal when the
  # objects they wrap are equal.
  class TestElement
    attr_reader :object

    def initialize(o)
      @object = o
    end

    # Anything that is not a TestElement (e.g. the nil returned by #[] past
    # the end of a sequence) is simply unequal instead of raising
    # NoMethodError.
    def ==(other)
      other.is_a?(TestElement) && @object == other.object
    end
  end
end
|
31
|
+
|
32
|
+
class TestLevenshteinString < Test::Unit::TestCase
|
5
33
|
def test_erik_veenstra
|
6
34
|
assert_equal(7, Levenshtein.distance("erik", "veenstra"))
|
7
35
|
assert_equal(7, Levenshtein.distance("veenstra", "erik"))
|
@@ -30,9 +58,11 @@ class TestLevenshtein < Test::Unit::TestCase
|
|
30
58
|
|
31
59
|
def test_threshold
|
32
60
|
assert_equal(3, Levenshtein.distance("foo", "foobar"))
|
61
|
+
assert_equal(3, Levenshtein.distance("foo", "foobar", 4))
|
33
62
|
assert_equal(nil, Levenshtein.distance("foo", "foobar", 2))
|
34
63
|
|
35
64
|
assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01)
|
65
|
+
assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar", 0.66), 0.01)
|
36
66
|
assert_equal(nil, Levenshtein.normalized_distance("foo", "foobar", 0.30))
|
37
67
|
end
|
38
68
|
|
@@ -45,47 +75,51 @@ class TestLevenshtein < Test::Unit::TestCase
|
|
45
75
|
assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01)
|
46
76
|
assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01)
|
47
77
|
assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
|
78
|
+
assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
|
48
79
|
end
|
49
80
|
end
|
50
81
|
|
51
|
-
class
|
82
|
+
class TestLevenshteinArray < Test::Unit::TestCase
|
52
83
|
def test_erik_veenstra
|
53
|
-
|
54
|
-
end
|
84
|
+
x = lambda{|s| s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)}}
|
55
85
|
|
56
|
-
|
57
|
-
assert_equal(0, Levenshtein.distance_part2_slow("", "", nil))
|
58
|
-
assert_equal(3, Levenshtein.distance_part2_slow("", "foo", nil))
|
86
|
+
assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
|
59
87
|
end
|
88
|
+
end
|
60
89
|
|
61
|
-
|
62
|
-
|
63
|
-
|
90
|
+
class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
|
91
|
+
def test_erik_veenstra
|
92
|
+
x = lambda{|s| s.scan(/./)}
|
93
|
+
|
94
|
+
assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
|
64
95
|
end
|
96
|
+
end
|
65
97
|
|
66
|
-
|
67
|
-
|
68
|
-
|
98
|
+
class TestLevenshteinGeneric < Test::Unit::TestCase
|
99
|
+
def test_erik_veenstra
|
100
|
+
x = lambda{|s| Levenshtein::TestSequence.new(s.scan(/./).collect{|e| Levenshtein::TestElement.new(e)})}
|
101
|
+
|
102
|
+
assert_equal(7, Levenshtein.distance(x["erik"], x["veenstra"]))
|
69
103
|
end
|
70
104
|
end
|
71
105
|
|
72
|
-
class
|
106
|
+
class TestLevenshteinSlow < Test::Unit::TestCase
|
73
107
|
def test_erik_veenstra
|
74
|
-
assert_equal(7, Levenshtein.
|
108
|
+
assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
|
75
109
|
end
|
76
110
|
|
77
|
-
def
|
78
|
-
assert_equal(0, Levenshtein.
|
79
|
-
assert_equal(3, Levenshtein.
|
111
|
+
def test_empty_sequence
|
112
|
+
assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
|
113
|
+
assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
|
80
114
|
end
|
81
115
|
|
82
|
-
def
|
83
|
-
assert_equal(0, Levenshtein.
|
84
|
-
assert_equal(0, Levenshtein.
|
116
|
+
def test_same_sequence
|
117
|
+
assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
|
118
|
+
assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
|
85
119
|
end
|
86
120
|
|
87
121
|
def test_threshold
|
88
|
-
assert_equal(3, Levenshtein.
|
89
|
-
assert_equal(nil, Levenshtein.
|
122
|
+
assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
|
123
|
+
assert_equal(nil, Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
|
90
124
|
end
|
91
125
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: levenshtein
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erik Veenstra
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date:
|
12
|
+
date: 2009-07-11 00:00:00 +02:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -23,15 +23,20 @@ extra_rdoc_files: []
|
|
23
23
|
|
24
24
|
files:
|
25
25
|
- lib/levenshtein.rb
|
26
|
-
- ext/levenshtein
|
27
26
|
- ext/levenshtein/extconf.rb
|
28
|
-
- ext/levenshtein/
|
27
|
+
- ext/levenshtein/levenshtein_array_of_strings.c
|
28
|
+
- ext/levenshtein/levenshtein_fast.c
|
29
|
+
- ext/levenshtein/levenshtein_string.c
|
30
|
+
- ext/levenshtein/levenshtein_generic.c
|
31
|
+
- ext/levenshtein/levenshtein_array.c
|
29
32
|
- README
|
30
33
|
- LICENSE
|
31
34
|
- VERSION
|
32
35
|
- CHANGELOG
|
33
36
|
has_rdoc: true
|
34
37
|
homepage: http://www.erikveen.dds.nl/levenshtein/index.html
|
38
|
+
licenses: []
|
39
|
+
|
35
40
|
post_install_message:
|
36
41
|
rdoc_options:
|
37
42
|
- README
|
@@ -39,7 +44,7 @@ rdoc_options:
|
|
39
44
|
- VERSION
|
40
45
|
- CHANGELOG
|
41
46
|
- --title
|
42
|
-
- levenshtein (0.
|
47
|
+
- levenshtein (0.2.0)
|
43
48
|
- --main
|
44
49
|
- README
|
45
50
|
require_paths:
|
@@ -59,9 +64,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
59
64
|
requirements: []
|
60
65
|
|
61
66
|
rubyforge_project: levenshtein
|
62
|
-
rubygems_version: 1.
|
67
|
+
rubygems_version: 1.3.4
|
63
68
|
signing_key:
|
64
|
-
specification_version:
|
69
|
+
specification_version: 3
|
65
70
|
summary: Calculates the Levenshtein distance between two byte strings.
|
66
71
|
test_files:
|
67
72
|
- test/test.rb
|