levenshtein-19 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/CHANGELOG +26 -0
- data/Gemfile +4 -0
- data/LICENSE +15 -0
- data/README +15 -0
- data/Rakefile +1 -0
- data/ext/levenshtein/extconf.rb +10 -0
- data/ext/levenshtein/levenshtein_array.c +127 -0
- data/ext/levenshtein/levenshtein_array_of_strings.c +125 -0
- data/ext/levenshtein/levenshtein_fast.c +21 -0
- data/ext/levenshtein/levenshtein_generic.c +129 -0
- data/ext/levenshtein/levenshtein_string.c +133 -0
- data/levenshtein-19.gemspec +21 -0
- data/lib/levenshtein/version.rb +3 -0
- data/lib/levenshtein.rb +109 -0
- data/test/test.rb +125 -0
- metadata +64 -0
data/CHANGELOG
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
0.3.0 (11-20-2011)
|
2
|
+
|
3
|
+
* Use RARRAY_LEN, RSTRING_LEN, and RSTRING_PTR for 1.9 compatibility.
|
4
|
+
|
5
|
+
0.2.0 (11-07-2009)
|
6
|
+
|
7
|
+
* Return 0 instead of 0.0 in case of empty strings.
|
8
|
+
|
9
|
+
* Added specific support for arrays.
|
10
|
+
|
11
|
+
* Added specific support for arrays of strings.
|
12
|
+
|
13
|
+
* Added generic support for all (?) kinds of sequences.
|
14
|
+
|
15
|
+
* Moved a lot of code to the C world.
|
16
|
+
|
17
|
+
0.1.1 (06-10-2008)
|
18
|
+
|
19
|
+
* If one of the strings was both the beginning and the end of the
|
20
|
+
other string, it would be stripped from both ends. Example:
|
21
|
+
Levenshtein.distance("abracadabra", "abra") resulted in 3
|
22
|
+
instead of 7. It's fixed now.
|
23
|
+
|
24
|
+
0.1.0 (24-05-2008)
|
25
|
+
|
26
|
+
* First release.
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Copyright Erik Veenstra <levenshtein@erikveen.dds.nl>
|
2
|
+
#
|
3
|
+
# This program is free software; you can redistribute it and/or
|
4
|
+
# modify it under the terms of the GNU General Public License,
|
5
|
+
# version 2, as published by the Free Software Foundation.
|
6
|
+
#
|
7
|
+
# This program is distributed in the hope that it will be
|
8
|
+
# useful, but WITHOUT ANY WARRANTY; without even the implied
|
9
|
+
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
10
|
+
# PURPOSE. See the GNU General Public License for more details.
|
11
|
+
#
|
12
|
+
# You should have received a copy of the GNU General Public
|
13
|
+
# License along with this program; if not, write to the Free
|
14
|
+
# Software Foundation, Inc., 59 Temple Place, Suite 330,
|
15
|
+
# Boston, MA 02111-1307 USA.
|
data/README
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
The Levenshtein distance is a metric for measuring the amount of difference
|
2
|
+
between two sequences (i.e., the so-called edit distance). The Levenshtein
|
3
|
+
distance between two sequences is given by the minimum number of operations
|
4
|
+
needed to transform one sequence into the other, where an operation is an
|
5
|
+
insertion, deletion, or substitution of a single element.
|
6
|
+
|
7
|
+
The two sequences can be two strings, two arrays, or two other objects.
|
8
|
+
Strings, arrays and arrays of strings are handled with optimized (very fast) C
|
9
|
+
code. All other sequences are handled with generic (fast) C code.
|
10
|
+
|
11
|
+
More information about the Levenshtein distance algorithm:
|
12
|
+
http://en.wikipedia.org/wiki/Levenshtein_distance .
|
13
|
+
|
14
|
+
NOTE: This gem was written by Erik Veenstra. I have made slight modifications
|
15
|
+
to it for compatibility with Ruby 1.9.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Levenshtein distance between two Ruby arrays. Elements are compared by
 * sending "==" to the element of rb_o1 with the element of rb_o2 as argument.
 *
 * self          - receiver (unused).
 * rb_o1, rb_o2  - the arrays to compare. RARRAY_LEN is applied unchecked, so
 *                 the caller must pass real Array objects.
 * rb_threshold  - nil for "no limit", otherwise an Integer. As soon as every
 *                 value of a row is >= threshold, nil is returned.
 *
 * Returns the distance as a Fixnum, or nil when the threshold is reached.
 *
 * NOTE: if an element's "==" raises, the two row buffers are not released.
 */
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  int l1, l2;
  int *prev_row, *curr_row;
  int col, row;
  int curr_row_min, result;
  int offset;

  ID id_eql = rb_intern("==");

  /* Get the sizes of both arrays. */

  l1 = (int)RARRAY_LEN(rb_o1);
  l2 = (int)RARRAY_LEN(rb_o2);

  /* Convert Ruby's threshold to C's threshold (-1 means "no threshold"). */
  /* NUM2INT raises a TypeError/RangeError instead of misreading a non-Fixnum. */

  if (!NIL_P(rb_threshold)) {
    threshold = NUM2INT(rb_threshold);
  } else {
    threshold = -1;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
  /* The scan is bounded by both lengths: rb_ary_entry returns nil past the end, */
  /* and nil == nil would otherwise keep the loop running forever. */

  offset = 0;
  while ((offset < l1) && (offset < l2) && RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset)))) {
    offset++;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */

  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
    l1--;
    l2--;
  }

  /* From here on l1 and l2 are the lengths of the parts that still differ. */

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate memory for both rows. ALLOC_N raises NoMemoryError by itself, */
  /* so no NULL check is needed; the buffers must be released with xfree. */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = col;
  }

  for (row=1; row<=l2; row++) {
    /* Copy the current row to the previous row. */

    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));

    /* Calculate the values of the current row. */

    curr_row[0] = row;
    curr_row_min = row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_ary_entry(rb_o1, offset+col-1), id_eql, 1, rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as the row minimum reaches the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -0,0 +1,125 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Levenshtein distance between two Ruby arrays whose elements are Strings.
 * Elements are compared with rb_str_cmp (equal when it returns 0).
 *
 * self          - receiver (unused).
 * rb_o1, rb_o2  - the arrays to compare. RARRAY_LEN and rb_str_cmp are applied
 *                 unchecked, so the caller must pass Arrays that contain only
 *                 Strings.
 * rb_threshold  - nil for "no limit", otherwise an Integer. As soon as every
 *                 value of a row is >= threshold, nil is returned.
 *
 * Returns the distance as a Fixnum, or nil when the threshold is reached.
 */
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  int l1, l2;
  int *prev_row, *curr_row;
  int col, row;
  int curr_row_min, result;
  int offset;

  /* Get the sizes of both arrays. */

  l1 = (int)RARRAY_LEN(rb_o1);
  l2 = (int)RARRAY_LEN(rb_o2);

  /* Convert Ruby's threshold to C's threshold (-1 means "no threshold"). */
  /* NUM2INT raises a TypeError/RangeError instead of misreading a non-Fixnum. */

  if (!NIL_P(rb_threshold)) {
    threshold = NUM2INT(rb_threshold);
  } else {
    threshold = -1;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
  /* The scan is bounded by both lengths: rb_ary_entry returns nil past the end, */
  /* and nil must never be handed to rb_str_cmp. */

  offset = 0;
  while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
    offset++;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */

  while ((l1-1 > offset) && (l2-1 > offset) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0)) {
    l1--;
    l2--;
  }

  /* From here on l1 and l2 are the lengths of the parts that still differ. */

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate memory for both rows. ALLOC_N raises NoMemoryError by itself, */
  /* so no NULL check is needed; the buffers must be released with xfree. */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = col;
  }

  for (row=1; row<=l2; row++) {
    /* Copy the current row to the previous row. */

    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));

    /* Calculate the values of the current row. */

    curr_row[0] = row;
    curr_row_min = row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      curr_row[col] = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as the row minimum reaches the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Prototypes of the specialised implementations that live in the sibling
 * source files of this extension. Without them the calls below are implicit
 * declarations: rejected by current compilers, and assumed to return int
 * (narrower than VALUE on 64-bit platforms) by old ones.
 */
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);

/*
 * Returns 1 when rb_ary is a non-empty array that contains only Strings,
 * 0 otherwise. An empty array yields 0 so that it keeps being routed to the
 * plain array implementation.
 */
static int levenshtein_only_strings(VALUE rb_ary) {
  long i;

  if (RARRAY_LEN(rb_ary) == 0) {
    return 0;
  }

  for (i = 0; i < RARRAY_LEN(rb_ary); i++) {
    if (TYPE(rb_ary_entry(rb_ary, i)) != T_STRING) {
      return 0;
    }
  }

  return 1;
}

/*
 * Dispatches to the implementation that matches the types of both sequences:
 *
 *   String / String                     -> levenshtein_distance_string
 *   Array  / Array, all elements String -> levenshtein_distance_array_of_strings
 *   Array  / Array, anything else       -> levenshtein_distance_array
 *   everything else                     -> levenshtein_distance_generic
 *
 * Every element is inspected (not just the first one) because the
 * array-of-strings implementation compares elements with rb_str_cmp, which
 * must only ever see Strings.
 *
 * Returns whatever the selected implementation returns.
 */
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2) == T_STRING)) {
    return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
  }

  if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2) == T_ARRAY)) {
    if (levenshtein_only_strings(rb_o1) && levenshtein_only_strings(rb_o2)) {
      return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
    }

    return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
  }

  return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
}
|
16
|
+
|
17
|
+
void Init_levenshtein_fast() {
|
18
|
+
VALUE mLevenshtein = rb_define_module("Levenshtein");
|
19
|
+
|
20
|
+
rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
|
21
|
+
}
|
@@ -0,0 +1,129 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Levenshtein distance between two arbitrary Ruby sequences. The sequences
 * are only accessed through "length" and "[]", and their elements are
 * compared by sending "==" to the element of rb_o1.
 *
 * self          - receiver (unused).
 * rb_o1, rb_o2  - objects responding to "length" and "[]".
 * rb_threshold  - nil for "no limit", otherwise an Integer. As soon as every
 *                 value of a row is >= threshold, nil is returned.
 *
 * Returns the distance as a Fixnum, or nil when the threshold is reached.
 *
 * NOTE: if any of the Ruby calls made in the main loop raises, the two row
 * buffers are not released.
 */
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  int l1, l2;
  int *prev_row, *curr_row;
  int col, row;
  int curr_row_min, result;
  int offset;

  ID id_length = rb_intern("length");
  ID id_get = rb_intern("[]");
  ID id_equal = rb_intern("==");

  /* Get the sizes of both sequences. NUM2INT type-checks the value returned */
  /* by the user-supplied "length" method instead of blindly decoding it. */

  l1 = NUM2INT(rb_funcall(rb_o1, id_length, 0));
  l2 = NUM2INT(rb_funcall(rb_o2, id_length, 0));

  /* Convert Ruby's threshold to C's threshold (-1 means "no threshold"). */

  if (!NIL_P(rb_threshold)) {
    threshold = NUM2INT(rb_threshold);
  } else {
    threshold = -1;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
  /* The scan is bounded by both lengths so that "[]" is never asked for an */
  /* index past the end (where two nils would compare equal indefinitely). */

  offset = 0;
  while ((offset < l1) && (offset < l2) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
    offset++;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */

  while ((l1-1 > offset) && (l2-1 > offset) && RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
    l1--;
    l2--;
  }

  /* From here on l1 and l2 are the lengths of the parts that still differ. */

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate memory for both rows. ALLOC_N raises NoMemoryError by itself, */
  /* so no NULL check is needed; the buffers must be released with xfree. */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = col;
  }

  for (row=1; row<=l2; row++) {
    /* Copy the current row to the previous row. */

    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));

    /* Calculate the values of the current row. */

    curr_row[0] = row;
    curr_row_min = row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as the row minimum reaches the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -0,0 +1,133 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Levenshtein distance between two Ruby strings, compared byte by byte.
 *
 * self          - receiver (unused).
 * rb_o1, rb_o2  - the strings to compare (converted with StringValue).
 * rb_threshold  - nil for "no limit", otherwise an Integer. As soon as every
 *                 value of a row is >= threshold, nil is returned.
 *
 * Returns the distance (in bytes) as a Fixnum, or nil when the threshold is
 * reached.
 */
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  int l1, l2;
  int *prev_row, *curr_row;
  int col, row;
  int curr_row_min, result;
  int offset;
  char *s1, *s2;

  /* Make sure both arguments are strings before taking any pointer. */

  rb_o1 = StringValue(rb_o1);
  rb_o2 = StringValue(rb_o2);

  /* Convert Ruby's s1 to C's s1. */

  s1 = RSTRING_PTR(rb_o1);
  l1 = (int)RSTRING_LEN(rb_o1);

  /* Convert Ruby's s2 to C's s2. */

  s2 = RSTRING_PTR(rb_o2);
  l2 = (int)RSTRING_LEN(rb_o2);

  /* Convert Ruby's threshold to C's threshold (-1 means "no threshold"). */
  /* NUM2INT raises a TypeError/RangeError instead of misreading a non-Fixnum. */

  if (!NIL_P(rb_threshold)) {
    threshold = NUM2INT(rb_threshold);
  } else {
    threshold = -1;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
  /* The scan is bounded by both lengths: Ruby strings may contain NUL bytes, */
  /* so the terminator cannot be relied upon to stop it inside the buffers. */

  offset = 0;
  while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
    offset++;
  }

  /* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */

  while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
    l1--;
    l2--;
  }

  /* From here on l1 and l2 are the lengths of the parts that still differ. */

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself. */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /* Allocate memory for both rows. ALLOC_N raises NoMemoryError by itself, */
  /* so no NULL check is needed; the buffers must be released with xfree. */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = col;
  }

  for (row=1; row<=l2; row++) {
    /* Copy the current row to the previous row. */

    memcpy(prev_row, curr_row, sizeof(int)*(l1+1));

    /* Calculate the values of the current row. */

    curr_row[0] = row;
    curr_row_min = row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as the row minimum reaches the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "levenshtein/version"
|
4
|
+
|
5
|
+
# Gem specification for levenshtein-19. File lists are taken from the files
# tracked by git at build time.
Gem::Specification.new do |spec|
  # Identity.
  spec.name              = "levenshtein-19"
  spec.version           = Levenshtein::VERSION
  spec.rubyforge_project = "levenshtein-19"

  # People and places.
  spec.authors  = ["Erik Veenstra", "Ryan Fitzgerald"]
  spec.email    = ["rwfitzge@gmail.com"]
  spec.homepage = "http://github.com/rwfitzge/levenshtein-19"

  # Summary and description share the same wording.
  spec.summary     = "Calculates the Levenshtein distance between two byte strings."
  spec.description = "Calculates the Levenshtein distance between two byte strings."

  # Contents of the package.
  spec.files       = `git ls-files`.split("\n")
  spec.test_files  = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables = `git ls-files -- bin/*`.split("\n").map do |path|
    File.basename(path)
  end

  # Where the Ruby code lives and how the C extension is configured.
  spec.require_paths = ["lib"]
  spec.extensions    = ["ext/levenshtein/extconf.rb"]
end
|
data/lib/levenshtein.rb
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
require "levenshtein/version"
|
2
|
+
|
3
|
+
# Try the locations of the compiled extension in order and warn when none of
# them can be loaded.
fast_extension_loaded = [
  "levenshtein/levenshtein_fast", # If compiled by RubyGems.
  "levenshtein_fast"              # If compiled by the build script.
].any? do |candidate|
  begin
    require candidate
    true
  rescue LoadError
    false
  end
end

unless fast_extension_loaded
  $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
end
|
12
|
+
|
13
|
+
# The Levenshtein distance is a metric for measuring the amount
|
14
|
+
# of difference between two sequences (i.e., the so-called edit
|
15
|
+
# distance). The Levenshtein distance between two sequences is
|
16
|
+
# given by the minimum number of operations needed to transform
|
17
|
+
# one sequence into the other, where an operation is an
|
18
|
+
# insertion, deletion, or substitution of a single element.
|
19
|
+
#
|
20
|
+
# More information about the Levenshtein distance algorithm:
|
21
|
+
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
22
|
+
|
23
|
+
module Levenshtein
  class << self
    # Returns the Levenshtein distance scaled to the range 0.0..1.0: the
    # plain distance divided by the length of the longer sequence.
    #
    # When a threshold (a fraction) is given it is converted to an absolute
    # threshold for #distance, and nil is returned when #distance gives up.

    def normalized_distance(s1, s2, threshold=nil)
      shorter, longer = s1.length > s2.length ? [s2, s1] : [s1, s2]
      longest = longer.length

      # The longer sequence is empty, so the shorter one is empty as well.
      return 0.0 if longest == 0

      return distance(shorter, longer).to_f/longest unless threshold

      raw = distance(shorter, longer, (threshold*longest+1).to_i)
      raw ? raw.to_f/longest : nil
    end

    # Returns the Levenshtein distance between two sequences.
    #
    # The two sequences can be two strings, two arrays, or two other
    # objects. Strings, arrays and arrays of strings are handled with
    # optimized (very fast) C code. All other sequences are handled
    # with generic (fast) C code.
    #
    # The sequences should respond to :length and :[] and all objects
    # in the sequences (as returned by []) should respond to :==.
    #
    # With a threshold, nil is returned as soon as the distance is known
    # to be at least that large. Note that the two shortcuts below (equal
    # sequences, empty shorter sequence) are taken before the threshold is
    # looked at.

    def distance(s1, s2, threshold=nil)
      # s1 is the short one; s2 is the long one.
      s1, s2 = s2, s1 if s1.length > s2.length

      # Handle some basic circumstances.
      return 0 if s1 == s2
      return s2.length if s1.length.zero?

      if threshold
        # The difference in length is a lower bound for the distance.
        return nil if s2.length - s1.length >= threshold

        # Elements that occur in only one of the sequences give another
        # lower bound. Sequences that can be scanned are split first;
        # otherwise they are used as they are if they support subtraction.
        elements1, elements2 =
          if s1.respond_to?(:scan) && s2.respond_to?(:scan)
            [s1.scan(/./), s2.scan(/./)]
          elsif s1.respond_to?(:-) && s2.respond_to?(:-)
            [s1, s2]
          end

        if elements1 && elements2
          return nil if (elements1 - elements2).length >= threshold
          return nil if (elements2 - elements1).length >= threshold
        end
      end

      distance_fast_or_slow(s1, s2, threshold)
    end

    def distance_fast_or_slow(s1, s2, threshold) # :nodoc:
      # Prefer the implementation in C; fall back to the one in Ruby.
      implementation =
        if respond_to?(:levenshtein_distance_fast)
          :levenshtein_distance_fast
        else
          :levenshtein_distance_slow
        end

      send(implementation, s1, s2, threshold)
    end

    def levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
      current = (0..s1.length).to_a

      (1..s2.length).each do |y|
        previous = current
        current = [y]

        (1..s1.length).each do |x|
          insertion = previous[x] + 1
          deletion = current[x-1] + 1
          substitution = previous[x-1] + (s1[x-1] == s2[y-1] ? 0 : 1)

          current[x] = [insertion, deletion, substitution].min
        end

        # Stop as soon as the smallest value on this row reaches the
        # threshold. (The minimum value in the next row will be equal to
        # or greater than the minimum value in this row.)
        return nil if threshold && current.min >= threshold
      end

      current.last
    end
  end
end
|
data/test/test.rb
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
require "test/unit"
|
2
|
+
require "levenshtein"
|
3
|
+
|
4
|
+
module Levenshtein
  # Sequence used by the tests: it exposes nothing but #length and #[],
  # both forwarded to the wrapped object.
  class TestSequence
    def initialize(o)
      @items = o
    end

    def length; @items.length; end

    def [](pos); @items[pos]; end
  end

  # Element used by the tests: two elements are equal when the objects they
  # wrap are equal.
  class TestElement
    attr_reader :object

    def initialize(o)
      @object = o
    end

    def ==(other)
      object == other.object
    end
  end
end
|
31
|
+
|
32
|
+
# Exercises Levenshtein.distance and Levenshtein.normalized_distance with
# plain strings.
class TestLevenshteinString < Test::Unit::TestCase
  # The distance is symmetric.
  def test_erik_veenstra
    assert_equal(7, Levenshtein.distance("erik", "veenstra"))
    assert_equal(7, Levenshtein.distance("veenstra", "erik"))

    assert_in_delta(0.875, Levenshtein.normalized_distance("erik", "veenstra"), 0.01)
    assert_in_delta(0.875, Levenshtein.normalized_distance("veenstra", "erik"), 0.01)
  end

  def test_empty_string
    assert_equal(0, Levenshtein.distance("", ""))
    assert_equal(3, Levenshtein.distance("", "foo"))
    assert_equal(3, Levenshtein.distance("foo", ""))

    assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01)
    assert_in_delta(1.0, Levenshtein.normalized_distance("", "foo"), 0.01)
    assert_in_delta(1.0, Levenshtein.normalized_distance("foo", ""), 0.01)
  end

  def test_same_string
    assert_equal(0, Levenshtein.distance("", ""))
    assert_equal(0, Levenshtein.distance("foo", "foo"))

    assert_in_delta(0.0, Levenshtein.normalized_distance("", ""), 0.01)
    assert_in_delta(0.0, Levenshtein.normalized_distance("foo", "foo"), 0.01)
  end

  # nil is expected once the distance reaches the threshold.
  def test_threshold
    assert_equal(3, Levenshtein.distance("foo", "foobar"))
    assert_equal(3, Levenshtein.distance("foo", "foobar", 4))
    assert_nil(Levenshtein.distance("foo", "foobar", 2))

    assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar"), 0.01)
    assert_in_delta(0.5, Levenshtein.normalized_distance("foo", "foobar", 0.66), 0.01)
    assert_nil(Levenshtein.normalized_distance("foo", "foobar", 0.30))
  end

  # Common prefixes and suffixes must not change the result, including when
  # one string is both the head and the tail of the other.
  def test_same_head_and_or_tail
    assert_equal(3, Levenshtein.distance("ab123cd", "abxyzcd"))
    assert_equal(3, Levenshtein.distance("ab123", "abxyz"))
    assert_equal(3, Levenshtein.distance("123cd", "xyzcd"))
    assert_equal(5, Levenshtein.distance("123cd123", "123"))

    assert_in_delta(0.42, Levenshtein.normalized_distance("ab123cd", "abxyzcd"), 0.01)
    assert_in_delta(0.6, Levenshtein.normalized_distance("ab123", "abxyz"), 0.01)
    assert_in_delta(0.6, Levenshtein.normalized_distance("123cd", "xyzcd"), 0.01)
    assert_in_delta(0.625, Levenshtein.normalized_distance("123cd123", "123"), 0.01)
  end
end
|
81
|
+
|
82
|
+
# Exercises Levenshtein.distance with arrays of TestElement objects.
class TestLevenshteinArray < Test::Unit::TestCase
  def test_erik_veenstra
    # Turns a word into an array with one TestElement per character.
    to_elements = lambda do |word|
      word.scan(/./).map { |char| Levenshtein::TestElement.new(char) }
    end

    assert_equal(7, Levenshtein.distance(to_elements.call("erik"), to_elements.call("veenstra")))
  end
end
|
89
|
+
|
90
|
+
# Exercises Levenshtein.distance with arrays of one-character strings.
class TestLevenshteinArrayOfStrings < Test::Unit::TestCase
  def test_erik_veenstra
    # Turns a word into an array with one string per character.
    to_chars = lambda { |word| word.scan(/./) }

    assert_equal(7, Levenshtein.distance(to_chars.call("erik"), to_chars.call("veenstra")))
  end
end
|
97
|
+
|
98
|
+
# Exercises Levenshtein.distance with TestSequence objects, which offer
# nothing but #length and #[].
class TestLevenshteinGeneric < Test::Unit::TestCase
  def test_erik_veenstra
    # Turns a word into a TestSequence with one TestElement per character.
    to_sequence = lambda do |word|
      elements = word.scan(/./).map { |char| Levenshtein::TestElement.new(char) }
      Levenshtein::TestSequence.new(elements)
    end

    assert_equal(7, Levenshtein.distance(to_sequence.call("erik"), to_sequence.call("veenstra")))
  end
end
|
105
|
+
|
106
|
+
# Exercises the pure-Ruby implementation directly, bypassing the dispatch
# done by Levenshtein.distance.
class TestLevenshteinSlow < Test::Unit::TestCase
  def test_erik_veenstra
    assert_equal(7, Levenshtein.levenshtein_distance_slow("erik", "veenstra", nil))
  end

  def test_empty_sequence
    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
    assert_equal(3, Levenshtein.levenshtein_distance_slow("", "foo", nil))
  end

  def test_same_sequence
    assert_equal(0, Levenshtein.levenshtein_distance_slow("", "", nil))
    assert_equal(0, Levenshtein.levenshtein_distance_slow("foo", "foo", nil))
  end

  # nil is expected once the distance reaches the threshold.
  def test_threshold
    assert_equal(3, Levenshtein.levenshtein_distance_slow("foo", "foobar", nil))
    assert_nil(Levenshtein.levenshtein_distance_slow("foo", "foobar", 2))
  end
end
|
metadata
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: levenshtein-19
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Erik Veenstra
|
9
|
+
- Ryan Fitzgerald
|
10
|
+
autorequire:
|
11
|
+
bindir: bin
|
12
|
+
cert_chain: []
|
13
|
+
date: 2011-11-21 00:00:00.000000000Z
|
14
|
+
dependencies: []
|
15
|
+
description: Calculates the Levenshtein distance between two byte strings.
|
16
|
+
email:
|
17
|
+
- rwfitzge@gmail.com
|
18
|
+
executables: []
|
19
|
+
extensions:
|
20
|
+
- ext/levenshtein/extconf.rb
|
21
|
+
extra_rdoc_files: []
|
22
|
+
files:
|
23
|
+
- .gitignore
|
24
|
+
- CHANGELOG
|
25
|
+
- Gemfile
|
26
|
+
- LICENSE
|
27
|
+
- README
|
28
|
+
- Rakefile
|
29
|
+
- ext/levenshtein/extconf.rb
|
30
|
+
- ext/levenshtein/levenshtein_array.c
|
31
|
+
- ext/levenshtein/levenshtein_array_of_strings.c
|
32
|
+
- ext/levenshtein/levenshtein_fast.c
|
33
|
+
- ext/levenshtein/levenshtein_generic.c
|
34
|
+
- ext/levenshtein/levenshtein_string.c
|
35
|
+
- levenshtein-19.gemspec
|
36
|
+
- lib/levenshtein.rb
|
37
|
+
- lib/levenshtein/version.rb
|
38
|
+
- test/test.rb
|
39
|
+
homepage: http://github.com/rwfitzge/levenshtein-19
|
40
|
+
licenses: []
|
41
|
+
post_install_message:
|
42
|
+
rdoc_options: []
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
none: false
|
47
|
+
requirements:
|
48
|
+
- - ! '>='
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: '0'
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ! '>='
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0'
|
57
|
+
requirements: []
|
58
|
+
rubyforge_project: levenshtein-19
|
59
|
+
rubygems_version: 1.8.11
|
60
|
+
signing_key:
|
61
|
+
specification_version: 3
|
62
|
+
summary: Calculates the Levenshtein distance between two byte strings.
|
63
|
+
test_files:
|
64
|
+
- test/test.rb
|