levenshtein 0.2.0 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +8 -0
- data/VERSION +1 -1
- data/ext/levenshtein/levenshtein.h +13 -0
- data/ext/levenshtein/levenshtein_array.c +24 -21
- data/ext/levenshtein/levenshtein_array_of_strings.c +24 -19
- data/ext/levenshtein/levenshtein_fast.c +3 -2
- data/ext/levenshtein/levenshtein_generic.c +24 -20
- data/ext/levenshtein/levenshtein_string.c +26 -21
- data/lib/levenshtein.rb +61 -53
- data/lib/levenshtein/exception.rb +4 -0
- data/lib/levenshtein/version.rb +3 -0
- data/test/test.rb +21 -9
- metadata +25 -11
data/CHANGELOG
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.1
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#ifdef RARRAY_PTR
|
2
|
+
#else
|
3
|
+
#define RARRAY_PTR(o) (RARRAY(o)->ptr)
|
4
|
+
#define RARRAY_LEN(o) (RARRAY(o)->len)
|
5
|
+
#endif
|
6
|
+
|
7
|
+
#ifdef RSTRING_PTR
|
8
|
+
#else
|
9
|
+
#define RSTRING_PTR(o) (RSTRING(o)->ptr)
|
10
|
+
#define RSTRING_LEN(o) (RSTRING(o)->len)
|
11
|
+
#endif
|
12
|
+
|
13
|
+
VALUE mLevenshtein;
|
@@ -1,19 +1,19 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
int threshold;
|
5
6
|
int l1, l2;
|
6
|
-
int *prev_row, *curr_row;
|
7
|
+
int *prev_row, *curr_row, *temp_row;
|
7
8
|
int col, row;
|
8
9
|
int curr_row_min, result;
|
9
10
|
int offset;
|
10
|
-
|
11
|
-
ID id_eql = rb_intern("==");
|
11
|
+
int value1, value2;
|
12
12
|
|
13
13
|
/* Get the sizes of both arrays. */
|
14
14
|
|
15
|
-
l1 =
|
16
|
-
l2 =
|
15
|
+
l1 = RARRAY_LEN(rb_o1);
|
16
|
+
l2 = RARRAY_LEN(rb_o2);
|
17
17
|
|
18
18
|
/* Convert Ruby's threshold to C's threshold. */
|
19
19
|
|
@@ -26,13 +26,14 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
|
|
26
26
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
27
27
|
|
28
28
|
offset = 0;
|
29
|
-
|
29
|
+
|
30
|
+
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
|
30
31
|
offset++;
|
31
32
|
}
|
32
33
|
|
33
34
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
34
35
|
|
35
|
-
while ((
|
36
|
+
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
|
36
37
|
l1--;
|
37
38
|
l2--;
|
38
39
|
}
|
@@ -57,12 +58,8 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
|
|
57
58
|
|
58
59
|
/* Allocate memory for both rows */
|
59
60
|
|
60
|
-
prev_row = ALLOC_N(int, l1+1);
|
61
|
-
curr_row = ALLOC_N(int, l1+1);
|
62
|
-
|
63
|
-
if ((prev_row == NULL) || (curr_row == NULL)) {
|
64
|
-
rb_raise(rb_eNoMemError, "out of memory");
|
65
|
-
}
|
61
|
+
prev_row = (int*) ALLOC_N(int, (l1+1));
|
62
|
+
curr_row = (int*) ALLOC_N(int, (l1+1));
|
66
63
|
|
67
64
|
/* Initialize the current row. */
|
68
65
|
|
@@ -73,7 +70,9 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
|
|
73
70
|
for (row=1; row<=l2; row++) {
|
74
71
|
/* Make the current row the previous row by swapping the two row buffers. */
|
75
72
|
|
76
|
-
|
73
|
+
temp_row = prev_row;
|
74
|
+
prev_row = curr_row;
|
75
|
+
curr_row = temp_row;
|
77
76
|
|
78
77
|
/* Calculate the values of the current row. */
|
79
78
|
|
@@ -83,25 +82,29 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
|
|
83
82
|
for (col=1; col<=l1; col++) {
|
84
83
|
/* Equal (cost=0) or substitution (cost=1). */
|
85
84
|
|
86
|
-
|
85
|
+
value1 = prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
|
87
86
|
|
88
87
|
/* Insertion if it's cheaper than substitution. */
|
89
88
|
|
90
|
-
|
91
|
-
|
89
|
+
value2 = prev_row[col]+1;
|
90
|
+
if (value2 < value1) {
|
91
|
+
value1 = value2;
|
92
92
|
}
|
93
93
|
|
94
94
|
/* Deletion if it's cheaper than substitution. */
|
95
95
|
|
96
|
-
|
97
|
-
|
96
|
+
value2 = curr_row[col-1]+1;
|
97
|
+
if (value2 < value1) {
|
98
|
+
value1 = value2;
|
98
99
|
}
|
99
100
|
|
100
101
|
/* Keep track of the minimum value on this row. */
|
101
102
|
|
102
|
-
if (
|
103
|
-
curr_row_min =
|
103
|
+
if (value1 < curr_row_min) {
|
104
|
+
curr_row_min = value1;
|
104
105
|
}
|
106
|
+
|
107
|
+
curr_row[col] = value1;
|
105
108
|
}
|
106
109
|
|
107
110
|
/* Return nil as soon as we exceed the threshold. */
|
@@ -1,17 +1,19 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
int threshold;
|
5
6
|
int l1, l2;
|
6
|
-
int *prev_row, *curr_row;
|
7
|
+
int *prev_row, *curr_row, *temp_row;
|
7
8
|
int col, row;
|
8
9
|
int curr_row_min, result;
|
9
10
|
int offset;
|
11
|
+
int value1, value2;
|
10
12
|
|
11
13
|
/* Get the sizes of both arrays. */
|
12
14
|
|
13
|
-
l1 =
|
14
|
-
l2 =
|
15
|
+
l1 = RARRAY_LEN(rb_o1);
|
16
|
+
l2 = RARRAY_LEN(rb_o2);
|
15
17
|
|
16
18
|
/* Convert Ruby's threshold to C's threshold. */
|
17
19
|
|
@@ -24,13 +26,14 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
|
|
24
26
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
25
27
|
|
26
28
|
offset = 0;
|
27
|
-
|
29
|
+
|
30
|
+
while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
|
28
31
|
offset++;
|
29
32
|
}
|
30
33
|
|
31
34
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
32
35
|
|
33
|
-
while ((
|
36
|
+
while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
|
34
37
|
l1--;
|
35
38
|
l2--;
|
36
39
|
}
|
@@ -55,12 +58,8 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
|
|
55
58
|
|
56
59
|
/* Allocate memory for both rows */
|
57
60
|
|
58
|
-
prev_row = ALLOC_N(int, l1+1);
|
59
|
-
curr_row = ALLOC_N(int, l1+1);
|
60
|
-
|
61
|
-
if ((prev_row == NULL) || (curr_row == NULL)) {
|
62
|
-
rb_raise(rb_eNoMemError, "out of memory");
|
63
|
-
}
|
61
|
+
prev_row = (int*) ALLOC_N(int, (l1+1));
|
62
|
+
curr_row = (int*) ALLOC_N(int, (l1+1));
|
64
63
|
|
65
64
|
/* Initialize the current row. */
|
66
65
|
|
@@ -71,7 +70,9 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
|
|
71
70
|
for (row=1; row<=l2; row++) {
|
72
71
|
/* Make the current row the previous row by swapping the two row buffers. */
|
73
72
|
|
74
|
-
|
73
|
+
temp_row = prev_row;
|
74
|
+
prev_row = curr_row;
|
75
|
+
curr_row = temp_row;
|
75
76
|
|
76
77
|
/* Calculate the values of the current row. */
|
77
78
|
|
@@ -81,25 +82,29 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
|
|
81
82
|
for (col=1; col<=l1; col++) {
|
82
83
|
/* Equal (cost=0) or substitution (cost=1). */
|
83
84
|
|
84
|
-
|
85
|
+
value1 = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
|
85
86
|
|
86
87
|
/* Insertion if it's cheaper than substitution. */
|
87
88
|
|
88
|
-
|
89
|
-
|
89
|
+
value2 = prev_row[col]+1;
|
90
|
+
if (value2 < value1) {
|
91
|
+
value1 = value2;
|
90
92
|
}
|
91
93
|
|
92
94
|
/* Deletion if it's cheaper than substitution. */
|
93
95
|
|
94
|
-
|
95
|
-
|
96
|
+
value2 = curr_row[col-1]+1;
|
97
|
+
if (value2 < value1) {
|
98
|
+
value1 = value2;
|
96
99
|
}
|
97
100
|
|
98
101
|
/* Keep track of the minimum value on this row. */
|
99
102
|
|
100
|
-
if (
|
101
|
-
curr_row_min =
|
103
|
+
if (value1 < curr_row_min) {
|
104
|
+
curr_row_min = value1;
|
102
105
|
}
|
106
|
+
|
107
|
+
curr_row[col] = value1;
|
103
108
|
}
|
104
109
|
|
105
110
|
/* Return nil as soon as we exceed the threshold. */
|
@@ -1,4 +1,5 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
|
@@ -15,7 +16,7 @@ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_t
|
|
15
16
|
}
|
16
17
|
|
17
18
|
void Init_levenshtein_fast() {
|
18
|
-
|
19
|
+
mLevenshtein = rb_const_get(rb_mKernel, rb_intern("Levenshtein"));
|
19
20
|
|
20
|
-
rb_define_singleton_method(mLevenshtein, "
|
21
|
+
rb_define_singleton_method(mLevenshtein, "distance_fast" , levenshtein_distance_fast, 3);
|
21
22
|
}
|
@@ -1,16 +1,17 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
int threshold;
|
5
6
|
int l1, l2;
|
6
|
-
int *prev_row, *curr_row;
|
7
|
+
int *prev_row, *curr_row, *temp_row;
|
7
8
|
int col, row;
|
8
9
|
int curr_row_min, result;
|
9
10
|
int offset;
|
11
|
+
int value1, value2;
|
10
12
|
|
11
|
-
ID
|
12
|
-
ID
|
13
|
-
ID id_equal = rb_intern("==");
|
13
|
+
ID id_length = rb_intern("length");
|
14
|
+
ID id_get = rb_intern("[]");
|
14
15
|
|
15
16
|
/* Get the sizes of both sequences. */
|
16
17
|
|
@@ -28,13 +29,14 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
|
|
28
29
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
29
30
|
|
30
31
|
offset = 0;
|
31
|
-
|
32
|
+
|
33
|
+
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
|
32
34
|
offset++;
|
33
35
|
}
|
34
36
|
|
35
37
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
36
38
|
|
37
|
-
while ((
|
39
|
+
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
|
38
40
|
l1--;
|
39
41
|
l2--;
|
40
42
|
}
|
@@ -59,12 +61,8 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
|
|
59
61
|
|
60
62
|
/* Allocate memory for both rows */
|
61
63
|
|
62
|
-
prev_row = ALLOC_N(int, l1+1);
|
63
|
-
curr_row = ALLOC_N(int, l1+1);
|
64
|
-
|
65
|
-
if ((prev_row == NULL) || (curr_row == NULL)) {
|
66
|
-
rb_raise(rb_eNoMemError, "out of memory");
|
67
|
-
}
|
64
|
+
prev_row = (int*) ALLOC_N(int, (l1+1));
|
65
|
+
curr_row = (int*) ALLOC_N(int, (l1+1));
|
68
66
|
|
69
67
|
/* Initialize the current row. */
|
70
68
|
|
@@ -75,7 +73,9 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
|
|
75
73
|
for (row=1; row<=l2; row++) {
|
76
74
|
/* Make the current row the previous row by swapping the two row buffers. */
|
77
75
|
|
78
|
-
|
76
|
+
temp_row = prev_row;
|
77
|
+
prev_row = curr_row;
|
78
|
+
curr_row = temp_row;
|
79
79
|
|
80
80
|
/* Calculate the values of the current row. */
|
81
81
|
|
@@ -85,25 +85,29 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
|
|
85
85
|
for (col=1; col<=l1; col++) {
|
86
86
|
/* Equal (cost=0) or substitution (cost=1). */
|
87
87
|
|
88
|
-
|
88
|
+
value1 = prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
|
89
89
|
|
90
90
|
/* Insertion if it's cheaper than substitution. */
|
91
91
|
|
92
|
-
|
93
|
-
|
92
|
+
value2 = prev_row[col]+1;
|
93
|
+
if (value2 < value1) {
|
94
|
+
value1 = value2;
|
94
95
|
}
|
95
96
|
|
96
97
|
/* Deletion if it's cheaper than substitution. */
|
97
98
|
|
98
|
-
|
99
|
-
|
99
|
+
value2 = curr_row[col-1]+1;
|
100
|
+
if (value2 < value1) {
|
101
|
+
value1 = value2;
|
100
102
|
}
|
101
103
|
|
102
104
|
/* Keep track of the minimum value on this row. */
|
103
105
|
|
104
|
-
if (
|
105
|
-
curr_row_min =
|
106
|
+
if (value1 < curr_row_min) {
|
107
|
+
curr_row_min = value1;
|
106
108
|
}
|
109
|
+
|
110
|
+
curr_row[col] = value1;
|
107
111
|
}
|
108
112
|
|
109
113
|
/* Return nil as soon as we exceed the threshold. */
|
@@ -1,25 +1,27 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
int threshold;
|
5
6
|
int l1, l2;
|
6
|
-
int *prev_row, *curr_row;
|
7
|
+
int *prev_row, *curr_row, *temp_row;
|
7
8
|
int col, row;
|
8
9
|
int curr_row_min, result;
|
9
10
|
int offset;
|
11
|
+
int value1, value2;
|
10
12
|
char *s1, *s2;
|
11
13
|
|
12
14
|
/* Convert Ruby's s1 to C's s1. */
|
13
15
|
|
14
16
|
rb_o1 = StringValue(rb_o1);
|
15
|
-
s1 =
|
16
|
-
l1 =
|
17
|
+
s1 = RSTRING_PTR(rb_o1);
|
18
|
+
l1 = RSTRING_LEN(rb_o1);
|
17
19
|
|
18
20
|
/* Convert Ruby's s2 to C's s2. */
|
19
21
|
|
20
22
|
rb_o2 = StringValue(rb_o2);
|
21
|
-
s2 =
|
22
|
-
l2 =
|
23
|
+
s2 = RSTRING_PTR(rb_o2);
|
24
|
+
l2 = RSTRING_LEN(rb_o2);
|
23
25
|
|
24
26
|
/* Convert Ruby's threshold to C's threshold. */
|
25
27
|
|
@@ -32,13 +34,14 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
|
|
32
34
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
33
35
|
|
34
36
|
offset = 0;
|
35
|
-
|
37
|
+
|
38
|
+
while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
|
36
39
|
offset++;
|
37
40
|
}
|
38
41
|
|
39
42
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
40
43
|
|
41
|
-
while ((
|
44
|
+
while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
|
42
45
|
l1--;
|
43
46
|
l2--;
|
44
47
|
}
|
@@ -63,12 +66,8 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
|
|
63
66
|
|
64
67
|
/* Allocate memory for both rows */
|
65
68
|
|
66
|
-
prev_row = ALLOC_N(int, l1+1);
|
67
|
-
curr_row = ALLOC_N(int, l1+1);
|
68
|
-
|
69
|
-
if ((prev_row == NULL) || (curr_row == NULL)) {
|
70
|
-
rb_raise(rb_eNoMemError, "out of memory");
|
71
|
-
}
|
69
|
+
prev_row = (int*) ALLOC_N(int, (l1+1));
|
70
|
+
curr_row = (int*) ALLOC_N(int, (l1+1));
|
72
71
|
|
73
72
|
/* Initialize the current row. */
|
74
73
|
|
@@ -79,7 +78,9 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
|
|
79
78
|
for (row=1; row<=l2; row++) {
|
80
79
|
/* Make the current row the previous row by swapping the two row buffers. */
|
81
80
|
|
82
|
-
|
81
|
+
temp_row = prev_row;
|
82
|
+
prev_row = curr_row;
|
83
|
+
curr_row = temp_row;
|
83
84
|
|
84
85
|
/* Calculate the values of the current row. */
|
85
86
|
|
@@ -89,25 +90,29 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
|
|
89
90
|
for (col=1; col<=l1; col++) {
|
90
91
|
/* Equal (cost=0) or substitution (cost=1). */
|
91
92
|
|
92
|
-
|
93
|
+
value1 = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
|
93
94
|
|
94
95
|
/* Insertion if it's cheaper than substitution. */
|
95
96
|
|
96
|
-
|
97
|
-
|
97
|
+
value2 = prev_row[col]+1;
|
98
|
+
if (value2 < value1) {
|
99
|
+
value1 = value2;
|
98
100
|
}
|
99
101
|
|
100
102
|
/* Deletion if it's cheaper than substitution. */
|
101
103
|
|
102
|
-
|
103
|
-
|
104
|
+
value2 = curr_row[col-1]+1;
|
105
|
+
if (value2 < value1) {
|
106
|
+
value1 = value2;
|
104
107
|
}
|
105
108
|
|
106
109
|
/* Keep track of the minimum value on this row. */
|
107
110
|
|
108
|
-
if (
|
109
|
-
curr_row_min =
|
111
|
+
if (value1 < curr_row_min) {
|
112
|
+
curr_row_min = value1;
|
110
113
|
}
|
114
|
+
|
115
|
+
curr_row[col] = value1;
|
111
116
|
}
|
112
117
|
|
113
118
|
/* Return nil as soon as we exceed the threshold. */
|
data/lib/levenshtein.rb
CHANGED
@@ -1,44 +1,25 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
rescue LoadError
|
4
|
-
begin
|
5
|
-
require "levenshtein_fast" # If compiled by the build script.
|
6
|
-
rescue LoadError
|
7
|
-
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
|
8
|
-
end
|
9
|
-
end
|
10
|
-
|
11
|
-
# The Levenshtein distance is a metric for measuring the amount
|
12
|
-
# of difference between two sequences (i.e., the so called edit
|
13
|
-
# distance). The Levenshtein distance between two sequences is
|
14
|
-
# given by the minimum number of operations needed to transform
|
15
|
-
# one sequence into the other, where an operation is an
|
16
|
-
# insertion, deletion, or substitution of a single element.
|
17
|
-
#
|
18
|
-
# More information about the Levenshtein distance algorithm:
|
19
|
-
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
1
|
+
require "levenshtein/exception"
|
2
|
+
require "levenshtein/version"
|
20
3
|
|
21
4
|
module Levenshtein
|
22
|
-
VERSION = "0.2.0"
|
23
|
-
|
24
5
|
# Returns the Levenshtein distance as a number between 0.0 and
|
25
6
|
# 1.0. It's basically the Levenshtein distance divided by the
|
26
7
|
# length of the longest sequence.
|
27
8
|
|
28
|
-
def self.normalized_distance(
|
29
|
-
|
9
|
+
def self.normalized_distance(a1, a2, threshold=nil)
|
10
|
+
a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
|
30
11
|
|
31
|
-
if
|
32
|
-
0.0 # Since
|
12
|
+
if a2.length == 0
|
13
|
+
0.0 # Since a1.length <= a2.length, a1 must be empty as well.
|
33
14
|
else
|
34
15
|
if threshold
|
35
|
-
if d = self.distance(
|
36
|
-
d.to_f/
|
16
|
+
if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
|
17
|
+
d.to_f/a2.length
|
37
18
|
else
|
38
19
|
nil
|
39
20
|
end
|
40
21
|
else
|
41
|
-
self.distance(
|
22
|
+
self.distance(a1, a2).to_f/a2.length
|
42
23
|
end
|
43
24
|
end
|
44
25
|
end
|
@@ -53,47 +34,64 @@ module Levenshtein
|
|
53
34
|
# The sequences should respond to :length and :[] and all objects
|
54
35
|
# in the sequences (as returned by []) should respond to :==.
|
55
36
|
|
56
|
-
def self.distance(
|
57
|
-
|
37
|
+
def self.distance(a1, a2, threshold=nil)
|
38
|
+
a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
|
58
39
|
|
59
40
|
# Handle some basic circumstances.
|
60
41
|
|
61
|
-
return 0 if
|
62
|
-
return
|
42
|
+
return 0 if a1 == a2
|
43
|
+
return a2.length if a1.length == 0
|
63
44
|
|
64
45
|
if threshold
|
65
|
-
return nil if (
|
46
|
+
return nil if (a2.length-a1.length) >= threshold
|
66
47
|
|
67
|
-
|
68
|
-
|
69
|
-
|
48
|
+
a3, a4 = nil, nil
|
49
|
+
a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-)
|
50
|
+
a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan)
|
70
51
|
|
71
|
-
if
|
72
|
-
return nil if (
|
73
|
-
return nil if (
|
52
|
+
if a3 and a4
|
53
|
+
return nil if (a3-a4).length >= threshold
|
54
|
+
return nil if (a4-a3).length >= threshold
|
74
55
|
end
|
75
56
|
end
|
76
57
|
|
77
|
-
distance_fast_or_slow(
|
58
|
+
distance_fast_or_slow(a1, a2, threshold)
|
78
59
|
end
|
79
60
|
|
80
|
-
def self.distance_fast_or_slow(
|
81
|
-
if respond_to?(:
|
82
|
-
|
61
|
+
def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc:
|
62
|
+
if respond_to?(:distance_fast)
|
63
|
+
distance_fast(a1, a2, threshold) # Implemented in C.
|
83
64
|
else
|
84
|
-
|
65
|
+
distance_slow(a1, a2, threshold) # Implemented in Ruby.
|
85
66
|
end
|
86
67
|
end
|
87
68
|
|
88
|
-
def self.
|
89
|
-
|
69
|
+
def self.distance_slow(a1, a2, threshold) # :nodoc:
|
70
|
+
l1 = a1.length
|
71
|
+
l2 = a2.length
|
90
72
|
|
91
|
-
|
92
|
-
|
93
|
-
|
73
|
+
offset = 0
|
74
|
+
|
75
|
+
while offset < l1 and offset < l2 and a1[offset] == a2[offset]
|
76
|
+
offset += 1
|
77
|
+
end
|
78
|
+
|
79
|
+
while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
|
80
|
+
l1 -= 1
|
81
|
+
l2 -= 1
|
82
|
+
end
|
83
|
+
|
84
|
+
l1 -= offset
|
85
|
+
l2 -= offset
|
94
86
|
|
95
|
-
|
96
|
-
|
87
|
+
crow = (0..l1).to_a
|
88
|
+
|
89
|
+
1.upto(l2) do |y|
|
90
|
+
prow = crow
|
91
|
+
crow = [y]
|
92
|
+
|
93
|
+
1.upto(l1) do |x|
|
94
|
+
crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
|
97
95
|
end
|
98
96
|
|
99
97
|
# Stop analysing this sequence as soon as the best possible
|
@@ -101,9 +99,19 @@ module Levenshtein
|
|
101
99
|
# (The minimum value in the next row will be equal to or greater
|
102
100
|
# than the minimum value in this row.)
|
103
101
|
|
104
|
-
return nil if threshold and
|
102
|
+
return nil if threshold and crow.min >= threshold
|
105
103
|
end
|
106
104
|
|
107
|
-
|
105
|
+
crow[-1]
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
begin
|
110
|
+
require "levenshtein/levenshtein_fast" # Compiled by RubyGems.
|
111
|
+
rescue LoadError
|
112
|
+
begin
|
113
|
+
require "levenshtein_fast" # Compiled by the build script.
|
114
|
+
rescue LoadError
|
115
|
+
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
|
108
116
|
end
|
109
117
|
end
|
data/test/test.rb
CHANGED
@@ -12,6 +12,8 @@ module Levenshtein
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def [](pos)
|
15
|
+
raise "type not allowed [#{pos.inspect}]" unless pos.kind_of?(Fixnum)
|
16
|
+
|
15
17
|
@sequence[pos]
|
16
18
|
end
|
17
19
|
end
|
@@ -105,21 +107,31 @@ end
|
|
105
107
|
|
106
108
|
class TestLevenshteinSlow < Test::Unit::TestCase
|
107
109
|
def test_erik_veenstra
|
108
|
-
assert_equal(7, Levenshtein.
|
110
|
+
assert_equal(7, Levenshtein.distance_slow("erik", "veenstra", nil))
|
111
|
+
assert_equal(7, Levenshtein.distance_slow("veenstra", "erik", nil))
|
109
112
|
end
|
110
113
|
|
111
|
-
def
|
112
|
-
assert_equal(0, Levenshtein.
|
113
|
-
assert_equal(3, Levenshtein.
|
114
|
+
def test_empty_string
|
115
|
+
assert_equal(0, Levenshtein.distance_slow("", "", nil))
|
116
|
+
assert_equal(3, Levenshtein.distance_slow("", "foo", nil))
|
117
|
+
assert_equal(3, Levenshtein.distance_slow("foo", "", nil))
|
114
118
|
end
|
115
119
|
|
116
|
-
def
|
117
|
-
assert_equal(0, Levenshtein.
|
118
|
-
assert_equal(0, Levenshtein.
|
120
|
+
def test_same_string
|
121
|
+
assert_equal(0, Levenshtein.distance_slow("", "", nil))
|
122
|
+
assert_equal(0, Levenshtein.distance_slow("foo", "foo", nil))
|
119
123
|
end
|
120
124
|
|
121
125
|
def test_threshold
|
122
|
-
assert_equal(3, Levenshtein.
|
123
|
-
assert_equal(
|
126
|
+
assert_equal(3, Levenshtein.distance_slow("foo", "foobar", nil))
|
127
|
+
assert_equal(3, Levenshtein.distance_slow("foo", "foobar", 4))
|
128
|
+
assert_equal(nil, Levenshtein.distance_slow("foo", "foobar", 2))
|
129
|
+
end
|
130
|
+
|
131
|
+
def test_same_head_and_or_tail
|
132
|
+
assert_equal(3, Levenshtein.distance_slow("ab123cd", "abxyzcd", nil))
|
133
|
+
assert_equal(3, Levenshtein.distance_slow("ab123", "abxyz", nil))
|
134
|
+
assert_equal(3, Levenshtein.distance_slow("123cd", "xyzcd", nil))
|
135
|
+
assert_equal(5, Levenshtein.distance_slow("123cd123", "123", nil))
|
124
136
|
end
|
125
137
|
end
|
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: levenshtein
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 21
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 1
|
10
|
+
version: 0.2.1
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Erik Veenstra
|
@@ -9,8 +15,7 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
13
|
-
default_executable:
|
18
|
+
date: 2012-02-11 00:00:00 Z
|
14
19
|
dependencies: []
|
15
20
|
|
16
21
|
description: Calculates the Levenshtein distance between two byte strings.
|
@@ -22,18 +27,21 @@ extensions:
|
|
22
27
|
extra_rdoc_files: []
|
23
28
|
|
24
29
|
files:
|
30
|
+
- lib/levenshtein/exception.rb
|
31
|
+
- lib/levenshtein/version.rb
|
25
32
|
- lib/levenshtein.rb
|
26
|
-
- ext/levenshtein/extconf.rb
|
27
|
-
- ext/levenshtein/levenshtein_array_of_strings.c
|
28
|
-
- ext/levenshtein/levenshtein_fast.c
|
29
33
|
- ext/levenshtein/levenshtein_string.c
|
30
34
|
- ext/levenshtein/levenshtein_generic.c
|
35
|
+
- ext/levenshtein/levenshtein.h
|
36
|
+
- ext/levenshtein/levenshtein_fast.c
|
37
|
+
- ext/levenshtein/levenshtein_array_of_strings.c
|
31
38
|
- ext/levenshtein/levenshtein_array.c
|
39
|
+
- ext/levenshtein/extconf.rb
|
32
40
|
- README
|
33
41
|
- LICENSE
|
34
42
|
- VERSION
|
35
43
|
- CHANGELOG
|
36
|
-
|
44
|
+
- test/test.rb
|
37
45
|
homepage: http://www.erikveen.dds.nl/levenshtein/index.html
|
38
46
|
licenses: []
|
39
47
|
|
@@ -44,27 +52,33 @@ rdoc_options:
|
|
44
52
|
- VERSION
|
45
53
|
- CHANGELOG
|
46
54
|
- --title
|
47
|
-
- levenshtein (0.2.
|
55
|
+
- levenshtein (0.2.1)
|
48
56
|
- --main
|
49
57
|
- README
|
50
58
|
require_paths:
|
51
59
|
- lib
|
52
60
|
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
53
62
|
requirements:
|
54
63
|
- - ">="
|
55
64
|
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
56
68
|
version: "0"
|
57
|
-
version:
|
58
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
59
71
|
requirements:
|
60
72
|
- - ">="
|
61
73
|
- !ruby/object:Gem::Version
|
74
|
+
hash: 3
|
75
|
+
segments:
|
76
|
+
- 0
|
62
77
|
version: "0"
|
63
|
-
version:
|
64
78
|
requirements: []
|
65
79
|
|
66
80
|
rubyforge_project: levenshtein
|
67
|
-
rubygems_version: 1.
|
81
|
+
rubygems_version: 1.8.12
|
68
82
|
signing_key:
|
69
83
|
specification_version: 3
|
70
84
|
summary: Calculates the Levenshtein distance between two byte strings.
|