levenshtein 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +8 -0
- data/VERSION +1 -1
- data/ext/levenshtein/levenshtein.h +13 -0
- data/ext/levenshtein/levenshtein_array.c +24 -21
- data/ext/levenshtein/levenshtein_array_of_strings.c +24 -19
- data/ext/levenshtein/levenshtein_fast.c +3 -2
- data/ext/levenshtein/levenshtein_generic.c +24 -20
- data/ext/levenshtein/levenshtein_string.c +26 -21
- data/lib/levenshtein.rb +61 -53
- data/lib/levenshtein/exception.rb +4 -0
- data/lib/levenshtein/version.rb +3 -0
- data/test/test.rb +21 -9
- metadata +25 -11
data/CHANGELOG
CHANGED
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.1
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#ifdef RARRAY_PTR
|
2
|
+
#else
|
3
|
+
#define RARRAY_PTR(o) (RARRAY(o)->ptr)
|
4
|
+
#define RARRAY_LEN(o) (RARRAY(o)->len)
|
5
|
+
#endif
|
6
|
+
|
7
|
+
#ifdef RSTRING_PTR
|
8
|
+
#else
|
9
|
+
#define RSTRING_PTR(o) (RSTRING(o)->ptr)
|
10
|
+
#define RSTRING_LEN(o) (RSTRING(o)->len)
|
11
|
+
#endif
|
12
|
+
|
13
|
+
VALUE mLevenshtein;
|
@@ -1,19 +1,19 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
int threshold;
|
5
6
|
int l1, l2;
|
6
|
-
int *prev_row, *curr_row;
|
7
|
+
int *prev_row, *curr_row, *temp_row;
|
7
8
|
int col, row;
|
8
9
|
int curr_row_min, result;
|
9
10
|
int offset;
|
10
|
-
|
11
|
-
ID id_eql = rb_intern("==");
|
11
|
+
int value1, value2;
|
12
12
|
|
13
13
|
/* Get the sizes of both arrays. */
|
14
14
|
|
15
|
-
l1 =
|
16
|
-
l2 =
|
15
|
+
l1 = RARRAY_LEN(rb_o1);
|
16
|
+
l2 = RARRAY_LEN(rb_o2);
|
17
17
|
|
18
18
|
/* Convert Ruby's threshold to C's threshold. */
|
19
19
|
|
@@ -26,13 +26,14 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
|
|
26
26
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
27
27
|
|
28
28
|
offset = 0;
|
29
|
-
|
29
|
+
|
30
|
+
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)))) {
|
30
31
|
offset++;
|
31
32
|
}
|
32
33
|
|
33
34
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
34
35
|
|
35
|
-
while ((
|
36
|
+
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)))) {
|
36
37
|
l1--;
|
37
38
|
l2--;
|
38
39
|
}
|
@@ -57,12 +58,8 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
|
|
57
58
|
|
58
59
|
/* Allocate memory for both rows */
|
59
60
|
|
60
|
-
prev_row = ALLOC_N(int, l1+1);
|
61
|
-
curr_row = ALLOC_N(int, l1+1);
|
62
|
-
|
63
|
-
if ((prev_row == NULL) || (curr_row == NULL)) {
|
64
|
-
rb_raise(rb_eNoMemError, "out of memory");
|
65
|
-
}
|
61
|
+
prev_row = (int*) ALLOC_N(int, (l1+1));
|
62
|
+
curr_row = (int*) ALLOC_N(int, (l1+1));
|
66
63
|
|
67
64
|
/* Initialize the current row. */
|
68
65
|
|
@@ -73,7 +70,9 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
|
|
73
70
|
for (row=1; row<=l2; row++) {
|
74
71
|
/* Swap the row buffers so that the row just computed becomes the previous row. */
|
75
72
|
|
76
|
-
|
73
|
+
temp_row = prev_row;
|
74
|
+
prev_row = curr_row;
|
75
|
+
curr_row = temp_row;
|
77
76
|
|
78
77
|
/* Calculate the values of the current row. */
|
79
78
|
|
@@ -83,25 +82,29 @@ VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_
|
|
83
82
|
for (col=1; col<=l1; col++) {
|
84
83
|
/* Equal (cost=0) or substitution (cost=1). */
|
85
84
|
|
86
|
-
|
85
|
+
value1 = prev_row[col-1] + (RTEST(rb_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1))) ? 0 : 1);
|
87
86
|
|
88
87
|
/* Insertion if it's cheaper than substitution. */
|
89
88
|
|
90
|
-
|
91
|
-
|
89
|
+
value2 = prev_row[col]+1;
|
90
|
+
if (value2 < value1) {
|
91
|
+
value1 = value2;
|
92
92
|
}
|
93
93
|
|
94
94
|
/* Deletion if it's cheaper than substitution. */
|
95
95
|
|
96
|
-
|
97
|
-
|
96
|
+
value2 = curr_row[col-1]+1;
|
97
|
+
if (value2 < value1) {
|
98
|
+
value1 = value2;
|
98
99
|
}
|
99
100
|
|
100
101
|
/* Keep track of the minimum value on this row. */
|
101
102
|
|
102
|
-
if (
|
103
|
-
curr_row_min =
|
103
|
+
if (value1 < curr_row_min) {
|
104
|
+
curr_row_min = value1;
|
104
105
|
}
|
106
|
+
|
107
|
+
curr_row[col] = value1;
|
105
108
|
}
|
106
109
|
|
107
110
|
/* Return nil as soon as we exceed the threshold. */
|
@@ -1,17 +1,19 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
int threshold;
|
5
6
|
int l1, l2;
|
6
|
-
int *prev_row, *curr_row;
|
7
|
+
int *prev_row, *curr_row, *temp_row;
|
7
8
|
int col, row;
|
8
9
|
int curr_row_min, result;
|
9
10
|
int offset;
|
11
|
+
int value1, value2;
|
10
12
|
|
11
13
|
/* Get the sizes of both arrays. */
|
12
14
|
|
13
|
-
l1 =
|
14
|
-
l2 =
|
15
|
+
l1 = RARRAY_LEN(rb_o1);
|
16
|
+
l2 = RARRAY_LEN(rb_o2);
|
15
17
|
|
16
18
|
/* Convert Ruby's threshold to C's threshold. */
|
17
19
|
|
@@ -24,13 +26,14 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
|
|
24
26
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
25
27
|
|
26
28
|
offset = 0;
|
27
|
-
|
29
|
+
|
30
|
+
while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset)) == 0)) {
|
28
31
|
offset++;
|
29
32
|
}
|
30
33
|
|
31
34
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
32
35
|
|
33
|
-
while ((
|
36
|
+
while ((offset < l1) && (offset < l2) && (rb_str_cmp(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1)) == 0 )) {
|
34
37
|
l1--;
|
35
38
|
l2--;
|
36
39
|
}
|
@@ -55,12 +58,8 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
|
|
55
58
|
|
56
59
|
/* Allocate memory for both rows */
|
57
60
|
|
58
|
-
prev_row = ALLOC_N(int, l1+1);
|
59
|
-
curr_row = ALLOC_N(int, l1+1);
|
60
|
-
|
61
|
-
if ((prev_row == NULL) || (curr_row == NULL)) {
|
62
|
-
rb_raise(rb_eNoMemError, "out of memory");
|
63
|
-
}
|
61
|
+
prev_row = (int*) ALLOC_N(int, (l1+1));
|
62
|
+
curr_row = (int*) ALLOC_N(int, (l1+1));
|
64
63
|
|
65
64
|
/* Initialize the current row. */
|
66
65
|
|
@@ -71,7 +70,9 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
|
|
71
70
|
for (row=1; row<=l2; row++) {
|
72
71
|
/* Swap the row buffers so that the row just computed becomes the previous row. */
|
73
72
|
|
74
|
-
|
73
|
+
temp_row = prev_row;
|
74
|
+
prev_row = curr_row;
|
75
|
+
curr_row = temp_row;
|
75
76
|
|
76
77
|
/* Calculate the values of the current row. */
|
77
78
|
|
@@ -81,25 +82,29 @@ VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2
|
|
81
82
|
for (col=1; col<=l1; col++) {
|
82
83
|
/* Equal (cost=0) or substitution (cost=1). */
|
83
84
|
|
84
|
-
|
85
|
+
value1 = prev_row[col-1] + ((rb_str_cmp(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) == 0) ? 0 : 1);
|
85
86
|
|
86
87
|
/* Insertion if it's cheaper than substitution. */
|
87
88
|
|
88
|
-
|
89
|
-
|
89
|
+
value2 = prev_row[col]+1;
|
90
|
+
if (value2 < value1) {
|
91
|
+
value1 = value2;
|
90
92
|
}
|
91
93
|
|
92
94
|
/* Deletion if it's cheaper than substitution. */
|
93
95
|
|
94
|
-
|
95
|
-
|
96
|
+
value2 = curr_row[col-1]+1;
|
97
|
+
if (value2 < value1) {
|
98
|
+
value1 = value2;
|
96
99
|
}
|
97
100
|
|
98
101
|
/* Keep track of the minimum value on this row. */
|
99
102
|
|
100
|
-
if (
|
101
|
-
curr_row_min =
|
103
|
+
if (value1 < curr_row_min) {
|
104
|
+
curr_row_min = value1;
|
102
105
|
}
|
106
|
+
|
107
|
+
curr_row[col] = value1;
|
103
108
|
}
|
104
109
|
|
105
110
|
/* Return nil as soon as we exceed the threshold. */
|
@@ -1,4 +1,5 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2)) == T_STRING) {
|
@@ -15,7 +16,7 @@ VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_t
|
|
15
16
|
}
|
16
17
|
|
17
18
|
void Init_levenshtein_fast() {
|
18
|
-
|
19
|
+
mLevenshtein = rb_const_get(rb_mKernel, rb_intern("Levenshtein"));
|
19
20
|
|
20
|
-
rb_define_singleton_method(mLevenshtein, "
|
21
|
+
rb_define_singleton_method(mLevenshtein, "distance_fast" , levenshtein_distance_fast, 3);
|
21
22
|
}
|
@@ -1,16 +1,17 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
int threshold;
|
5
6
|
int l1, l2;
|
6
|
-
int *prev_row, *curr_row;
|
7
|
+
int *prev_row, *curr_row, *temp_row;
|
7
8
|
int col, row;
|
8
9
|
int curr_row_min, result;
|
9
10
|
int offset;
|
11
|
+
int value1, value2;
|
10
12
|
|
11
|
-
ID
|
12
|
-
ID
|
13
|
-
ID id_equal = rb_intern("==");
|
13
|
+
ID id_length = rb_intern("length");
|
14
|
+
ID id_get = rb_intern("[]");
|
14
15
|
|
15
16
|
/* Get the sizes of both sequences. */
|
16
17
|
|
@@ -28,13 +29,14 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
|
|
28
29
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
29
30
|
|
30
31
|
offset = 0;
|
31
|
-
|
32
|
+
|
33
|
+
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
|
32
34
|
offset++;
|
33
35
|
}
|
34
36
|
|
35
37
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
36
38
|
|
37
|
-
while ((
|
39
|
+
while ((offset < l1) && (offset < l2) && RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
|
38
40
|
l1--;
|
39
41
|
l2--;
|
40
42
|
}
|
@@ -59,12 +61,8 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
|
|
59
61
|
|
60
62
|
/* Allocate memory for both rows */
|
61
63
|
|
62
|
-
prev_row = ALLOC_N(int, l1+1);
|
63
|
-
curr_row = ALLOC_N(int, l1+1);
|
64
|
-
|
65
|
-
if ((prev_row == NULL) || (curr_row == NULL)) {
|
66
|
-
rb_raise(rb_eNoMemError, "out of memory");
|
67
|
-
}
|
64
|
+
prev_row = (int*) ALLOC_N(int, (l1+1));
|
65
|
+
curr_row = (int*) ALLOC_N(int, (l1+1));
|
68
66
|
|
69
67
|
/* Initialize the current row. */
|
70
68
|
|
@@ -75,7 +73,9 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
|
|
75
73
|
for (row=1; row<=l2; row++) {
|
76
74
|
/* Swap the row buffers so that the row just computed becomes the previous row. */
|
77
75
|
|
78
|
-
|
76
|
+
temp_row = prev_row;
|
77
|
+
prev_row = curr_row;
|
78
|
+
curr_row = temp_row;
|
79
79
|
|
80
80
|
/* Calculate the values of the current row. */
|
81
81
|
|
@@ -85,25 +85,29 @@ VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE r
|
|
85
85
|
for (col=1; col<=l1; col++) {
|
86
86
|
/* Equal (cost=0) or substitution (cost=1). */
|
87
87
|
|
88
|
-
|
88
|
+
value1 = prev_row[col-1] + (RTEST(rb_equal(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1)), rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1)))) ? 0 : 1);
|
89
89
|
|
90
90
|
/* Insertion if it's cheaper than substitution. */
|
91
91
|
|
92
|
-
|
93
|
-
|
92
|
+
value2 = prev_row[col]+1;
|
93
|
+
if (value2 < value1) {
|
94
|
+
value1 = value2;
|
94
95
|
}
|
95
96
|
|
96
97
|
/* Deletion if it's cheaper than substitution. */
|
97
98
|
|
98
|
-
|
99
|
-
|
99
|
+
value2 = curr_row[col-1]+1;
|
100
|
+
if (value2 < value1) {
|
101
|
+
value1 = value2;
|
100
102
|
}
|
101
103
|
|
102
104
|
/* Keep track of the minimum value on this row. */
|
103
105
|
|
104
|
-
if (
|
105
|
-
curr_row_min =
|
106
|
+
if (value1 < curr_row_min) {
|
107
|
+
curr_row_min = value1;
|
106
108
|
}
|
109
|
+
|
110
|
+
curr_row[col] = value1;
|
107
111
|
}
|
108
112
|
|
109
113
|
/* Return nil as soon as we exceed the threshold. */
|
@@ -1,25 +1,27 @@
|
|
1
1
|
#include "ruby.h"
|
2
|
+
#include "levenshtein.h"
|
2
3
|
|
3
4
|
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
|
4
5
|
int threshold;
|
5
6
|
int l1, l2;
|
6
|
-
int *prev_row, *curr_row;
|
7
|
+
int *prev_row, *curr_row, *temp_row;
|
7
8
|
int col, row;
|
8
9
|
int curr_row_min, result;
|
9
10
|
int offset;
|
11
|
+
int value1, value2;
|
10
12
|
char *s1, *s2;
|
11
13
|
|
12
14
|
/* Convert Ruby's s1 to C's s1. */
|
13
15
|
|
14
16
|
rb_o1 = StringValue(rb_o1);
|
15
|
-
s1 =
|
16
|
-
l1 =
|
17
|
+
s1 = RSTRING_PTR(rb_o1);
|
18
|
+
l1 = RSTRING_LEN(rb_o1);
|
17
19
|
|
18
20
|
/* Convert Ruby's s2 to C's s2. */
|
19
21
|
|
20
22
|
rb_o2 = StringValue(rb_o2);
|
21
|
-
s2 =
|
22
|
-
l2 =
|
23
|
+
s2 = RSTRING_PTR(rb_o2);
|
24
|
+
l2 = RSTRING_LEN(rb_o2);
|
23
25
|
|
24
26
|
/* Convert Ruby's threshold to C's threshold. */
|
25
27
|
|
@@ -32,13 +34,14 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
|
|
32
34
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common prefix. */
|
33
35
|
|
34
36
|
offset = 0;
|
35
|
-
|
37
|
+
|
38
|
+
while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
|
36
39
|
offset++;
|
37
40
|
}
|
38
41
|
|
39
42
|
/* Do the expensive calculation on a subset of the sequences, if possible, by removing the common postfix. */
|
40
43
|
|
41
|
-
while ((
|
44
|
+
while ((offset < l1) && (offset < l2) && (s1[l1-1] == s2[l2-1])) {
|
42
45
|
l1--;
|
43
46
|
l2--;
|
44
47
|
}
|
@@ -63,12 +66,8 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
|
|
63
66
|
|
64
67
|
/* Allocate memory for both rows */
|
65
68
|
|
66
|
-
prev_row = ALLOC_N(int, l1+1);
|
67
|
-
curr_row = ALLOC_N(int, l1+1);
|
68
|
-
|
69
|
-
if ((prev_row == NULL) || (curr_row == NULL)) {
|
70
|
-
rb_raise(rb_eNoMemError, "out of memory");
|
71
|
-
}
|
69
|
+
prev_row = (int*) ALLOC_N(int, (l1+1));
|
70
|
+
curr_row = (int*) ALLOC_N(int, (l1+1));
|
72
71
|
|
73
72
|
/* Initialize the current row. */
|
74
73
|
|
@@ -79,7 +78,9 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
|
|
79
78
|
for (row=1; row<=l2; row++) {
|
80
79
|
/* Swap the row buffers so that the row just computed becomes the previous row. */
|
81
80
|
|
82
|
-
|
81
|
+
temp_row = prev_row;
|
82
|
+
prev_row = curr_row;
|
83
|
+
curr_row = temp_row;
|
83
84
|
|
84
85
|
/* Calculate the values of the current row. */
|
85
86
|
|
@@ -89,25 +90,29 @@ VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb
|
|
89
90
|
for (col=1; col<=l1; col++) {
|
90
91
|
/* Equal (cost=0) or substitution (cost=1). */
|
91
92
|
|
92
|
-
|
93
|
+
value1 = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);
|
93
94
|
|
94
95
|
/* Insertion if it's cheaper than substitution. */
|
95
96
|
|
96
|
-
|
97
|
-
|
97
|
+
value2 = prev_row[col]+1;
|
98
|
+
if (value2 < value1) {
|
99
|
+
value1 = value2;
|
98
100
|
}
|
99
101
|
|
100
102
|
/* Deletion if it's cheaper than substitution. */
|
101
103
|
|
102
|
-
|
103
|
-
|
104
|
+
value2 = curr_row[col-1]+1;
|
105
|
+
if (value2 < value1) {
|
106
|
+
value1 = value2;
|
104
107
|
}
|
105
108
|
|
106
109
|
/* Keep track of the minimum value on this row. */
|
107
110
|
|
108
|
-
if (
|
109
|
-
curr_row_min =
|
111
|
+
if (value1 < curr_row_min) {
|
112
|
+
curr_row_min = value1;
|
110
113
|
}
|
114
|
+
|
115
|
+
curr_row[col] = value1;
|
111
116
|
}
|
112
117
|
|
113
118
|
/* Return nil as soon as we exceed the threshold. */
|
data/lib/levenshtein.rb
CHANGED
@@ -1,44 +1,25 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
rescue LoadError
|
4
|
-
begin
|
5
|
-
require "levenshtein_fast" # If compiled by the build script.
|
6
|
-
rescue LoadError
|
7
|
-
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
|
8
|
-
end
|
9
|
-
end
|
10
|
-
|
11
|
-
# The Levenshtein distance is a metric for measuring the amount
|
12
|
-
# of difference between two sequences (i.e., the so called edit
|
13
|
-
# distance). The Levenshtein distance between two sequences is
|
14
|
-
# given by the minimum number of operations needed to transform
|
15
|
-
# one sequence into the other, where an operation is an
|
16
|
-
# insertion, deletion, or substitution of a single element.
|
17
|
-
#
|
18
|
-
# More information about the Levenshtein distance algorithm:
|
19
|
-
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
1
|
+
require "levenshtein/exception"
|
2
|
+
require "levenshtein/version"
|
20
3
|
|
21
4
|
module Levenshtein
|
22
|
-
VERSION = "0.2.0"
|
23
|
-
|
24
5
|
# Returns the Levenshtein distance as a number between 0.0 and
|
25
6
|
# 1.0. It's basically the Levenshtein distance divided by the
|
26
7
|
# length of the longest sequence.
|
27
8
|
|
28
|
-
def self.normalized_distance(
|
29
|
-
|
9
|
+
def self.normalized_distance(a1, a2, threshold=nil)
|
10
|
+
a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
|
30
11
|
|
31
|
-
if
|
32
|
-
0.0 # Since
|
12
|
+
if a2.length == 0
|
13
|
+
0.0 # Since a1.length <= a2.length, a1 must be empty as well.
|
33
14
|
else
|
34
15
|
if threshold
|
35
|
-
if d = self.distance(
|
36
|
-
d.to_f/
|
16
|
+
if d = self.distance(a1, a2, (threshold*a2.length+1).to_i)
|
17
|
+
d.to_f/a2.length
|
37
18
|
else
|
38
19
|
nil
|
39
20
|
end
|
40
21
|
else
|
41
|
-
self.distance(
|
22
|
+
self.distance(a1, a2).to_f/a2.length
|
42
23
|
end
|
43
24
|
end
|
44
25
|
end
|
@@ -53,47 +34,64 @@ module Levenshtein
|
|
53
34
|
# The sequences should respond to :length and :[] and all objects
|
54
35
|
# in the sequences (as returned by []) should respond to :==.
|
55
36
|
|
56
|
-
def self.distance(
|
57
|
-
|
37
|
+
def self.distance(a1, a2, threshold=nil)
|
38
|
+
a1, a2 = a2, a1 if a1.length > a2.length # a1 is the short one; a2 is the long one.
|
58
39
|
|
59
40
|
# Handle some basic circumstances.
|
60
41
|
|
61
|
-
return 0 if
|
62
|
-
return
|
42
|
+
return 0 if a1 == a2
|
43
|
+
return a2.length if a1.length == 0
|
63
44
|
|
64
45
|
if threshold
|
65
|
-
return nil if (
|
46
|
+
return nil if (a2.length-a1.length) >= threshold
|
66
47
|
|
67
|
-
|
68
|
-
|
69
|
-
|
48
|
+
a3, a4 = nil, nil
|
49
|
+
a3, a4 = a1, a2 if a1.respond_to?(:-) and a2.respond_to?(:-)
|
50
|
+
a3, a4 = a1.scan(/./), a2.scan(/./) if a1.respond_to?(:scan) and a2.respond_to?(:scan)
|
70
51
|
|
71
|
-
if
|
72
|
-
return nil if (
|
73
|
-
return nil if (
|
52
|
+
if a3 and a4
|
53
|
+
return nil if (a3-a4).length >= threshold
|
54
|
+
return nil if (a4-a3).length >= threshold
|
74
55
|
end
|
75
56
|
end
|
76
57
|
|
77
|
-
distance_fast_or_slow(
|
58
|
+
distance_fast_or_slow(a1, a2, threshold)
|
78
59
|
end
|
79
60
|
|
80
|
-
def self.distance_fast_or_slow(
|
81
|
-
if respond_to?(:
|
82
|
-
|
61
|
+
def self.distance_fast_or_slow(a1, a2, threshold) # :nodoc:
|
62
|
+
if respond_to?(:distance_fast)
|
63
|
+
distance_fast(a1, a2, threshold) # Implemented in C.
|
83
64
|
else
|
84
|
-
|
65
|
+
distance_slow(a1, a2, threshold) # Implemented in Ruby.
|
85
66
|
end
|
86
67
|
end
|
87
68
|
|
88
|
-
def self.
|
89
|
-
|
69
|
+
def self.distance_slow(a1, a2, threshold) # :nodoc:
|
70
|
+
l1 = a1.length
|
71
|
+
l2 = a2.length
|
90
72
|
|
91
|
-
|
92
|
-
|
93
|
-
|
73
|
+
offset = 0
|
74
|
+
|
75
|
+
while offset < l1 and offset < l2 and a1[offset] == a2[offset]
|
76
|
+
offset += 1
|
77
|
+
end
|
78
|
+
|
79
|
+
while offset < l1 and offset < l2 and a1[l1-1] == a2[l2-1]
|
80
|
+
l1 -= 1
|
81
|
+
l2 -= 1
|
82
|
+
end
|
83
|
+
|
84
|
+
l1 -= offset
|
85
|
+
l2 -= offset
|
94
86
|
|
95
|
-
|
96
|
-
|
87
|
+
crow = (0..l1).to_a
|
88
|
+
|
89
|
+
1.upto(l2) do |y|
|
90
|
+
prow = crow
|
91
|
+
crow = [y]
|
92
|
+
|
93
|
+
1.upto(l1) do |x|
|
94
|
+
crow[x] = [prow[x]+1, crow[x-1]+1, prow[x-1]+(a1[offset+x-1]==a2[offset+y-1] ? 0 : 1)].min
|
97
95
|
end
|
98
96
|
|
99
97
|
# Stop analysing this sequence as soon as the best possible
|
@@ -101,9 +99,19 @@ module Levenshtein
|
|
101
99
|
# (The minimum value in the next row will be equal to or greater
|
102
100
|
# than the minimum value in this row.)
|
103
101
|
|
104
|
-
return nil if threshold and
|
102
|
+
return nil if threshold and crow.min >= threshold
|
105
103
|
end
|
106
104
|
|
107
|
-
|
105
|
+
crow[-1]
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
begin
|
110
|
+
require "levenshtein/levenshtein_fast" # Compiled by RubyGems.
|
111
|
+
rescue LoadError
|
112
|
+
begin
|
113
|
+
require "levenshtein_fast" # Compiled by the build script.
|
114
|
+
rescue LoadError
|
115
|
+
$stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein. Using the much slower Ruby version instead."
|
108
116
|
end
|
109
117
|
end
|
data/test/test.rb
CHANGED
@@ -12,6 +12,8 @@ module Levenshtein
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def [](pos)
|
15
|
+
raise "type not allowed [#{pos.inspect}]" unless pos.kind_of?(Fixnum)
|
16
|
+
|
15
17
|
@sequence[pos]
|
16
18
|
end
|
17
19
|
end
|
@@ -105,21 +107,31 @@ end
|
|
105
107
|
|
106
108
|
class TestLevenshteinSlow < Test::Unit::TestCase
|
107
109
|
def test_erik_veenstra
|
108
|
-
assert_equal(7, Levenshtein.
|
110
|
+
assert_equal(7, Levenshtein.distance_slow("erik", "veenstra", nil))
|
111
|
+
assert_equal(7, Levenshtein.distance_slow("veenstra", "erik", nil))
|
109
112
|
end
|
110
113
|
|
111
|
-
def
|
112
|
-
assert_equal(0, Levenshtein.
|
113
|
-
assert_equal(3, Levenshtein.
|
114
|
+
def test_empty_string
|
115
|
+
assert_equal(0, Levenshtein.distance_slow("", "", nil))
|
116
|
+
assert_equal(3, Levenshtein.distance_slow("", "foo", nil))
|
117
|
+
assert_equal(3, Levenshtein.distance_slow("foo", "", nil))
|
114
118
|
end
|
115
119
|
|
116
|
-
def
|
117
|
-
assert_equal(0, Levenshtein.
|
118
|
-
assert_equal(0, Levenshtein.
|
120
|
+
def test_same_string
|
121
|
+
assert_equal(0, Levenshtein.distance_slow("", "", nil))
|
122
|
+
assert_equal(0, Levenshtein.distance_slow("foo", "foo", nil))
|
119
123
|
end
|
120
124
|
|
121
125
|
def test_threshold
|
122
|
-
assert_equal(3, Levenshtein.
|
123
|
-
assert_equal(
|
126
|
+
assert_equal(3, Levenshtein.distance_slow("foo", "foobar", nil))
|
127
|
+
assert_equal(3, Levenshtein.distance_slow("foo", "foobar", 4))
|
128
|
+
assert_equal(nil, Levenshtein.distance_slow("foo", "foobar", 2))
|
129
|
+
end
|
130
|
+
|
131
|
+
def test_same_head_and_or_tail
|
132
|
+
assert_equal(3, Levenshtein.distance_slow("ab123cd", "abxyzcd", nil))
|
133
|
+
assert_equal(3, Levenshtein.distance_slow("ab123", "abxyz", nil))
|
134
|
+
assert_equal(3, Levenshtein.distance_slow("123cd", "xyzcd", nil))
|
135
|
+
assert_equal(5, Levenshtein.distance_slow("123cd123", "123", nil))
|
124
136
|
end
|
125
137
|
end
|
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: levenshtein
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 21
|
5
|
+
prerelease:
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 1
|
10
|
+
version: 0.2.1
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Erik Veenstra
|
@@ -9,8 +15,7 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
13
|
-
default_executable:
|
18
|
+
date: 2012-02-11 00:00:00 Z
|
14
19
|
dependencies: []
|
15
20
|
|
16
21
|
description: Calculates the Levenshtein distance between two byte strings.
|
@@ -22,18 +27,21 @@ extensions:
|
|
22
27
|
extra_rdoc_files: []
|
23
28
|
|
24
29
|
files:
|
30
|
+
- lib/levenshtein/exception.rb
|
31
|
+
- lib/levenshtein/version.rb
|
25
32
|
- lib/levenshtein.rb
|
26
|
-
- ext/levenshtein/extconf.rb
|
27
|
-
- ext/levenshtein/levenshtein_array_of_strings.c
|
28
|
-
- ext/levenshtein/levenshtein_fast.c
|
29
33
|
- ext/levenshtein/levenshtein_string.c
|
30
34
|
- ext/levenshtein/levenshtein_generic.c
|
35
|
+
- ext/levenshtein/levenshtein.h
|
36
|
+
- ext/levenshtein/levenshtein_fast.c
|
37
|
+
- ext/levenshtein/levenshtein_array_of_strings.c
|
31
38
|
- ext/levenshtein/levenshtein_array.c
|
39
|
+
- ext/levenshtein/extconf.rb
|
32
40
|
- README
|
33
41
|
- LICENSE
|
34
42
|
- VERSION
|
35
43
|
- CHANGELOG
|
36
|
-
|
44
|
+
- test/test.rb
|
37
45
|
homepage: http://www.erikveen.dds.nl/levenshtein/index.html
|
38
46
|
licenses: []
|
39
47
|
|
@@ -44,27 +52,33 @@ rdoc_options:
|
|
44
52
|
- VERSION
|
45
53
|
- CHANGELOG
|
46
54
|
- --title
|
47
|
-
- levenshtein (0.2.
|
55
|
+
- levenshtein (0.2.1)
|
48
56
|
- --main
|
49
57
|
- README
|
50
58
|
require_paths:
|
51
59
|
- lib
|
52
60
|
required_ruby_version: !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
53
62
|
requirements:
|
54
63
|
- - ">="
|
55
64
|
- !ruby/object:Gem::Version
|
65
|
+
hash: 3
|
66
|
+
segments:
|
67
|
+
- 0
|
56
68
|
version: "0"
|
57
|
-
version:
|
58
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
70
|
+
none: false
|
59
71
|
requirements:
|
60
72
|
- - ">="
|
61
73
|
- !ruby/object:Gem::Version
|
74
|
+
hash: 3
|
75
|
+
segments:
|
76
|
+
- 0
|
62
77
|
version: "0"
|
63
|
-
version:
|
64
78
|
requirements: []
|
65
79
|
|
66
80
|
rubyforge_project: levenshtein
|
67
|
-
rubygems_version: 1.
|
81
|
+
rubygems_version: 1.8.12
|
68
82
|
signing_key:
|
69
83
|
specification_version: 3
|
70
84
|
summary: Calculates the Levenshtein distance between two byte strings.
|