levenshtein-extended 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +7 -0
- data/Gemfile +4 -0
- data/README +14 -0
- data/Rakefile +3 -0
- data/ext/levenshtein_in_c/extconf.rb +10 -0
- data/ext/levenshtein_in_c/levenshtein_array.c +127 -0
- data/ext/levenshtein_in_c/levenshtein_array_of_strings.c +125 -0
- data/ext/levenshtein_in_c/levenshtein_fast.c +21 -0
- data/ext/levenshtein_in_c/levenshtein_generic.c +129 -0
- data/ext/levenshtein_in_c/levenshtein_string.c +133 -0
- data/levenshtein.gemspec +20 -0
- data/lib/levenshtein.rb +108 -0
- data/lib/levenshtein/version.rb +3 -0
- data/lib/levenshtein_in_c.bundle +0 -0
- metadata +70 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
The levenshtein module implements fast Levenshtein edit distance computation (insertions, deletions and substitutions; transpositions are not treated as a single edit) in O(n) memory and O(n^2) time, using a C wrapper.
|
2
|
+
|
3
|
+
USAGE:
|
4
|
+
|
5
|
+
@install
|
6
|
+
git clone git://github.com/esdras/levenshtein.git
|
7
|
+
cd levenshtein
|
8
|
+
rake compile
|
9
|
+
rake install
|
10
|
+
|
11
|
+
@usage
|
12
|
+
require 'levenshtein'
|
13
|
+
Levenshtein.normalized_distance("string 1", "string 2")
|
14
|
+
Levenshtein.normalized_distance([2, 3, 4, 5], [1, 2, 3, 4])
|
data/Rakefile
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require "mkmf"
|
2
|
+
|
3
|
+
# Build configuration for the levenshtein_in_c extension (mkmf script).

# Accept --with-levenshtein_in_c-dir style options.
dir_config("levenshtein_in_c")

# Probe for each of the libraries, in the same order as before.
%w[
  levenshtein_array
  levenshtein_array_of_strings
  levenshtein_generic
  levenshtein_string
].each do |library|
  have_library(library)
end

# Emit the Makefile that builds levenshtein_in_c/levenshtein_in_c.
create_makefile("levenshtein_in_c/levenshtein_in_c")
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Levenshtein distance between two Ruby Arrays whose elements are compared
 * with `==`.
 *
 * self         - receiver (unused).
 * rb_o1, rb_o2 - the two Arrays.
 * rb_threshold - nil for "no limit", otherwise an Integer cut-off.
 *
 * Returns the distance as a Fixnum, or nil as soon as the smallest value on
 * a freshly computed matrix row is >= the threshold.
 *
 * NOTE(review): if an element's `==` raises, the two row buffers are leaked;
 * wrap the computation in rb_ensure if that matters to callers.
 */
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  long l1, l2;
  long offset, col, row;
  int *prev_row, *curr_row, *tmp_row;
  int curr_row_min, result;
  VALUE e1, e2;

  ID id_eql = rb_intern("==");

  /* Get the sizes of both arrays (RARRAY_LEN takes the VALUE itself). */

  l1 = RARRAY_LEN(rb_o1);
  l2 = RARRAY_LEN(rb_o2);

  /* Convert Ruby's threshold to C's threshold; -1 means "no threshold". */

  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);

  /*
   * Skip the common prefix. The bounds checks are essential: rb_ary_entry
   * returns nil past the end, and nil == nil would otherwise let offset run
   * beyond the shorter array (negative length, buffer overrun, endless loop).
   */

  offset = 0;
  while ((offset < l1) && (offset < l2) &&
         RTEST(rb_funcall(rb_ary_entry(rb_o1, offset), id_eql, 1, rb_ary_entry(rb_o2, offset)))) {
    offset++;
  }

  /* Skip the common postfix, always leaving at least one element per side. */

  while ((l1-1 > offset) && (l2-1 > offset) &&
         RTEST(rb_funcall(rb_ary_entry(rb_o1, l1-1), id_eql, 1, rb_ary_entry(rb_o2, l2-1)))) {
    l1--;
    l2--;
  }

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself.  */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /*
   * Allocate memory for both rows. ALLOC_N raises NoMemoryError itself on
   * failure, so no NULL check is needed; the memory must be released with
   * xfree, not free.
   */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = (int)col;
  }

  for (row=1; row<=l2; row++) {
    /* The current row becomes the previous row; every cell of the new
       current row is overwritten below, so swapping the buffers suffices. */

    tmp_row  = prev_row;
    prev_row = curr_row;
    curr_row = tmp_row;

    /* Calculate the values of the current row. */

    curr_row[0]  = (int)row;
    curr_row_min = (int)row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      e1 = rb_ary_entry(rb_o1, offset+col-1);
      e2 = rb_ary_entry(rb_o2, offset+row-1);

      curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(e1, id_eql, 1, e2)) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as we exceed the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -0,0 +1,125 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Element equality used by levenshtein_distance_array_of_strings.
 * Two Strings are compared with rb_str_cmp; anything else (for example nil
 * or a non-String element further down the array) falls back to rb_equal,
 * because handing a non-String to rb_str_cmp is not safe.
 */
static int levenshtein_aos_elements_equal(VALUE a, VALUE b) {
  if ((TYPE(a) == T_STRING) && (TYPE(b) == T_STRING)) {
    return rb_str_cmp(a, b) == 0;
  }

  return RTEST(rb_equal(a, b));
}

/*
 * Levenshtein distance between two Ruby Arrays that are expected to hold
 * Strings.
 *
 * self         - receiver (unused).
 * rb_o1, rb_o2 - the two Arrays.
 * rb_threshold - nil for "no limit", otherwise an Integer cut-off.
 *
 * Returns the distance as a Fixnum, or nil as soon as the smallest value on
 * a freshly computed matrix row is >= the threshold.
 */
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  long l1, l2;
  long offset, col, row;
  int *prev_row, *curr_row, *tmp_row;
  int curr_row_min, result;

  /* Get the sizes of both arrays (RARRAY_LEN takes the VALUE itself). */

  l1 = RARRAY_LEN(rb_o1);
  l2 = RARRAY_LEN(rb_o2);

  /* Convert Ruby's threshold to C's threshold; -1 means "no threshold". */

  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);

  /*
   * Skip the common prefix. Without the bounds checks rb_ary_entry returns
   * nil past the end of an array and the comparison would run on nil.
   */

  offset = 0;
  while ((offset < l1) && (offset < l2) &&
         levenshtein_aos_elements_equal(rb_ary_entry(rb_o1, offset), rb_ary_entry(rb_o2, offset))) {
    offset++;
  }

  /* Skip the common postfix, always leaving at least one element per side. */

  while ((l1-1 > offset) && (l2-1 > offset) &&
         levenshtein_aos_elements_equal(rb_ary_entry(rb_o1, l1-1), rb_ary_entry(rb_o2, l2-1))) {
    l1--;
    l2--;
  }

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself.  */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /*
   * Allocate memory for both rows. ALLOC_N raises NoMemoryError itself on
   * failure, so no NULL check is needed; the memory must be released with
   * xfree, not free.
   */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = (int)col;
  }

  for (row=1; row<=l2; row++) {
    /* The current row becomes the previous row; every cell of the new
       current row is overwritten below, so swapping the buffers suffices. */

    tmp_row  = prev_row;
    prev_row = curr_row;
    curr_row = tmp_row;

    /* Calculate the values of the current row. */

    curr_row[0]  = (int)row;
    curr_row_min = (int)row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      curr_row[col] = prev_row[col-1] +
        (levenshtein_aos_elements_equal(rb_ary_entry(rb_o1, offset+col-1), rb_ary_entry(rb_o2, offset+row-1)) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as we exceed the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Prototypes of the specialised implementations that live in the other
 * translation units. Without them the calls below are implicit declarations
 * (invalid since C99) whose assumed `int` return type can truncate a VALUE.
 */
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
VALUE levenshtein_distance_array(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
VALUE levenshtein_distance_array_of_strings(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold);

/*
 * Dispatches to the implementation that matches the argument types:
 *
 *   String, String              -> levenshtein_distance_string
 *   Array,  Array (both first
 *           elements Strings)   -> levenshtein_distance_array_of_strings
 *   Array,  Array (otherwise)   -> levenshtein_distance_array
 *   anything else               -> levenshtein_distance_generic
 *
 * Only element 0 of each Array is inspected when choosing the
 * array-of-strings variant. The arguments and the return value are passed
 * through to / from the selected implementation unchanged.
 */
VALUE levenshtein_distance_fast(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  if ((TYPE(rb_o1) == T_STRING) && (TYPE(rb_o2) == T_STRING)) {
    return levenshtein_distance_string(self, rb_o1, rb_o2, rb_threshold);
  } else if ((TYPE(rb_o1) == T_ARRAY) && (TYPE(rb_o2) == T_ARRAY)) {
    if ((TYPE(rb_ary_entry(rb_o1, 0)) == T_STRING) && (TYPE(rb_ary_entry(rb_o2, 0)) == T_STRING)) {
      return levenshtein_distance_array_of_strings(self, rb_o1, rb_o2, rb_threshold);
    } else {
      return levenshtein_distance_array(self, rb_o1, rb_o2, rb_threshold);
    }
  } else {
    return levenshtein_distance_generic(self, rb_o1, rb_o2, rb_threshold);
  }
}
|
16
|
+
|
17
|
+
void Init_levenshtein_in_c() {
|
18
|
+
VALUE mLevenshtein = rb_define_module("Levenshtein");
|
19
|
+
|
20
|
+
rb_define_singleton_method(mLevenshtein, "levenshtein_distance_fast" , levenshtein_distance_fast, 3);
|
21
|
+
}
|
@@ -0,0 +1,129 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Levenshtein distance between two arbitrary Ruby sequences. The sequences
 * are accessed only through `length` and `[]`, and their elements are
 * compared with `==`.
 *
 * self         - receiver (unused).
 * rb_o1, rb_o2 - the two sequences.
 * rb_threshold - nil for "no limit", otherwise an Integer cut-off.
 *
 * Returns the distance as a Fixnum, or nil as soon as the smallest value on
 * a freshly computed matrix row is >= the threshold.
 *
 * NOTE(review): if any of the Ruby calls raises, the two row buffers are
 * leaked; wrap the computation in rb_ensure if that matters to callers.
 */
VALUE levenshtein_distance_generic(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  int l1, l2;
  int *prev_row, *curr_row, *tmp_row;
  int col, row;
  int curr_row_min, result;
  int offset;
  VALUE e1, e2;

  ID id_length = rb_intern("length");
  ID id_get    = rb_intern("[]");
  ID id_equal  = rb_intern("==");

  /* Get the sizes of both sequences. NUM2INT (unlike FIX2INT) verifies that
     the value really is an integer that fits in an int. */

  l1 = NUM2INT(rb_funcall(rb_o1, id_length, 0));
  l2 = NUM2INT(rb_funcall(rb_o2, id_length, 0));

  /* Convert Ruby's threshold to C's threshold; -1 means "no threshold". */

  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);

  /*
   * Skip the common prefix. The bounds checks keep offset inside both
   * sequences; what `[]` returns past the end is up to the sequence, and
   * equal out-of-range values would otherwise let offset run past the
   * shorter sequence (negative length, buffer overrun, endless loop).
   */

  offset = 0;
  while ((offset < l1) && (offset < l2) &&
         RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(offset)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(offset))))) {
    offset++;
  }

  /* Skip the common postfix, always leaving at least one element per side. */

  while ((l1-1 > offset) && (l2-1 > offset) &&
         RTEST(rb_funcall(rb_funcall(rb_o1, id_get, 1, INT2FIX(l1-1)), id_equal, 1, rb_funcall(rb_o2, id_get, 1, INT2FIX(l2-1))))) {
    l1--;
    l2--;
  }

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself.  */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /*
   * Allocate memory for both rows. ALLOC_N raises NoMemoryError itself on
   * failure, so no NULL check is needed; the memory must be released with
   * xfree, not free.
   */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = col;
  }

  for (row=1; row<=l2; row++) {
    /* The current row becomes the previous row; every cell of the new
       current row is overwritten below, so swapping the buffers suffices. */

    tmp_row  = prev_row;
    prev_row = curr_row;
    curr_row = tmp_row;

    /* Calculate the values of the current row. */

    curr_row[0]  = row;
    curr_row_min = row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      e1 = rb_funcall(rb_o1, id_get, 1, INT2FIX(offset+col-1));
      e2 = rb_funcall(rb_o2, id_get, 1, INT2FIX(offset+row-1));

      curr_row[col] = prev_row[col-1] + (RTEST(rb_funcall(e1, id_equal, 1, e2)) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as we exceed the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
@@ -0,0 +1,133 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/*
 * Levenshtein distance between two Ruby Strings.
 *
 * The comparison works on the raw bytes of both strings (one `char` at a
 * time), so a differing multi-byte character contributes once per
 * differing byte.
 *
 * self         - receiver (unused).
 * rb_o1, rb_o2 - the two Strings (converted with StringValue).
 * rb_threshold - nil for "no limit", otherwise an Integer cut-off.
 *
 * Returns the distance as a Fixnum, or nil as soon as the smallest value on
 * a freshly computed matrix row is >= the threshold.
 */
VALUE levenshtein_distance_string(VALUE self, VALUE rb_o1, VALUE rb_o2, VALUE rb_threshold) {
  int threshold;
  long l1, l2;
  long offset, col, row;
  int *prev_row, *curr_row, *tmp_row;
  int curr_row_min, result;
  const char *s1, *s2;

  /* Convert both arguments to Strings first, then take pointers and
     lengths, so no conversion runs after a pointer has been captured. */

  rb_o1 = StringValue(rb_o1);
  rb_o2 = StringValue(rb_o2);

  s1 = RSTRING_PTR(rb_o1);
  l1 = RSTRING_LEN(rb_o1);

  s2 = RSTRING_PTR(rb_o2);
  l2 = RSTRING_LEN(rb_o2);

  /* Convert Ruby's threshold to C's threshold; -1 means "no threshold". */

  threshold = NIL_P(rb_threshold) ? -1 : NUM2INT(rb_threshold);

  /*
   * Skip the common prefix. The bounds checks are required: Ruby strings
   * may contain NUL bytes, so relying on the terminator to stop the scan
   * can read past the end of the shorter buffer.
   */

  offset = 0;
  while ((offset < l1) && (offset < l2) && (s1[offset] == s2[offset])) {
    offset++;
  }

  /* Skip the common postfix, always leaving at least one byte per side. */

  while ((l1-1 > offset) && (l2-1 > offset) && (s1[l1-1] == s2[l2-1])) {
    l1--;
    l2--;
  }

  l1 -= offset;
  l2 -= offset;

  /* The Levenshtein algorithm itself.  */

  /*       s1=              */
  /*       ERIK             */
  /*                        */
  /*      01234             */
  /* s2=V 11234             */
  /*    E 21234             */
  /*    E 32234             */
  /*    N 43334 <- prev_row */
  /*    S 54444 <- curr_row */
  /*    T 65555             */
  /*    R 76566             */
  /*    A 87667             */

  /*
   * Allocate memory for both rows. ALLOC_N raises NoMemoryError itself on
   * failure, so no NULL check is needed; the memory must be released with
   * xfree, not free.
   */

  prev_row = ALLOC_N(int, l1+1);
  curr_row = ALLOC_N(int, l1+1);

  /* Initialize the current row. */

  for (col=0; col<=l1; col++) {
    curr_row[col] = (int)col;
  }

  for (row=1; row<=l2; row++) {
    /* The current row becomes the previous row; every cell of the new
       current row is overwritten below, so swapping the buffers suffices. */

    tmp_row  = prev_row;
    prev_row = curr_row;
    curr_row = tmp_row;

    /* Calculate the values of the current row. */

    curr_row[0]  = (int)row;
    curr_row_min = (int)row;

    for (col=1; col<=l1; col++) {
      /* Equal (cost=0) or substitution (cost=1). */

      curr_row[col] = prev_row[col-1] + ((s1[offset+col-1] == s2[offset+row-1]) ? 0 : 1);

      /* Insertion if it's cheaper than substitution. */

      if (prev_row[col]+1 < curr_row[col]) {
        curr_row[col] = prev_row[col]+1;
      }

      /* Deletion if it's cheaper than substitution. */

      if (curr_row[col-1]+1 < curr_row[col]) {
        curr_row[col] = curr_row[col-1]+1;
      }

      /* Keep track of the minimum value on this row. */

      if (curr_row[col] < curr_row_min) {
        curr_row_min = curr_row[col];
      }
    }

    /* Return nil as soon as we exceed the threshold. */

    if (threshold > -1 && curr_row_min >= threshold) {
      xfree(prev_row);
      xfree(curr_row);

      return Qnil;
    }
  }

  /* The result is the last value on the last row. */

  result = curr_row[l1];

  xfree(prev_row);
  xfree(curr_row);

  /* Return the Ruby version of the result. */

  return INT2FIX(result);
}
|
data/levenshtein.gemspec
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "levenshtein/version"
|
4
|
+
|
5
|
+
# Gem specification for levenshtein-extended.
Gem::Specification.new do |spec|
  # Identity and authorship.
  spec.name        = "levenshtein-extended"
  spec.version     = Levenshtein::VERSION
  spec.authors     = ["Esdras Mayrink"]
  spec.email       = ["falecom@oesdras.com.br"]
  spec.homepage    = ""
  spec.summary     = "fast string edit distance computation, using the Damerau-Levenshtein algorithm"
  spec.description = "The levenshtein module implements fast Damerau-Levenshtein edit distance computation in O(n) memory and O(n^2) time, using a C wrapper."

  spec.rubyforge_project = "levenshtein"

  # File lists are taken from what git tracks.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]
end
|
data/lib/levenshtein.rb
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
# Try to load the C implementation: first from the location used when the
# extension is compiled by RubyGems, then from the location used by the
# build script. If neither can be loaded, warn and carry on with the Ruby
# implementation.
c_extension_loaded = [
  "levenshtein/levenshtein_in_c", # If compiled by RubyGems.
  "levenshtein_in_c"              # If compiled by the build script.
].any? do |candidate|
  begin
    require candidate
    true
  rescue LoadError
    false
  end
end

unless c_extension_loaded
  $stderr.puts "WARNING: Couldn't find the fast C implementation of Levenshtein.distance. Using the much slower Ruby version instead."
end
|
10
|
+
|
11
|
+
# The Levenshtein distance is a metric for measuring the amount
|
12
|
+
# of difference between two sequences (i.e., the so called edit
|
13
|
+
# distance). The Levenshtein distance between two sequences is
|
14
|
+
# given by the minimum number of operations needed to transform
|
15
|
+
# one sequence into the other, where an operation is an
|
16
|
+
# insertion, deletion, or substitution of a single element.
|
17
|
+
#
|
18
|
+
# More information about the Levenshtein distance algorithm:
|
19
|
+
# http://en.wikipedia.org/wiki/Levenshtein_distance .
|
20
|
+
|
21
|
+
module Levenshtein

  # Returns the Levenshtein distance scaled to the range 0.0..1.0, i.e. the
  # plain distance divided by the length of the longer sequence.
  #
  # Two empty sequences yield 0.0. When a threshold (a fraction of the
  # longer length) is given and the underlying distance computation gives
  # up, nil is returned instead.

  def self.normalized_distance(s1, s2, threshold=nil)
    shorter, longer = s1.length > s2.length ? [s2, s1] : [s1, s2]

    # The longer one being empty means both are empty.
    return 0.0 if longer.length == 0

    unless threshold
      return self.distance(shorter, longer).to_f/longer.length
    end

    raw = self.distance(shorter, longer, (threshold*longer.length+1).to_i)

    raw ? raw.to_f/longer.length : nil
  end

  # Returns the Levenshtein distance between two sequences.
  #
  # The sequences may be two strings, two arrays, or two other objects.
  # Strings, arrays and arrays of strings are handled by optimized (very
  # fast) C code; all other sequences by generic (fast) C code.
  #
  # Both sequences should respond to :length and :[], and the objects
  # returned by [] should respond to :==.
  #
  # With a threshold, nil is returned once the distance is known to reach it.

  def self.distance(s1, s2, threshold=nil)
    shorter, longer = s1.length > s2.length ? [s2, s1] : [s1, s2]

    # Trivial cases first.

    return 0 if shorter == longer
    return longer.length if shorter.length == 0

    if threshold
      # The length difference alone is a lower bound for the distance.
      return nil if (longer.length-shorter.length) >= threshold

      # Elements found in only one of the sequences give another lower
      # bound. Sequences that can be scanned are split into characters;
      # otherwise sequences supporting :- are used as they are.
      bags =
        if shorter.respond_to?(:scan) and longer.respond_to?(:scan)
          [shorter.scan(/./), longer.scan(/./)]
        elsif shorter.respond_to?(:-) and longer.respond_to?(:-)
          [shorter, longer]
        end

      if bags
        bag1, bag2 = bags

        return nil if (bag1-bag2).length >= threshold
        return nil if (bag2-bag1).length >= threshold
      end
    end

    distance_fast_or_slow(shorter, longer, threshold)
  end

  def self.distance_fast_or_slow(s1, s2, threshold) # :nodoc:
    # Prefer the C implementation when the extension has been loaded.
    return levenshtein_distance_fast(s1, s2, threshold) if respond_to?(:levenshtein_distance_fast)

    levenshtein_distance_slow(s1, s2, threshold) # Implemented in Ruby.
  end

  def self.levenshtein_distance_slow(s1, s2, threshold) # :nodoc:
    previous = (0..s1.length).to_a

    (1..s2.length).each do |y|
      current = [y]

      (1..s1.length).each do |x|
        substitution = previous[x-1]+(s1[x-1]==s2[y-1] ? 0 : 1)

        current[x] = [previous[x]+1, current[x-1]+1, substitution].min
      end

      # Give up as soon as the best value on this row reaches the
      # threshold: the minimum of the next row can never be smaller than
      # the minimum of this one.

      return nil if threshold and current.min >= threshold

      previous = current
    end

    previous[-1]
  end
end
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: levenshtein-extended
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Esdras Mayrink
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-07-04 00:00:00 -03:00
|
14
|
+
default_executable:
|
15
|
+
dependencies: []
|
16
|
+
|
17
|
+
description: The levenshtein module implements fast Damerau-Levenshtein edit distance computation in O(n) memory and O(n^2) time, using a C wrapper.
|
18
|
+
email:
|
19
|
+
- falecom@oesdras.com.br
|
20
|
+
executables: []
|
21
|
+
|
22
|
+
extensions: []
|
23
|
+
|
24
|
+
extra_rdoc_files: []
|
25
|
+
|
26
|
+
files:
|
27
|
+
- .gitignore
|
28
|
+
- Gemfile
|
29
|
+
- README
|
30
|
+
- Rakefile
|
31
|
+
- ext/levenshtein_in_c/extconf.rb
|
32
|
+
- ext/levenshtein_in_c/levenshtein_array.c
|
33
|
+
- ext/levenshtein_in_c/levenshtein_array_of_strings.c
|
34
|
+
- ext/levenshtein_in_c/levenshtein_fast.c
|
35
|
+
- ext/levenshtein_in_c/levenshtein_generic.c
|
36
|
+
- ext/levenshtein_in_c/levenshtein_string.c
|
37
|
+
- levenshtein.gemspec
|
38
|
+
- lib/levenshtein.rb
|
39
|
+
- lib/levenshtein/version.rb
|
40
|
+
- lib/levenshtein_in_c.bundle
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: ""
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: "0"
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: "0"
|
62
|
+
requirements: []
|
63
|
+
|
64
|
+
rubyforge_project: levenshtein
|
65
|
+
rubygems_version: 1.5.2
|
66
|
+
signing_key:
|
67
|
+
specification_version: 3
|
68
|
+
summary: fast string edit distance computation, using the Damerau-Levenshtein algorithm
|
69
|
+
test_files: []
|
70
|
+
|