aspell_edit_dist 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +5 -0
- data/README +1 -0
- data/Rakefile +12 -0
- data/VERSION +1 -0
- data/ext/aspell_edit_dist.cpp +127 -0
- data/ext/aspell_edit_dist.h +9 -0
- data/ext/extconf.rb +5 -0
- data/ext/leditdist.cpp +308 -0
- data/ext/leditdist.hpp +68 -0
- data/ext/weights.hpp +23 -0
- data/lib/aspell_edit_dist_stub.rb +61 -0
- data/test/edit_distance_test.rb +26 -0
- data/test/test_helper.rb +11 -0
- data/test/weights_test.rb +51 -0
- metadata +70 -0
data/README
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
This is a very simple gem whose sole purpose is to expose the limit_edit_distance function from Aspell.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Gem packaging tasks are provided by Jeweler. When Jeweler is not
# installed, the LoadError from the require is rescued and a hint is printed.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    spec.name     = "aspell_edit_dist"
    spec.summary  = "Gem that exposes limit_edit_distance function from Aspell."
    spec.email    = "adam@pohorecki.pl"
    spec.homepage = "http://github.com/psyho/aspell_edit_dist"
    spec.authors  = ["Adam Pohorecki"]
  end
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#include "aspell_edit_dist.h"
|
2
|
+
#include "weights.hpp"
|
3
|
+
#include "leditdist.hpp"
|
4
|
+
|
5
|
+
// Setup helpers defined further down in this file.
void Init_edit_distance_weights();
void Init_limit_edit_distance();

extern "C" {

// Entry point of the extension (C linkage so the symbol name is unmangled).
// The call order matters: Init_edit_distance_weights() assigns mAspell,
// which Init_limit_edit_distance() then uses to attach its method.
void Init_aspell_edit_dist() {
  Init_edit_distance_weights();
  Init_limit_edit_distance();
}

}
|
13
|
+
|
14
|
+
// Unwraps the C++ EditDistanceWeights stored inside a Ruby object.
//
// weights: a Ruby object expected to be an Aspeller::EditDistanceWeights.
// Returns the wrapped pointer; ownership stays with the Ruby object.
// Raises TypeError when `weights` is not an EditDistanceWeights.
//
// Data_Get_Struct on its own only verifies that the object wraps *some* C
// pointer, so without the explicit class check any other wrapped object
// would be reinterpreted as EditDistanceWeights.
static aspeller::EditDistanceWeights* get_weights(VALUE weights) {
  if (!RTEST(rb_obj_is_kind_of(weights, cEditDistanceWeights))) {
    rb_raise(rb_eTypeError,
             "wrong argument type %s (expected Aspeller::EditDistanceWeights)",
             rb_obj_classname(weights));
  }

  aspeller::EditDistanceWeights* result;
  Data_Get_Struct(weights, aspeller::EditDistanceWeights, result);
  return result;
}
|
19
|
+
|
20
|
+
// Free callback handed to Data_Wrap_Struct: destroys the wrapped weights.
// `delete` on a null pointer is a no-op, so no explicit null test is needed.
static void weights_free(aspeller::EditDistanceWeights* obj) {
  delete obj;
}
|
25
|
+
|
26
|
+
// Creates a default-constructed EditDistanceWeights and wraps it in a new
// Ruby object of class cEditDistanceWeights; weights_free releases it.
// `self` is not consulted.
static VALUE weights_init(VALUE self) {
  (void)self;
  aspeller::EditDistanceWeights * fresh = new aspeller::EditDistanceWeights();
  VALUE wrapped = Data_Wrap_Struct(cEditDistanceWeights, 0, weights_free, fresh);
  return wrapped;
}
|
30
|
+
|
31
|
+
// Ruby-visible readers and writers for every field of EditDistanceWeights.
//
// Readers return the field as a Ruby Integer. They use INT2NUM rather than
// INT2FIX because INT2FIX performs no range check: a value accepted by
// NUM2INT in a writer does not always fit in a Fixnum (which is narrower
// than int on 32-bit builds) and would be silently corrupted on the way out.
//
// Writers convert the argument with NUM2INT (which raises for non-numeric or
// out-of-range values), store it, and return the argument unchanged.

static VALUE weights_del1(VALUE self) {
  return INT2NUM(get_weights(self)->del1);
}

static VALUE weights_set_del1(VALUE self, VALUE val) {
  get_weights(self)->del1 = NUM2INT(val);
  return val;
}

static VALUE weights_del2(VALUE self) {
  return INT2NUM(get_weights(self)->del2);
}

static VALUE weights_set_del2(VALUE self, VALUE val) {
  get_weights(self)->del2 = NUM2INT(val);
  return val;
}

static VALUE weights_swap(VALUE self) {
  return INT2NUM(get_weights(self)->swap);
}

static VALUE weights_set_swap(VALUE self, VALUE val) {
  get_weights(self)->swap = NUM2INT(val);
  return val;
}

static VALUE weights_sub(VALUE self) {
  return INT2NUM(get_weights(self)->sub);
}

static VALUE weights_set_sub(VALUE self, VALUE val) {
  get_weights(self)->sub = NUM2INT(val);
  return val;
}

static VALUE weights_similar(VALUE self) {
  return INT2NUM(get_weights(self)->similar);
}

static VALUE weights_set_similar(VALUE self, VALUE val) {
  get_weights(self)->similar = NUM2INT(val);
  return val;
}

static VALUE weights_min(VALUE self) {
  return INT2NUM(get_weights(self)->min);
}

static VALUE weights_set_min(VALUE self, VALUE val) {
  get_weights(self)->min = NUM2INT(val);
  return val;
}

static VALUE weights_max(VALUE self) {
  return INT2NUM(get_weights(self)->max);
}

static VALUE weights_set_max(VALUE self, VALUE val) {
  get_weights(self)->max = NUM2INT(val);
  return val;
}
|
93
|
+
|
94
|
+
typedef VALUE (*rb_method)(...);
|
95
|
+
|
96
|
+
void Init_edit_distance_weights() {
|
97
|
+
mAspell = rb_define_module("Aspeller");
|
98
|
+
|
99
|
+
cEditDistanceWeights = rb_define_class_under(mAspell, "EditDistanceWeights", rb_cObject);
|
100
|
+
|
101
|
+
rb_define_method(cEditDistanceWeights, "initialize", (rb_method)weights_init, 0);
|
102
|
+
rb_define_singleton_method(cEditDistanceWeights, "new", (rb_method)weights_init, 0);
|
103
|
+
|
104
|
+
rb_define_method(cEditDistanceWeights, "del1", (rb_method)weights_del1, 0);
|
105
|
+
rb_define_method(cEditDistanceWeights, "del1=", (rb_method)weights_set_del1, 1);
|
106
|
+
rb_define_method(cEditDistanceWeights, "del2", (rb_method)weights_del2, 0);
|
107
|
+
rb_define_method(cEditDistanceWeights, "del2=", (rb_method)weights_set_del2, 1);
|
108
|
+
rb_define_method(cEditDistanceWeights, "swap", (rb_method)weights_swap, 0);
|
109
|
+
rb_define_method(cEditDistanceWeights, "swap=", (rb_method)weights_set_swap, 1);
|
110
|
+
rb_define_method(cEditDistanceWeights, "sub", (rb_method)weights_sub, 0);
|
111
|
+
rb_define_method(cEditDistanceWeights, "sub=", (rb_method)weights_set_sub, 1);
|
112
|
+
rb_define_method(cEditDistanceWeights, "similar", (rb_method)weights_similar, 0);
|
113
|
+
rb_define_method(cEditDistanceWeights, "similar=",(rb_method)weights_set_similar, 1);
|
114
|
+
rb_define_method(cEditDistanceWeights, "min", (rb_method)weights_min, 0);
|
115
|
+
rb_define_method(cEditDistanceWeights, "min=", (rb_method)weights_set_min, 1);
|
116
|
+
rb_define_method(cEditDistanceWeights, "max", (rb_method)weights_max, 0);
|
117
|
+
rb_define_method(cEditDistanceWeights, "max=", (rb_method)weights_set_max, 1);
|
118
|
+
}
|
119
|
+
|
120
|
+
// Ruby binding for aspeller::limit_edit_distance.
//
// strA, strB: the two strings to compare (converted with to_str if needed).
// limit:      maximum number of edits, converted with NUM2INT.
// weights:    an Aspeller::EditDistanceWeights, unwrapped by get_weights.
// Returns the computed distance as a Ruby Integer.
// Raises ArgumentError when either string contains a NUL byte: the C++
// routine works on NUL-terminated strings and would otherwise silently
// compare only the part before the first NUL.
static VALUE aspell_limit_edit_distance(VALUE self, VALUE strA, VALUE strB, VALUE limit, VALUE weights) {
  // Do the conversions that are not about strings first, so that the raw
  // character pointers are fetched as late as possible.
  const int max_edits = NUM2INT(limit);
  const aspeller::EditDistanceWeights * w = get_weights(weights);

  // StringValueCStr replaces the deprecated STR2CSTR and guarantees a
  // NUL-terminated buffer without embedded NULs.
  const char * a = StringValueCStr(strA);
  const char * b = StringValueCStr(strB);

  const int result = aspeller::limit_edit_distance(a, b, max_edits, *w);
  return INT2NUM(result);
}
|
124
|
+
|
125
|
+
void Init_limit_edit_distance() {
|
126
|
+
rb_define_singleton_method(mAspell, "limit_edit_distance", (rb_method)aspell_limit_edit_distance, 4);
|
127
|
+
}
|
data/ext/extconf.rb
ADDED
data/ext/leditdist.cpp
ADDED
@@ -0,0 +1,308 @@
|
|
1
|
+
|
2
|
+
#include "leditdist.hpp"

#include <vector>
|
3
|
+
|
4
|
+
// The basic algorithm is as follows:
|
5
|
+
//
|
6
|
+
// Let A[n] represent the nth character of string A
|
7
|
+
// A[n..] represent the substring of A starting at n
|
8
|
+
// if n > length of A then it is considered an empty string
|
9
|
+
//
|
10
|
+
// edit_distance(A,B,limit) = ed(A,B,0)
|
11
|
+
// where ed(A,B,d) = d if A & B is empty.
|
12
|
+
// = infinity if d > limit
|
13
|
+
// = ed(A[2..],B[2..], d) if A[1] == B[1]
|
14
|
+
// = min ( ed(A[2..],B[2..], d+1),
|
15
|
+
// ed(A, B[2..], d+1),
|
16
|
+
// ed(A[2..],B, d+1) ) otherwise
|
17
|
+
//
|
18
|
+
// However, the code below:
|
19
|
+
// 1) Also allows for swaps
|
20
|
+
// 2) Allow weights to be attached to each edit
|
21
|
+
// 3) Is not recursive, it uses a loop when it is tail recursion
|
22
|
+
// and a small stack otherwise. The stack will NEVER be larger
|
23
|
+
// than 2 * limit.
|
24
|
+
// 4) Is extremely optimized
|
25
|
+
|
26
|
+
|
27
|
+
// check_rest(a, b, s): walks strings a and b in parallel while their
// characters agree. If both end at the same time, the remainders are equal
// and `s` becomes the new `min` when it is smaller than the current one.
//
// Uses the caller's locals `a0`, `b0` (scratch cursors) and `min`.
// The expansion is wrapped in do { } while (0) so that it behaves as a single
// statement (safe in an unbraced if/else), and every argument is
// parenthesized so compound expressions expand as one operand.
#define check_rest(a,b,s)              \
  do {                                 \
    a0 = (a); b0 = (b);                \
    while (*a0 == *b0) {               \
      if (*a0 == '\0') {               \
        if ((s) < min) min = (s);      \
        break;                         \
      }                                \
      ++a0; ++b0;                      \
    }                                  \
  } while (0)
|
36
|
+
|
37
|
+
namespace aspeller {
|
38
|
+
|
39
|
+
int limit_edit_distance(const char * a, const char * b,
|
40
|
+
int limit, const EditDistanceWeights & w)
|
41
|
+
{
|
42
|
+
limit = limit*w.max;
|
43
|
+
static const int size = 10;
|
44
|
+
struct Edit {
|
45
|
+
const char * a;
|
46
|
+
const char * b;
|
47
|
+
int score;
|
48
|
+
};
|
49
|
+
Edit begin[size];
|
50
|
+
Edit * i = begin;
|
51
|
+
const char * a0;
|
52
|
+
const char * b0;
|
53
|
+
int score = 0;
|
54
|
+
int min = LARGE_NUM;
|
55
|
+
|
56
|
+
while (true) {
|
57
|
+
|
58
|
+
while (*a == *b) {
|
59
|
+
if (*a == '\0') {
|
60
|
+
if (score < min) min = score;
|
61
|
+
goto FINISH;
|
62
|
+
}
|
63
|
+
++a; ++b;
|
64
|
+
}
|
65
|
+
|
66
|
+
if (*a == '\0') {
|
67
|
+
|
68
|
+
do {
|
69
|
+
score += w.del2;
|
70
|
+
if (score >= min) goto FINISH;
|
71
|
+
++b;
|
72
|
+
} while (*b != '\0');
|
73
|
+
min = score;
|
74
|
+
|
75
|
+
} else if (*b == '\0') {
|
76
|
+
|
77
|
+
do {
|
78
|
+
score += w.del1;
|
79
|
+
if (score >= min) goto FINISH;
|
80
|
+
++a;
|
81
|
+
} while (*a != '\0');
|
82
|
+
min = score;
|
83
|
+
|
84
|
+
} else {
|
85
|
+
|
86
|
+
if (score + w.max <= limit) {
|
87
|
+
if (limit*w.min <= w.max*(w.min+score)) {
|
88
|
+
// if floor(score/max)=limit/max-1 then this edit is only good
|
89
|
+
// if it makes the rest of the string match. So check if
|
90
|
+
// the rest of the string matches to avoid the overhead of
|
91
|
+
// pushing it on then off the stack
|
92
|
+
|
93
|
+
// delete a character from a
|
94
|
+
check_rest(a+1,b,score + w.del1);
|
95
|
+
|
96
|
+
// delete a character from b
|
97
|
+
check_rest(a,b+1,score + w.del2);
|
98
|
+
|
99
|
+
if (*a == *(b+1) && *b == *(a+1)) {
|
100
|
+
|
101
|
+
// swap two characters
|
102
|
+
check_rest(a+2,b+2, score + w.swap);
|
103
|
+
|
104
|
+
} else {
|
105
|
+
|
106
|
+
// substitute one character for another which is the same
|
107
|
+
// thing as deleting a character from both a & b
|
108
|
+
check_rest(a+1,b+1, score + w.sub);
|
109
|
+
|
110
|
+
}
|
111
|
+
|
112
|
+
} else {
|
113
|
+
|
114
|
+
// delete a character from a
|
115
|
+
i->a = a + 1;
|
116
|
+
i->b = b;
|
117
|
+
i->score = score + w.del1;
|
118
|
+
++i;
|
119
|
+
|
120
|
+
// delete a character from b
|
121
|
+
i->a = a;
|
122
|
+
i->b = b + 1;
|
123
|
+
i->score = score + w.del2;
|
124
|
+
++i;
|
125
|
+
|
126
|
+
// If two characters can be swapped and make a match
|
127
|
+
// then the substitution is pointless.
|
128
|
+
// Also, there is no need to push this on the stack as
|
129
|
+
// it is going to be imminently removed.
|
130
|
+
if (*a == *(b+1) && *b == *(a+1)) {
|
131
|
+
|
132
|
+
// swap two characters
|
133
|
+
a = a + 2;
|
134
|
+
b = b + 2;
|
135
|
+
score += w.swap;
|
136
|
+
continue;
|
137
|
+
|
138
|
+
} else {
|
139
|
+
|
140
|
+
// substitute one character for another which is the same
|
141
|
+
// thing as deleting a character from both a & b
|
142
|
+
a = a + 1;
|
143
|
+
b = b + 1;
|
144
|
+
score += w.sub;
|
145
|
+
continue;
|
146
|
+
|
147
|
+
}
|
148
|
+
}
|
149
|
+
}
|
150
|
+
}
|
151
|
+
FINISH:
|
152
|
+
if (i == begin) return min;
|
153
|
+
--i;
|
154
|
+
a = i->a;
|
155
|
+
b = i->b;
|
156
|
+
score = i->score;
|
157
|
+
}
|
158
|
+
}
|
159
|
+
|
160
|
+
#undef check_rest

// check_rest(a, b, w): walks a and b in parallel while their characters
// agree; if both end together, `w` becomes the new `min` when smaller.
// Afterwards `amax` is advanced to the furthest position reached in a.
// Uses the caller's locals `a0`, `b0`, `min` and `amax`.
//
// Both macros below are wrapped in do { } while (0) so they act as a single
// statement, and their arguments are parenthesized.
#define check_rest(a,b,w)                \
  do {                                   \
    a0 = (a); b0 = (b);                  \
    while (*a0 == *b0) {                 \
      if (*a0 == '\0') {                 \
        if ((w) < min) min = (w);        \
        break;                           \
      }                                  \
      ++a0;                              \
      ++b0;                              \
    }                                    \
    if (amax < a0) amax = a0;            \
  } while (0)

// check2(a, b, w): like check_rest, but when the strings stop agreeing it
// tries one more edit on top of `w`: a single trailing deletion if one side
// has ended, otherwise each of delete-from-a, delete-from-b and
// swap-or-substitute followed by check_rest.
// Uses the caller's locals `aa`, `bb`, `a0`, `b0`, `min`, `amax` and `ws`.
#define check2(a,b,w)                                                         \
  do {                                                                        \
    aa = (a); bb = (b);                                                       \
    while (*aa == *bb) {                                                      \
      if (*aa == '\0') {                                                      \
        if (amax < aa) amax = aa;                                             \
        if ((w) < min) min = (w);                                             \
        break;                                                                \
      }                                                                       \
      ++aa; ++bb;                                                             \
    }                                                                         \
    if (*aa == '\0') {                                                        \
      if (amax < aa) amax = aa;                                               \
      if (*bb == '\0') {}                                                     \
      else if (*(bb+1) == '\0' && (w)+ws.del2 < min) min = (w)+ws.del2;       \
    } else if (*bb == '\0') {                                                 \
      ++aa;                                                                   \
      if (amax < aa) amax = aa;                                               \
      if (*aa == '\0' && (w)+ws.del1 < min) min = (w)+ws.del1;                \
    } else {                                                                  \
      check_rest(aa+1,bb,(w)+ws.del1);                                        \
      check_rest(aa,bb+1,(w)+ws.del2);                                        \
      if (*aa == *(bb+1) && *bb == *(aa+1)) {                                 \
        check_rest(aa+2,bb+2,(w)+ws.swap);                                    \
      } else {                                                                \
        check_rest(aa+1,bb+1,(w)+ws.sub);                                     \
      }                                                                       \
    }                                                                         \
  } while (0)
|
200
|
+
|
201
|
+
// Edit distance between a and b allowing at most one edit.
//
// Returns an EditDist whose score is 0 for equal strings, the cost of the
// single edit that makes them equal, or LARGE_NUM when one edit is not
// enough. The second field is a position inside a: the current cursor on the
// early-return paths, otherwise the furthest point reached by check_rest.
EditDist limit1_edit_distance(const char * a, const char * b,
                              const EditDistanceWeights & ws)
{
  int min = LARGE_NUM;
  const char * a0;   // scratch cursors used by check_rest
  const char * b0;
  const char * amax = a;

  // Skip the common prefix; equal strings need no edit at all.
  for (; *a == *b; ++a, ++b) {
    if (*a == '\0')
      return EditDist(0, a);
  }

  if (*a == '\0') {
    // a ended first: fine only if b has exactly one character left.
    const bool one_left = (b[1] == '\0');
    return EditDist(one_left ? ws.del2 : LARGE_NUM, a);
  }

  if (*b == '\0') {
    // b ended first: fine only if a has exactly one character left.
    ++a;
    return EditDist(*a == '\0' ? ws.del1 : LARGE_NUM, a);
  }

  // delete a character from a
  check_rest(a+1,b,ws.del1);

  // delete a character from b
  check_rest(a,b+1,ws.del2);

  if (a[0] == b[1] && b[0] == a[1]) {
    // swap two characters
    check_rest(a+2,b+2,ws.swap);
  } else {
    // substitute one character for another which is the same
    // thing as deleting a character from both a & b
    check_rest(a+1,b+1,ws.sub);
  }

  return EditDist(min, amax);
}
|
250
|
+
|
251
|
+
// Edit distance between a and b allowing at most two edits.
//
// Returns an EditDist whose score is 0 for equal strings, the cheapest cost
// found using up to two edits, or LARGE_NUM when two edits are not enough.
// The second field is a position inside a: the current cursor on the
// early-return paths, otherwise the furthest point reached by check2.
EditDist limit2_edit_distance(const char * a, const char * b,
                              const EditDistanceWeights & ws)
{
  int min = LARGE_NUM;
  const char * a0;   // scratch cursors used by check_rest (inside check2)
  const char * b0;
  const char * aa;   // scratch cursors used by check2
  const char * bb;
  const char * amax = a;

  // Skip the common prefix; equal strings need no edit at all.
  for (; *a == *b; ++a, ++b) {
    if (*a == '\0')
      return EditDist(0, a);
  }

  if (*a == '\0') {
    // a ended first: b may have one or two characters left.
    if (b[1] == '\0') return EditDist(ws.del2, a);
    if (b[2] == '\0') return EditDist(2*ws.del2, a);
    return EditDist(LARGE_NUM, a);
  }

  if (*b == '\0') {
    // b ended first: a may have one or two characters left.
    ++a;
    if (*a == '\0') return EditDist(ws.del1, a);
    ++a;
    if (*a == '\0') return EditDist(2*ws.del1, a);
    return EditDist(LARGE_NUM, a);
  }

  // delete a character from a
  check2(a+1,b,ws.del1);

  // delete a character from b
  check2(a,b+1,ws.del2);

  if (a[0] == b[1] && b[0] == a[1]) {
    // swap two characters
    check2(a+2,b+2,ws.swap);
  } else {
    // substitute one character for another which is the same
    // thing as deleting a character from both a & b
    check2(a+1,b+1,ws.sub);
  }

  return EditDist(min, amax);
}
|
306
|
+
}
|
307
|
+
|
308
|
+
|
data/ext/leditdist.hpp
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
|
2
|
+
#ifndef __aspeller_leditdist_hh__
|
3
|
+
#define __aspeller_leditdist_hh__
|
4
|
+
|
5
|
+
#include "weights.hpp"
|
6
|
+
|
7
|
+
namespace aspeller {
|
8
|
+
|
9
|
+
// limit_edit_distance finds the shortest edit distance but will
|
10
|
+
// stop and return a number at least as large as LARGE_NUM if it has
|
11
|
+
// to do more edits than a set limit.
|
12
|
+
// Note that this does NOT mean that the score returned is <= limit*w.max
|
13
|
+
// as "sub" vs "submarine" will return 6*(cost of insertion) no matter what
|
14
|
+
// the limit is.
|
15
|
+
// The edit distance is
|
16
|
+
// (cost of swap)(# of swaps) + (cost of deletion)(# of deletions)
|
17
|
+
// + (cost of insertion)(# of insertions)
|
18
|
+
// + (cost of substitutions)(# of substitutions)
|
19
|
+
|
20
|
+
// Preconditions:
|
21
|
+
// max(strlen(a), strlen(b))*max(of the edit weights) <= 2^15
|
22
|
+
// if violated then an incorrect result may be returned (which may be negative)
|
23
|
+
// due to overflow of a short integer
|
24
|
+
// (limit+1)*w.min < limit*w.max
|
25
|
+
// limit <= 5 (use edit_distance if limit > 5)
|
26
|
+
// where w.min and w.max is the minimum and maximum cost of an edit
|
27
|
+
// respectively.
|
28
|
+
|
29
|
+
// The running time is asymptotically bounded above by
|
30
|
+
// (3^l)*n where l is the limit and n is the maximum of strlen(a),strlen(b)
|
31
|
+
// Based on my informal tests, however, the n does not really matter
|
32
|
+
// and the running time is more like (3^l).
|
33
|
+
|
34
|
+
// limit_edit_distance, based on my informal tests, turns out to be
|
35
|
+
// faster than edit_dist for l < 5. For l == 5 it is about the
|
36
|
+
// smaller for short strings (<= 5) and less than for longer strings
|
37
|
+
|
38
|
+
// limit2_edit_distance(a,b,w) = limit_edit_distance(a,b,2,w)
|
39
|
+
// but is roughly 2/3's faster
|
40
|
+
|
41
|
+
// Result of a limited edit-distance computation.
// Converts implicitly to int, yielding the score.
struct EditDist {
  int score;                // the computed distance
  const char * stopped_at;  // position in the first string passed back by
                            // the limitN_edit_distance functions
  // Default state is a zero score and a null position, so a
  // default-constructed value never exposes indeterminate members.
  EditDist()
    : score(0), stopped_at(0) {}
  EditDist(int s, const char * p)
    : score(s), stopped_at(p) {}
  operator int () const {return score;}
};
|
49
|
+
|
50
|
+
static const int LARGE_NUM = 0xFFFFF;
|
51
|
+
// this needs to be SMALLER than INT_MAX since it may be incremented
|
52
|
+
// a few times
|
53
|
+
|
54
|
+
int limit_edit_distance(const char * a, const char * b, int limit,
|
55
|
+
const EditDistanceWeights & w
|
56
|
+
= EditDistanceWeights());
|
57
|
+
|
58
|
+
EditDist limit1_edit_distance(const char * a, const char * b,
|
59
|
+
const EditDistanceWeights & w
|
60
|
+
= EditDistanceWeights());
|
61
|
+
|
62
|
+
EditDist limit2_edit_distance(const char * a, const char * b,
|
63
|
+
const EditDistanceWeights & w
|
64
|
+
= EditDistanceWeights());
|
65
|
+
|
66
|
+
}
|
67
|
+
|
68
|
+
#endif
|
data/ext/weights.hpp
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
#ifndef __aspeller_weights_hh__
|
3
|
+
#define __aspeller_weights_hh__
|
4
|
+
|
5
|
+
namespace aspeller {
|
6
|
+
|
7
|
+
// Costs charged for each kind of edit when computing an edit distance.
struct EditDistanceWeights {
  // cost of deleting a char in the first string
  int del1;
  // cost of inserting a character, i.e. deleting a char in the second string
  int del2;
  // cost of swapping two adjacent letters
  int swap;
  // cost of replacing one letter with another
  int sub;
  // cost of a "similar" but not exact match for two characters
  int similar;
  // the min of del1, del2, swap and sub (stored, not computed here)
  int min;
  // the max of del1, del2, swap and sub (stored, not computed here)
  int max;

  // Every edit costs 1 by default; a "similar" match costs 0.
  EditDistanceWeights()
    : del1(1),
      del2(1),
      swap(1),
      sub(1),
      similar(0),
      min(1),
      max(1)
  {}
};
|
20
|
+
|
21
|
+
}
|
22
|
+
|
23
|
+
#endif
|
@@ -0,0 +1,61 @@
|
|
1
|
+
raise "This file should never be required. It's here only for documentation purposes."
|
2
|
+
|
3
|
+
# module, through which the functionality of edit distance calculation is possible
|
4
|
+
module Aspeller

  # Weights used by Aspell to determine the edit distance between two strings.
  class EditDistanceWeights

    # Cost of deleting a char in the first string. Defaults to 1.
    attr_accessor :del1

    # Cost of inserting a character, i.e. deleting a char in the second
    # string. Defaults to 1.
    attr_accessor :del2

    # Cost of swapping two adjacent letters. Defaults to 1.
    attr_accessor :swap

    # Cost of replacing one letter with another. Defaults to 1.
    attr_accessor :sub

    # Cost of a "similar" but not exact match for two characters.
    # Defaults to 0.
    attr_accessor :similar

    # The min of del1, del2, swap and sub. Defaults to 1.
    attr_accessor :min

    # The max of del1, del2, swap and sub. Defaults to 1.
    attr_accessor :max

  end

  # limit_edit_distance finds the shortest edit distance but will
  # stop and return a number at least as large as LARGE_NUM if it has
  # to do more edits than a set limit.
  # Note that this does NOT mean that the score returned is <= limit*w.max
  # as "sub" vs "submarine" will return 6*(cost of insertion) no matter what
  # the limit is.
  # The edit distance is
  # (cost of swap)(# of swaps) + (cost of deletion)(# of deletions)
  # + (cost of insertion)(# of insertions)
  # + (cost of substitutions)(# of substitutions)
  #
  # Preconditions:
  # max(strlen(a), strlen(b))*max(of the edit weights) <= 2^15
  # if violated then an incorrect result may be returned (which may be
  # negative) due to overflow of a short integer
  # (limit+1)*w.min < limit*w.max
  # limit <= 5 (use edit_distance if limit > 5)
  # where w.min and w.max are the minimum and maximum cost of an edit
  # respectively.
  #
  # The running time is asymptotically bounded above by
  # (3^l)*n where l is the limit and n is the maximum of strlen(a),strlen(b)
  # Based on my informal tests, however, the n does not really matter
  # and the running time is more like (3^l).
  #
  # limit_edit_distance, based on my informal tests, turns out to be
  # faster than edit_dist for l < 5. For l == 5 it is about the
  # smaller for short strings (<= 5) and less than for longer strings
  def self.limit_edit_distance(strA, strB, limit, weights)
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require File.expand_path( File.dirname(__FILE__) + '/test_helper.rb' )
|
2
|
+
|
3
|
+
# Exercises Aspeller.limit_edit_distance as exposed by the native extension.
class EditDistanceTest < Test::Unit::TestCase

  # The failure messages name the module that is actually checked (Aspeller).
  def test_limit_distance_defined
    assert defined?(Aspeller), "module Aspeller must be defined"
    assert defined?(Aspeller.limit_edit_distance), "method Aspeller.limit_edit_distance must be defined"
  end

  # Checks against Integer: Fixnum is a subclass of Integer on old Rubies and
  # no longer exists on current ones, so Integer is correct everywhere.
  def test_limit_distance_returns_fixnum
    result = Aspeller.limit_edit_distance("a", "b", 1, Aspeller::EditDistanceWeights.new)
    assert result.is_a?(Integer), "limit_edit_distance is supposed to return an integer"
  end

  # "test" -> "tast" is one substitution at the default cost of 1.
  def test_limit_distance_returns_right_value
    result = Aspeller.limit_edit_distance("test", "tast", 1, Aspeller::EditDistanceWeights.new)
    assert_equal 1, result
  end

  # Raising the substitution cost must be reflected in the result.
  def test_limit_distance_uses_passed_weights
    weights = Aspeller::EditDistanceWeights.new
    weights.sub = 2
    result = Aspeller.limit_edit_distance("test", "tast", 1, weights)
    assert_equal 2, result
  end

end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# Shared setup for the test suite: rebuilds the native extension from
# scratch in the project root and then loads it.
$LOAD_PATH << File.expand_path( File.dirname(__FILE__) + '/../ext' )
$LOAD_PATH << File.expand_path( File.dirname(__FILE__) + '/..' )

require 'test/unit'

project_root = File.expand_path(File.join(File.dirname(__FILE__), '..'))

# Commands are run via Dir.chdir with argument lists, not an interpolated
# shell string, so a project path containing spaces or shell metacharacters
# cannot break or alter them.
Dir.chdir(project_root) do
  # Best effort: fails harmlessly when no Makefile exists yet.
  system("make", "distclean")

  # A failed build must stop the run here; otherwise the require below could
  # fail obscurely or pick up a different, previously installed copy.
  system("ruby", "ext/extconf.rb") or raise "ext/extconf.rb failed in #{project_root}"
  system("make") or raise "make failed in #{project_root}"
end

require "aspell_edit_dist"
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require File.expand_path( File.dirname(__FILE__) + '/test_helper.rb' )
|
2
|
+
|
3
|
+
# Exercises the Aspeller::EditDistanceWeights wrapper class.
class WeightsTest < Test::Unit::TestCase

  # Every weight field exposed by the wrapper.
  WEIGHT_METHODS = [:del1, :del2, :swap, :sub, :similar, :min, :max]

  def test_weights_defined
    assert defined?(Aspeller), "Aspeller module should be defined"
    assert defined?(Aspeller::EditDistanceWeights), "class Aspeller::EditDistanceWeights is supposed to be defined"
  end

  def test_weights_has_constructor
    assert_respond_to Aspeller::EditDistanceWeights, :new
  end

  # Each field has a reader that returns a non-nil value.
  def test_weights_getters
    subject = Aspeller::EditDistanceWeights.new
    WEIGHT_METHODS.each do |method_name|
      assert_respond_to subject, method_name
      assert subject.send(method_name), "weights.#{method_name} should return something"
    end
  end

  # All costs default to 1, except "similar" which defaults to 0.
  def test_weights_default_values
    subject = Aspeller::EditDistanceWeights.new
    defaults = [
      [:del1, 1], [:del2, 1], [:swap, 1], [:sub, 1],
      [:similar, 0], [:min, 1], [:max, 1]
    ]
    defaults.each do |reader, expected|
      assert_equal expected, subject.send(reader)
    end
  end

  # Each field has a writer, and a distinct value (123, 124, ...) written
  # through it is read back unchanged.
  def test_weights_setters
    subject = Aspeller::EditDistanceWeights.new

    WEIGHT_METHODS.each_with_index do |reader, offset|
      value  = 123 + offset
      writer = :"#{reader}="

      assert_respond_to subject, writer

      subject.send(writer, value)
      assert_equal value, subject.send(reader)
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: aspell_edit_dist
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Adam Pohorecki
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-29 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description:
|
17
|
+
email: adam@pohorecki.pl
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/extconf.rb
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README
|
24
|
+
files:
|
25
|
+
- .gitignore
|
26
|
+
- README
|
27
|
+
- Rakefile
|
28
|
+
- VERSION
|
29
|
+
- ext/aspell_edit_dist.cpp
|
30
|
+
- ext/aspell_edit_dist.h
|
31
|
+
- ext/extconf.rb
|
32
|
+
- ext/leditdist.cpp
|
33
|
+
- ext/leditdist.hpp
|
34
|
+
- ext/weights.hpp
|
35
|
+
- lib/aspell_edit_dist_stub.rb
|
36
|
+
- test/edit_distance_test.rb
|
37
|
+
- test/test_helper.rb
|
38
|
+
- test/weights_test.rb
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: http://github.com/psyho/aspell_edit_dist
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options:
|
45
|
+
- --charset=UTF-8
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: "0"
|
53
|
+
version:
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
60
|
+
requirements: []
|
61
|
+
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 1.3.5
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Gem that exposes limit_edit_distance function from Aspell.
|
67
|
+
test_files:
|
68
|
+
- test/edit_distance_test.rb
|
69
|
+
- test/weights_test.rb
|
70
|
+
- test/test_helper.rb
|