aspell_edit_dist 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/README +1 -0
- data/Rakefile +12 -0
- data/VERSION +1 -0
- data/ext/aspell_edit_dist.cpp +127 -0
- data/ext/aspell_edit_dist.h +9 -0
- data/ext/extconf.rb +5 -0
- data/ext/leditdist.cpp +308 -0
- data/ext/leditdist.hpp +68 -0
- data/ext/weights.hpp +23 -0
- data/lib/aspell_edit_dist_stub.rb +61 -0
- data/test/edit_distance_test.rb +26 -0
- data/test/test_helper.rb +11 -0
- data/test/weights_test.rb +51 -0
- metadata +70 -0
data/README
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
This is a very simple gem whose purpose is to expose the limit_edit_distance function from Aspell.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# Packaging tasks come from Jeweler. When the library cannot be loaded the
# Rakefile only prints an installation hint instead of failing.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |spec|
    spec.name     = "aspell_edit_dist"
    spec.summary  = "Gem that exposes limit_edit_distance function from Aspell."
    spec.email    = "adam@pohorecki.pl"
    spec.homepage = "http://github.com/psyho/aspell_edit_dist"
    spec.authors  = ["Adam Pohorecki"]
  end
rescue LoadError
  puts "Jeweler not available. Install it with: gem install jeweler"
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#include "aspell_edit_dist.h"
|
2
|
+
#include "weights.hpp"
|
3
|
+
#include "leditdist.hpp"
|
4
|
+
|
5
|
+
// Forward declarations
|
6
|
+
void Init_edit_distance_weights();
|
7
|
+
void Init_limit_edit_distance();
|
8
|
+
|
9
|
+
// Entry point of the extension (C linkage). It runs the two initializers in a
// fixed order: the weights initializer assigns mAspell, which the
// limit_edit_distance initializer uses afterwards.
extern "C" void Init_aspell_edit_dist()
{
  Init_edit_distance_weights();
  Init_limit_edit_distance();
}
|
13
|
+
|
14
|
+
// Unwraps the C++ EditDistanceWeights stored inside a Ruby object.
//
// weights - Ruby object expected to be an Aspeller::EditDistanceWeights
//           (or an instance of a subclass).
//
// Returns the wrapped pointer. Raises TypeError for any other object, so a
// wrapped object of an unrelated class is never reinterpreted as weights.
static aspeller::EditDistanceWeights* get_weights(VALUE weights) {
  if (!RTEST(rb_obj_is_kind_of(weights, cEditDistanceWeights))) {
    rb_raise(rb_eTypeError,
             "wrong argument type %s (expected Aspeller::EditDistanceWeights)",
             rb_obj_classname(weights));
  }

  aspeller::EditDistanceWeights* result;
  Data_Get_Struct(weights, aspeller::EditDistanceWeights, result);
  return result;
}
|
19
|
+
|
20
|
+
// Release callback handed to Data_Wrap_Struct: destroys the wrapped weights.
// `delete` applied to a null pointer does nothing, so no guard is needed.
static void weights_free(aspeller::EditDistanceWeights* obj)
{
  delete obj;
}
|
25
|
+
|
26
|
+
// Creates a Ruby object wrapping a default-constructed
// aspeller::EditDistanceWeights. `self` is not consulted: the result is
// always wrapped as cEditDistanceWeights, has no mark function, and is
// released through weights_free.
static VALUE weights_init(VALUE self)
{
  aspeller::EditDistanceWeights * fresh = new aspeller::EditDistanceWeights();
  VALUE wrapped = Data_Wrap_Struct(cEditDistanceWeights, 0, weights_free, fresh);
  return wrapped;
}
|
30
|
+
|
31
|
+
// Reader for EditDistanceWeights#del1.
// The field is a full C int (written through NUM2INT), so it is converted
// with INT2NUM; INT2FIX does no range check and would return a wrong number
// for values that do not fit in a Fixnum.
static VALUE weights_del1(VALUE self) {
  return INT2NUM(get_weights(self)->del1);
}
|
34
|
+
|
35
|
+
// Writer for EditDistanceWeights#del1=. Converts the Ruby number with
// NUM2INT, stores it in the wrapped struct and returns the argument.
static VALUE weights_set_del1(VALUE self, VALUE val)
{
  aspeller::EditDistanceWeights * target = get_weights(self);
  target->del1 = NUM2INT(val);
  return val;
}
|
39
|
+
|
40
|
+
// Reader for EditDistanceWeights#del2.
// The field is a full C int (written through NUM2INT), so it is converted
// with INT2NUM; INT2FIX does no range check and would return a wrong number
// for values that do not fit in a Fixnum.
static VALUE weights_del2(VALUE self) {
  return INT2NUM(get_weights(self)->del2);
}
|
43
|
+
|
44
|
+
// Writer for EditDistanceWeights#del2=. Converts the Ruby number with
// NUM2INT, stores it in the wrapped struct and returns the argument.
static VALUE weights_set_del2(VALUE self, VALUE val)
{
  aspeller::EditDistanceWeights * target = get_weights(self);
  target->del2 = NUM2INT(val);
  return val;
}
|
48
|
+
|
49
|
+
// Reader for EditDistanceWeights#swap.
// The field is a full C int (written through NUM2INT), so it is converted
// with INT2NUM; INT2FIX does no range check and would return a wrong number
// for values that do not fit in a Fixnum.
static VALUE weights_swap(VALUE self) {
  return INT2NUM(get_weights(self)->swap);
}
|
52
|
+
|
53
|
+
// Writer for EditDistanceWeights#swap=. Converts the Ruby number with
// NUM2INT, stores it in the wrapped struct and returns the argument.
static VALUE weights_set_swap(VALUE self, VALUE val)
{
  aspeller::EditDistanceWeights * target = get_weights(self);
  target->swap = NUM2INT(val);
  return val;
}
|
57
|
+
|
58
|
+
// Reader for EditDistanceWeights#sub.
// The field is a full C int (written through NUM2INT), so it is converted
// with INT2NUM; INT2FIX does no range check and would return a wrong number
// for values that do not fit in a Fixnum.
static VALUE weights_sub(VALUE self) {
  return INT2NUM(get_weights(self)->sub);
}
|
61
|
+
|
62
|
+
// Writer for EditDistanceWeights#sub=. Converts the Ruby number with
// NUM2INT, stores it in the wrapped struct and returns the argument.
static VALUE weights_set_sub(VALUE self, VALUE val)
{
  aspeller::EditDistanceWeights * target = get_weights(self);
  target->sub = NUM2INT(val);
  return val;
}
|
66
|
+
|
67
|
+
// Reader for EditDistanceWeights#similar.
// The field is a full C int (written through NUM2INT), so it is converted
// with INT2NUM; INT2FIX does no range check and would return a wrong number
// for values that do not fit in a Fixnum.
static VALUE weights_similar(VALUE self) {
  return INT2NUM(get_weights(self)->similar);
}
|
70
|
+
|
71
|
+
// Writer for EditDistanceWeights#similar=. Converts the Ruby number with
// NUM2INT, stores it in the wrapped struct and returns the argument.
static VALUE weights_set_similar(VALUE self, VALUE val)
{
  aspeller::EditDistanceWeights * target = get_weights(self);
  target->similar = NUM2INT(val);
  return val;
}
|
75
|
+
|
76
|
+
// Reader for EditDistanceWeights#min.
// The field is a full C int (written through NUM2INT), so it is converted
// with INT2NUM; INT2FIX does no range check and would return a wrong number
// for values that do not fit in a Fixnum.
static VALUE weights_min(VALUE self) {
  return INT2NUM(get_weights(self)->min);
}
|
79
|
+
|
80
|
+
// Writer for EditDistanceWeights#min=. Converts the Ruby number with
// NUM2INT, stores it in the wrapped struct and returns the argument.
static VALUE weights_set_min(VALUE self, VALUE val)
{
  aspeller::EditDistanceWeights * target = get_weights(self);
  target->min = NUM2INT(val);
  return val;
}
|
84
|
+
|
85
|
+
// Reader for EditDistanceWeights#max.
// The field is a full C int (written through NUM2INT), so it is converted
// with INT2NUM; INT2FIX does no range check and would return a wrong number
// for values that do not fit in a Fixnum.
static VALUE weights_max(VALUE self) {
  return INT2NUM(get_weights(self)->max);
}
|
88
|
+
|
89
|
+
// Writer for EditDistanceWeights#max=. Converts the Ruby number with
// NUM2INT, stores it in the wrapped struct and returns the argument.
static VALUE weights_set_max(VALUE self, VALUE val)
{
  aspeller::EditDistanceWeights * target = get_weights(self);
  target->max = NUM2INT(val);
  return val;
}
|
93
|
+
|
94
|
+
typedef VALUE (*rb_method)(...);
|
95
|
+
|
96
|
+
void Init_edit_distance_weights() {
|
97
|
+
mAspell = rb_define_module("Aspeller");
|
98
|
+
|
99
|
+
cEditDistanceWeights = rb_define_class_under(mAspell, "EditDistanceWeights", rb_cObject);
|
100
|
+
|
101
|
+
rb_define_method(cEditDistanceWeights, "initialize", (rb_method)weights_init, 0);
|
102
|
+
rb_define_singleton_method(cEditDistanceWeights, "new", (rb_method)weights_init, 0);
|
103
|
+
|
104
|
+
rb_define_method(cEditDistanceWeights, "del1", (rb_method)weights_del1, 0);
|
105
|
+
rb_define_method(cEditDistanceWeights, "del1=", (rb_method)weights_set_del1, 1);
|
106
|
+
rb_define_method(cEditDistanceWeights, "del2", (rb_method)weights_del2, 0);
|
107
|
+
rb_define_method(cEditDistanceWeights, "del2=", (rb_method)weights_set_del2, 1);
|
108
|
+
rb_define_method(cEditDistanceWeights, "swap", (rb_method)weights_swap, 0);
|
109
|
+
rb_define_method(cEditDistanceWeights, "swap=", (rb_method)weights_set_swap, 1);
|
110
|
+
rb_define_method(cEditDistanceWeights, "sub", (rb_method)weights_sub, 0);
|
111
|
+
rb_define_method(cEditDistanceWeights, "sub=", (rb_method)weights_set_sub, 1);
|
112
|
+
rb_define_method(cEditDistanceWeights, "similar", (rb_method)weights_similar, 0);
|
113
|
+
rb_define_method(cEditDistanceWeights, "similar=",(rb_method)weights_set_similar, 1);
|
114
|
+
rb_define_method(cEditDistanceWeights, "min", (rb_method)weights_min, 0);
|
115
|
+
rb_define_method(cEditDistanceWeights, "min=", (rb_method)weights_set_min, 1);
|
116
|
+
rb_define_method(cEditDistanceWeights, "max", (rb_method)weights_max, 0);
|
117
|
+
rb_define_method(cEditDistanceWeights, "max=", (rb_method)weights_set_max, 1);
|
118
|
+
}
|
119
|
+
|
120
|
+
// Ruby binding for Aspeller.limit_edit_distance(strA, strB, limit, weights).
//
// strA, strB - Ruby strings to compare.
// limit      - maximum number of edits, converted with NUM2INT.
// weights    - object unwrapped by get_weights to obtain the edit costs.
//
// Returns the score computed by aspeller::limit_edit_distance as a Ruby
// integer. StringValueCStr raises when an argument cannot be used as a
// NUL-terminated C string.
static VALUE aspell_limit_edit_distance(VALUE self, VALUE strA, VALUE strB, VALUE limit, VALUE weights) {
  // Run every conversion that may call back into Ruby before taking raw
  // character pointers, so the pointers are not invalidated afterwards.
  int max_edits = NUM2INT(limit);
  const aspeller::EditDistanceWeights & costs = *get_weights(weights);
  StringValue(strA);
  StringValue(strB);

  // StringValueCStr replaces the obsolete STR2CSTR macro, which is no longer
  // available in Ruby 1.9 and later.
  const char * a = StringValueCStr(strA);
  const char * b = StringValueCStr(strB);

  int result = aspeller::limit_edit_distance(a, b, max_edits, costs);

  // INT2NUM instead of INT2FIX: the score is a plain C int and INT2FIX does
  // not check that it fits in a Fixnum.
  return INT2NUM(result);
}
|
124
|
+
|
125
|
+
// Registers Aspeller.limit_edit_distance as a singleton method on mAspell
// taking four arguments. mAspell must already have been assigned.
void Init_limit_edit_distance()
{
  rb_define_singleton_method(mAspell,
                             "limit_edit_distance",
                             (rb_method)aspell_limit_edit_distance,
                             4);
}
|
data/ext/extconf.rb
ADDED
data/ext/leditdist.cpp
ADDED
@@ -0,0 +1,308 @@
|
|
1
|
+
|
2
|
+
#include "leditdist.hpp"
|
3
|
+
|
4
|
+
// The basic algorithm is as follows:
|
5
|
+
//
|
6
|
+
// Let A[n] represent the nth character of string A
|
7
|
+
// A[n..] represent the substring of A starting at n
|
8
|
+
// if n > length of A then it is considered an empty string
|
9
|
+
//
|
10
|
+
// edit_distance(A,B,limit) = ed(A,B,0)
|
11
|
+
// where ed(A,B,d) = d if A & B is empty.
|
12
|
+
// = infinity if d > limit
|
13
|
+
// = ed(A[2..],B[2..], d) if A[1] == B[1]
|
14
|
+
// = min ( ed(A[2..],B[2..], d+1),
|
15
|
+
// ed(A, B[2..], d+1),
|
16
|
+
// ed(A[2..],B, d+1) ) otherwise
|
17
|
+
//
|
18
|
+
// However, the code below:
|
19
|
+
// 1) Also allows for swaps
|
20
|
+
// 2) Allow weights to be attached to each edit
|
21
|
+
// 3) Is not recursive, it uses a loop when it is tail recursion
|
22
|
+
// and a small stack otherwise. The stack will NEVER be larger
|
23
|
+
// than 2 * limit.
|
24
|
+
// 4) Is extremely optimized
|
25
|
+
|
26
|
+
|
27
|
+
// check_rest(a, b, s): tests whether strings a and b are identical from the
// given positions up to and including the terminating NUL; if they are, `min`
// is lowered to s when s is smaller.
// The expansion uses a0, b0 (scratch cursors) and min from the enclosing scope.
// Arguments are parenthesized and the body is wrapped in do/while(0) so the
// macro behaves as a single statement. a and b are evaluated once, s may be
// evaluated twice.
#define check_rest(a,b,s)                     \
  do {                                        \
    a0 = (a); b0 = (b);                       \
    while (*a0 == *b0) {                      \
      if (*a0 == '\0') {                      \
        if ((s) < min) min = (s);             \
        break;                                \
      }                                       \
      ++a0; ++b0;                             \
    }                                         \
  } while (0)
|
36
|
+
|
37
|
+
namespace aspeller {
|
38
|
+
|
39
|
+
int limit_edit_distance(const char * a, const char * b,
|
40
|
+
int limit, const EditDistanceWeights & w)
|
41
|
+
{
|
42
|
+
limit = limit*w.max;
|
43
|
+
static const int size = 10;
|
44
|
+
struct Edit {
|
45
|
+
const char * a;
|
46
|
+
const char * b;
|
47
|
+
int score;
|
48
|
+
};
|
49
|
+
Edit begin[size];
|
50
|
+
Edit * i = begin;
|
51
|
+
const char * a0;
|
52
|
+
const char * b0;
|
53
|
+
int score = 0;
|
54
|
+
int min = LARGE_NUM;
|
55
|
+
|
56
|
+
while (true) {
|
57
|
+
|
58
|
+
while (*a == *b) {
|
59
|
+
if (*a == '\0') {
|
60
|
+
if (score < min) min = score;
|
61
|
+
goto FINISH;
|
62
|
+
}
|
63
|
+
++a; ++b;
|
64
|
+
}
|
65
|
+
|
66
|
+
if (*a == '\0') {
|
67
|
+
|
68
|
+
do {
|
69
|
+
score += w.del2;
|
70
|
+
if (score >= min) goto FINISH;
|
71
|
+
++b;
|
72
|
+
} while (*b != '\0');
|
73
|
+
min = score;
|
74
|
+
|
75
|
+
} else if (*b == '\0') {
|
76
|
+
|
77
|
+
do {
|
78
|
+
score += w.del1;
|
79
|
+
if (score >= min) goto FINISH;
|
80
|
+
++a;
|
81
|
+
} while (*a != '\0');
|
82
|
+
min = score;
|
83
|
+
|
84
|
+
} else {
|
85
|
+
|
86
|
+
if (score + w.max <= limit) {
|
87
|
+
if (limit*w.min <= w.max*(w.min+score)) {
|
88
|
+
// if floor(score/max)=limit/max-1 then this edit is only good
|
89
|
+
// if it makes the rest of the string match. So check if
|
90
|
+
// the rest of the string matches to avoid the overhead of
|
91
|
+
// pushing it on then off the stack
|
92
|
+
|
93
|
+
// delete a character from a
|
94
|
+
check_rest(a+1,b,score + w.del1);
|
95
|
+
|
96
|
+
// delete a character from b
|
97
|
+
check_rest(a,b+1,score + w.del2);
|
98
|
+
|
99
|
+
if (*a == *(b+1) && *b == *(a+1)) {
|
100
|
+
|
101
|
+
// swap two characters
|
102
|
+
check_rest(a+2,b+2, score + w.swap);
|
103
|
+
|
104
|
+
} else {
|
105
|
+
|
106
|
+
// substitute one character for another which is the same
|
107
|
+
// thing as deleting a character from both a & b
|
108
|
+
check_rest(a+1,b+1, score + w.sub);
|
109
|
+
|
110
|
+
}
|
111
|
+
|
112
|
+
} else {
|
113
|
+
|
114
|
+
// delete a character from a
|
115
|
+
i->a = a + 1;
|
116
|
+
i->b = b;
|
117
|
+
i->score = score + w.del1;
|
118
|
+
++i;
|
119
|
+
|
120
|
+
// delete a character from b
|
121
|
+
i->a = a;
|
122
|
+
i->b = b + 1;
|
123
|
+
i->score = score + w.del2;
|
124
|
+
++i;
|
125
|
+
|
126
|
+
// If two characters can be swapped and make a match
|
127
|
+
// then the substitution is pointless.
|
128
|
+
// Also, there is no need to push this on the stack as
|
129
|
+
// it is going to be imminently removed.
|
130
|
+
if (*a == *(b+1) && *b == *(a+1)) {
|
131
|
+
|
132
|
+
// swap two characters
|
133
|
+
a = a + 2;
|
134
|
+
b = b + 2;
|
135
|
+
score += w.swap;
|
136
|
+
continue;
|
137
|
+
|
138
|
+
} else {
|
139
|
+
|
140
|
+
// substitute one character for another which is the same
|
141
|
+
// thing as deleting a character from both a & b
|
142
|
+
a = a + 1;
|
143
|
+
b = b + 1;
|
144
|
+
score += w.sub;
|
145
|
+
continue;
|
146
|
+
|
147
|
+
}
|
148
|
+
}
|
149
|
+
}
|
150
|
+
}
|
151
|
+
FINISH:
|
152
|
+
if (i == begin) return min;
|
153
|
+
--i;
|
154
|
+
a = i->a;
|
155
|
+
b = i->b;
|
156
|
+
score = i->score;
|
157
|
+
}
|
158
|
+
}
|
159
|
+
|
160
|
+
#undef check_rest
|
161
|
+
// check_rest(a, b, w): tests whether strings a and b are identical from the
// given positions up to and including the terminating NUL; if they are, `min`
// is lowered to w when w is smaller. Afterwards `amax` is advanced to the
// position where the comparison of a stopped, if that lies further along.
// The expansion uses a0, b0, amax and min from the enclosing scope.
// Arguments are parenthesized and the body is wrapped in do/while(0) so the
// macro behaves as a single statement. a and b are evaluated once, w may be
// evaluated twice.
#define check_rest(a,b,w)                     \
  do {                                        \
    a0 = (a); b0 = (b);                       \
    while (*a0 == *b0) {                      \
      if (*a0 == '\0') {                      \
        if ((w) < min) min = (w);             \
        break;                                \
      }                                       \
      ++a0;                                   \
      ++b0;                                   \
    }                                         \
    if (amax < a0) amax = a0;                 \
  } while (0)
|
172
|
+
|
173
|
+
// check2(a, b, w): continues a comparison after one edit of cost w has
// already been spent, allowing at most one further edit.
//   * a and b match to the end          -> candidate score w
//   * a ends and b has one char left    -> candidate score w + ws.del2
//   * b ends and a has one char left    -> candidate score w + ws.del1
//   * both continue                     -> try delete-from-a, delete-from-b
//                                          and swap or substitute through
//                                          check_rest
// `min` is lowered whenever a candidate is smaller; `amax` tracks the
// furthest position reached in a.
// The expansion uses aa, bb, a0, b0, amax, min and ws from the enclosing
// scope. Arguments are parenthesized and the body is wrapped in do/while(0)
// so the macro behaves as a single statement; w may be evaluated several times.
#define check2(a,b,w)                                                   \
  do {                                                                  \
    aa = (a); bb = (b);                                                 \
    while (*aa == *bb) {                                                \
      if (*aa == '\0') {                                                \
        if (amax < aa) amax = aa;                                       \
        if ((w) < min) min = (w);                                       \
        break;                                                          \
      }                                                                 \
      ++aa; ++bb;                                                       \
    }                                                                   \
    if (*aa == '\0') {                                                  \
      if (amax < aa) amax = aa;                                         \
      if (*bb == '\0') {}                                               \
      else if (*(bb+1) == '\0' && (w)+ws.del2 < min) min = (w)+ws.del2; \
    } else if (*bb == '\0') {                                           \
      ++aa;                                                             \
      if (amax < aa) amax = aa;                                         \
      if (*aa == '\0' && (w)+ws.del1 < min) min = (w)+ws.del1;          \
    } else {                                                            \
      check_rest(aa+1,bb,(w)+ws.del1);                                  \
      check_rest(aa,bb+1,(w)+ws.del2);                                  \
      if (*aa == *(bb+1) && *bb == *(aa+1)) {                           \
        check_rest(aa+2,bb+2,(w)+ws.swap);                              \
      } else {                                                          \
        check_rest(aa+1,bb+1,(w)+ws.sub);                               \
      }                                                                 \
    }                                                                   \
  } while (0)
|
200
|
+
|
201
|
+
// Edit distance restricted to a single edit.
//
// a, b - NUL-terminated strings to compare.
// ws   - edit costs.
//
// Returns an EditDist with the score (0 for equal strings, the cost of the
// one edit that makes them equal, LARGE_NUM otherwise) and the position in
// a where the comparison stopped.
EditDist limit1_edit_distance(const char * a, const char * b,
                              const EditDistanceWeights & ws)
{
  int min = LARGE_NUM;
  const char * a0;          // scratch cursors used by check_rest
  const char * b0;
  const char * amax = a;    // furthest position reached in a

  // Walk over the common prefix; hitting both terminators means equality.
  for (; *a == *b; ++a, ++b) {
    if (*a == '\0')
      return EditDist(0, a);
  }

  // a is exhausted: acceptable only when exactly one character is left in b.
  if (*a == '\0') {
    return EditDist(b[1] == '\0' ? ws.del2 : LARGE_NUM, a);
  }

  // b is exhausted: acceptable only when exactly one character is left in a.
  if (*b == '\0') {
    ++a;
    return EditDist(*a == '\0' ? ws.del1 : LARGE_NUM, a);
  }

  // Both strings continue: try each single edit and keep the cheapest one
  // that makes the remainders equal.
  check_rest(a+1,b,ws.del1);          // delete a character from a
  check_rest(a,b+1,ws.del2);          // delete a character from b

  const bool swappable = (*a == *(b+1)) && (*b == *(a+1));
  if (swappable) {
    check_rest(a+2,b+2,ws.swap);      // swap two characters
  } else {
    check_rest(a+1,b+1,ws.sub);       // substitute one character
  }

  return EditDist(min, amax);
}
|
250
|
+
|
251
|
+
// Edit distance restricted to two edits.
//
// a, b - NUL-terminated strings to compare.
// ws   - edit costs.
//
// Returns an EditDist with the score (0 for equal strings, the cheapest cost
// found using at most two edits, LARGE_NUM otherwise) and the position in a
// where the comparison stopped.
EditDist limit2_edit_distance(const char * a, const char * b,
                              const EditDistanceWeights & ws)
{
  int min = LARGE_NUM;
  const char * a0;          // scratch cursors used by check_rest
  const char * b0;
  const char * aa;          // scratch cursors used by check2
  const char * bb;
  const char * amax = a;    // furthest position reached in a

  // Walk over the common prefix; hitting both terminators means equality.
  for (; *a == *b; ++a, ++b) {
    if (*a == '\0')
      return EditDist(0, a);
  }

  // a is exhausted: one or two characters left in b can still be deleted.
  if (*a == '\0') {
    if (b[1] == '\0') return EditDist(ws.del2, a);
    if (b[2] == '\0') return EditDist(2*ws.del2, a);
    return EditDist(LARGE_NUM, a);
  }

  // b is exhausted: one or two characters left in a can still be deleted.
  if (*b == '\0') {
    if (*++a == '\0') return EditDist(ws.del1, a);
    if (*++a == '\0') return EditDist(2*ws.del1, a);
    return EditDist(LARGE_NUM, a);
  }

  // Both strings continue: spend the first edit here and let check2 look
  // for an optional second one.
  check2(a+1,b,ws.del1);              // delete a character from a
  check2(a,b+1,ws.del2);              // delete a character from b

  const bool swappable = (*a == *(b+1)) && (*b == *(a+1));
  if (swappable) {
    check2(a+2,b+2,ws.swap);          // swap two characters
  } else {
    check2(a+1,b+1,ws.sub);           // substitute one character
  }

  return EditDist(min, amax);
}
|
306
|
+
}
|
307
|
+
|
308
|
+
|
data/ext/leditdist.hpp
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
|
2
|
+
#ifndef __aspeller_leditdist_hh__
|
3
|
+
#define __aspeller_leditdist_hh__
|
4
|
+
|
5
|
+
#include "weights.hpp"
|
6
|
+
|
7
|
+
namespace aspeller {
|
8
|
+
|
9
|
+
// limit_edit_distance finds the shortest edit distance but will
|
10
|
+
// stop and return a number at least as large as LARGE_NUM if it has
|
11
|
+
// to do more edits than a set limit.
|
12
|
+
// Note that this does NOT mean that the score returned is <= limit*w.max
|
13
|
+
// as "sub" vs "submarine" will return 6*(cost of insertion) no matter what
|
14
|
+
// the limit is.
|
15
|
+
// The edit distance is
|
16
|
+
// (cost of swap)(# of swaps) + (cost of deletion)(# of deletions)
|
17
|
+
// + (cost of insertion)(# of insertions)
|
18
|
+
// + (cost of substitutions)(# of substitutions)
|
19
|
+
|
20
|
+
// Preconditions:
|
21
|
+
// max(strlen(a), strlen(b))*max(of the edit weights) <= 2^15
|
22
|
+
// if violated then an incorrect result may be returned (which may be negative)
|
23
|
+
// due to overflow of a short integer
|
24
|
+
// (limit+1)*w.min < limit*w.max
|
25
|
+
// limit <= 5 (use edit_distance if limit > 5)
|
26
|
+
// where w.min and w.max is the minimum and maximum cost of an edit
|
27
|
+
// respectively.
|
28
|
+
|
29
|
+
// The running time is asymptotically bounded above by
|
30
|
+
// (3^l)*n where l is the limit and n is the maximum of strlen(a),strlen(b)
|
31
|
+
// Based on my informal tests, however, the n does not really matter
|
32
|
+
// and the running time is more like (3^l).
|
33
|
+
|
34
|
+
// limit_edit_distance, based on my informal tests, turns out to be
|
35
|
+
// faster than edit_dist for l < 5. For l == 5 it is about the
|
36
|
+
// smaller for short strings (<= 5) and less than for longer strings
|
37
|
+
|
38
|
+
// limit2_edit_distance(a,b,w) = limit_edit_distance(a,b,2,w)
|
39
|
+
// but is roughly 2/3's faster
|
40
|
+
|
41
|
+
// Result of a limited edit-distance computation: the score together with
// the position in the first string at which the comparison stopped.
// Converts implicitly to int, yielding the score.
struct EditDist {
  int score;
  const char * stopped_at;
  // A default-constructed result is zeroed so that reading it is well
  // defined instead of yielding indeterminate values.
  EditDist() : score(0), stopped_at(0) {}
  EditDist(int s, const char * p)
    : score(s), stopped_at(p) {}
  operator int () const {return score;}
};
|
49
|
+
|
50
|
+
static const int LARGE_NUM = 0xFFFFF;
|
51
|
+
// this needs to be SMALLER than INT_MAX since it may be incremented
|
52
|
+
// a few times
|
53
|
+
|
54
|
+
int limit_edit_distance(const char * a, const char * b, int limit,
|
55
|
+
const EditDistanceWeights & w
|
56
|
+
= EditDistanceWeights());
|
57
|
+
|
58
|
+
EditDist limit1_edit_distance(const char * a, const char * b,
|
59
|
+
const EditDistanceWeights & w
|
60
|
+
= EditDistanceWeights());
|
61
|
+
|
62
|
+
EditDist limit2_edit_distance(const char * a, const char * b,
|
63
|
+
const EditDistanceWeights & w
|
64
|
+
= EditDistanceWeights());
|
65
|
+
|
66
|
+
}
|
67
|
+
|
68
|
+
#endif
|
data/ext/weights.hpp
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
|
2
|
+
#ifndef __aspeller_weights_hh__
|
3
|
+
#define __aspeller_weights_hh__
|
4
|
+
|
5
|
+
namespace aspeller {
|
6
|
+
|
7
|
+
// Costs attached to each kind of edit. Every cost starts at 1 except
// `similar`, which starts at 0.
struct EditDistanceWeights {
  int del1;     // cost of deleting a character from the first string
  int del2;     // cost of inserting a character, i.e. deleting one from
                // the second string
  int swap;     // cost of swapping two adjacent letters
  int sub;      // cost of replacing one letter with another
  int similar;  // cost of a "similar" but not exact match of two characters
  int min;      // the min of del1, del2, swap and sub
  int max;      // the max of del1, del2, swap and sub

  EditDistanceWeights()
  {
    del1 = del2 = swap = sub = 1;
    similar = 0;
    min = max = 1;
  }
};
|
20
|
+
|
21
|
+
}
|
22
|
+
|
23
|
+
#endif
|
@@ -0,0 +1,61 @@
|
|
1
|
+
raise "This file should never be required. It's here only for documentation purposes."
|
2
|
+
|
3
|
+
# module, through which the functionality of edit distance calculation is possible
|
4
|
+
module Aspeller

  # Weights used by Aspell to determine the edit distance between two strings.
  class EditDistanceWeights

    # Cost of deleting a character from the first string (defaults to 1).
    attr_accessor :del1

    # Cost of inserting a character, i.e. deleting one from the second
    # string (defaults to 1).
    attr_accessor :del2

    # Cost of swapping two adjacent letters (defaults to 1).
    attr_accessor :swap

    # Cost of replacing one letter with another (defaults to 1).
    attr_accessor :sub

    # Cost of a "similar" but not exact match of two characters (defaults to 0).
    attr_accessor :similar

    # The min of del1, del2, swap and sub (defaults to 1).
    attr_accessor :min

    # The max of del1, del2, swap and sub (defaults to 1).
    attr_accessor :max

  end

  class << self

    # Finds the shortest edit distance, but stops and returns a number at
    # least as large as LARGE_NUM if more edits than the given limit are needed.
    # This does NOT mean that the returned score is <= limit*w.max:
    # "sub" vs "submarine" returns 6*(cost of insertion) whatever the limit is.
    #
    # The edit distance is
    #   (cost of swap)(# of swaps) + (cost of deletion)(# of deletions)
    #   + (cost of insertion)(# of insertions)
    #   + (cost of substitutions)(# of substitutions)
    #
    # Preconditions:
    #   max(strlen(a), strlen(b))*max(of the edit weights) <= 2^15;
    #     if violated then an incorrect (possibly negative) result may be
    #     returned due to overflow of a short integer
    #   (limit+1)*w.min < limit*w.max
    #   limit <= 5 (use edit_distance if limit > 5)
    # where w.min and w.max are the minimum and maximum cost of an edit,
    # respectively.
    #
    # The running time is asymptotically bounded above by (3^l)*n, where l is
    # the limit and n is the maximum of strlen(a) and strlen(b). According to
    # the author's informal tests n hardly matters and the running time is
    # closer to (3^l); in the same tests the function was faster than
    # edit_dist for l < 5.
    def limit_edit_distance(strA, strB, limit, weights)
    end

  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require File.expand_path( File.dirname(__FILE__) + '/test_helper.rb' )
|
2
|
+
|
3
|
+
# Exercises Aspeller.limit_edit_distance through the compiled extension.
class EditDistanceTest < Test::Unit::TestCase

  def test_limit_distance_defined
    assert defined?(Aspeller), "module Aspeller must be defined"
    assert defined?(Aspeller.limit_edit_distance), "method Aspeller.limit_edit_distance must be defined"
  end

  # Integer is checked rather than Fixnum: Fixnum was removed in Ruby 3.2,
  # while Integer is its superclass on older versions.
  def test_limit_distance_returns_fixnum
    result = Aspeller.limit_edit_distance("a", "b", 1, Aspeller::EditDistanceWeights.new)
    assert result.is_a?(Integer), "limit_edit_distance is supposed to return an integer"
  end

  # One substitution ("e" -> "a") with default weights costs 1.
  def test_limit_distance_returns_right_value
    result = Aspeller.limit_edit_distance("test", "tast", 1, Aspeller::EditDistanceWeights.new)
    assert_equal 1, result
  end

  # The same substitution costs 2 once the substitution weight is raised.
  def test_limit_distance_uses_passed_weights
    weights = Aspeller::EditDistanceWeights.new
    weights.sub = 2
    result = Aspeller.limit_edit_distance("test", "tast", 1, weights)
    assert_equal 2, result
  end
end
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
$LOAD_PATH << File.expand_path( File.dirname(__FILE__) + '/../ext' )
$LOAD_PATH << File.expand_path( File.dirname(__FILE__) + '/..' )

require 'test/unit'

# Rebuild the extension from scratch before loading it. The commands run
# inside the project root via Dir.chdir and are passed to system as separate
# arguments, so a path containing spaces or shell metacharacters cannot break
# or alter them. Failures are still ignored, as before (a first run has
# nothing for "make distclean" to clean).
project_root = File.expand_path(File.join(File.dirname(__FILE__), '..'))
Dir.chdir(project_root) do
  system("make", "distclean")
  system("ruby", "ext/extconf.rb")
  system("make")
end

require "aspell_edit_dist"
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require File.expand_path( File.dirname(__FILE__) + '/test_helper.rb' )
|
2
|
+
|
3
|
+
# Checks the Ruby-visible interface of Aspeller::EditDistanceWeights.
class WeightsTest < Test::Unit::TestCase

  WEIGHT_METHODS = [:del1, :del2, :swap, :sub, :similar, :min, :max]

  def test_weights_defined
    assert defined?(Aspeller), "Aspeller module should be defined"
    assert defined?(Aspeller::EditDistanceWeights), "class Aspeller::EditDistanceWeights is supposed to be defined"
  end

  def test_weights_has_constructor
    assert_respond_to Aspeller::EditDistanceWeights, :new
  end

  # Every weight has a reader that returns a value.
  def test_weights_getters
    weights = Aspeller::EditDistanceWeights.new
    for method_name in WEIGHT_METHODS
      assert_respond_to weights, method_name
      assert weights.send(method_name), "weights.#{method_name} should return something"
    end
  end

  # All weights start at 1 except "similar", which starts at 0.
  def test_weights_default_values
    weights = Aspeller::EditDistanceWeights.new
    expected = [[:del1, 1], [:del2, 1], [:swap, 1], [:sub, 1],
                [:similar, 0], [:min, 1], [:max, 1]]
    expected.each do |name, value|
      assert_equal value, weights.send(name)
    end
  end

  # Each writer stores a distinct value (123, 124, ...) that the matching
  # reader returns afterwards.
  def test_weights_setters
    weights = Aspeller::EditDistanceWeights.new

    WEIGHT_METHODS.each_with_index do |method_name, offset|
      n = 123 + offset
      setter_name = :"#{method_name}="

      assert_respond_to weights, setter_name

      weights.send(setter_name, n)
      assert_equal n, weights.send(method_name)
    end
  end

end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: aspell_edit_dist
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Adam Pohorecki
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-29 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description:
|
17
|
+
email: adam@pohorecki.pl
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions:
|
21
|
+
- ext/extconf.rb
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README
|
24
|
+
files:
|
25
|
+
- .gitignore
|
26
|
+
- README
|
27
|
+
- Rakefile
|
28
|
+
- VERSION
|
29
|
+
- ext/aspell_edit_dist.cpp
|
30
|
+
- ext/aspell_edit_dist.h
|
31
|
+
- ext/extconf.rb
|
32
|
+
- ext/leditdist.cpp
|
33
|
+
- ext/leditdist.hpp
|
34
|
+
- ext/weights.hpp
|
35
|
+
- lib/aspell_edit_dist_stub.rb
|
36
|
+
- test/edit_distance_test.rb
|
37
|
+
- test/test_helper.rb
|
38
|
+
- test/weights_test.rb
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: http://github.com/psyho/aspell_edit_dist
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options:
|
45
|
+
- --charset=UTF-8
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: "0"
|
53
|
+
version:
|
54
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - ">="
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: "0"
|
59
|
+
version:
|
60
|
+
requirements: []
|
61
|
+
|
62
|
+
rubyforge_project:
|
63
|
+
rubygems_version: 1.3.5
|
64
|
+
signing_key:
|
65
|
+
specification_version: 3
|
66
|
+
summary: Gem that exposes limit_edit_distance function from Aspell.
|
67
|
+
test_files:
|
68
|
+
- test/edit_distance_test.rb
|
69
|
+
- test/weights_test.rb
|
70
|
+
- test/test_helper.rb
|