mini-levenshtein 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ruby.yml +25 -0
- data/.gitignore +2 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +52 -0
- data/README.md +15 -0
- data/Rakefile +24 -0
- data/ext/mini_levenshtein/extconf.rb +10 -0
- data/ext/mini_levenshtein/levenshtein.c +1056 -0
- data/ext/mini_levenshtein/levenshtein.h +146 -0
- data/ext/mini_levenshtein/mini_levenshtein.c +32 -0
- data/lib/mini-levenshtein/version.rb +5 -0
- data/lib/mini-levenshtein.rb +33 -0
- data/mini-levenshtein.gemspec +32 -0
- metadata +63 -0
@@ -0,0 +1,1056 @@
|
|
1
|
+
/*
|
2
|
+
* This file has been altered to better fit fuzzywuzzy.
|
3
|
+
* To see all changes done, please diff this file with
|
4
|
+
* <https://github.com/Tmplt/python-Levenshtein/blob/master/Levenshtein.c>
|
5
|
+
*
|
6
|
+
* Summary:
|
7
|
+
* - stripped all python-related code and data types;
|
8
|
+
* - fixed some spelling errors.
|
9
|
+
*/
|
10
|
+
|
11
|
+
/*
|
12
|
+
* Levenshtein.c
|
13
|
+
* @(#) $Id: Levenshtein.c,v 1.41 2005/01/13 20:05:36 yeti Exp $
|
14
|
+
* Python extension computing Levenshtein distances, string similarities,
|
15
|
+
* median strings and other goodies.
|
16
|
+
*
|
17
|
+
* Copyright (C) 2002-2003 David Necas (Yeti) <yeti@physics.muni.cz>.
|
18
|
+
*
|
19
|
+
* The Taus113 random generator:
|
20
|
+
* Copyright (C) 2002 Atakan Gurkan
|
21
|
+
* Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough
|
22
|
+
* (see below for more)
|
23
|
+
*
|
24
|
+
* This program is free software; you can redistribute it and/or modify it
|
25
|
+
* under the terms of the GNU General Public License as published by the Free
|
26
|
+
* Software Foundation; either version 2 of the License, or (at your option)
|
27
|
+
* any later version.
|
28
|
+
*
|
29
|
+
* This program is distributed in the hope that it will be useful, but WITHOUT
|
30
|
+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
31
|
+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
32
|
+
* more details.
|
33
|
+
*
|
34
|
+
* You should have received a copy of the GNU General Public License along
|
35
|
+
* with this program; if not, write to the Free Software Foundation, Inc.,
|
36
|
+
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
|
37
|
+
**/
|
38
|
+
|
39
|
+
/**
|
40
|
+
* TODO:
|
41
|
+
*
|
42
|
+
* - Implement weighted string averaging, see:
|
43
|
+
* H. Bunke et. al.: On the Weighted Mean of a Pair of Strings,
|
44
|
+
* Pattern Analysis and Applications 2002, 5(1): 23-30.
|
45
|
+
* X. Jiang et. al.: Dynamic Computations of Generalized Median Strings,
|
46
|
+
* Pattern Analysis and Applications 2002, ???.
|
47
|
+
* The latter also contains an interesting median-search algorithm.
|
48
|
+
*
|
49
|
+
* - Deal with stray symbols in greedy median() and median_improve().
|
50
|
+
* There are two possibilities:
|
51
|
+
* (i) Remember which strings contain which symbols. This allows certain
|
52
|
+
* small optimizations when processing them.
|
53
|
+
* (ii) Use some overall heuristics to find symbols which aren't worth
|
54
|
+
* trying. This is very appealing, but hard to do properly
|
55
|
+
* (requires some inequality strong enough to allow practical exclusion
|
56
|
+
* of certain symbols -- at certain positions)
|
57
|
+
*
|
58
|
+
* - Editops should be an object that only *looks* like a list (which means
|
59
|
+
* it is a list in duck typing) to avoid never-ending conversions from
|
60
|
+
* Python lists to LevEditOp arrays and back
|
61
|
+
*
|
62
|
+
* - Optimize munkers_blackman(), it's pretty dumb (no memory of visited
|
63
|
+
* columns/rows)
|
64
|
+
*
|
65
|
+
* - Make it really usable as a C library (needs some wrappers, headers, ...,
|
66
|
+
* and maybe even documentation ;-)
|
67
|
+
*
|
68
|
+
* - Add interface to various interesting auxiliary results, namely
|
69
|
+
* set and sequence distance (only ratio is exported), the map from
|
70
|
+
* munkers_blackman() itself, ...
|
71
|
+
*
|
72
|
+
* - Generalizations:
|
73
|
+
* - character weight matrix/function
|
74
|
+
* - arbitrary edit operation costs, decomposable edit operations
|
75
|
+
*
|
76
|
+
* - Create a test suite
|
77
|
+
*
|
78
|
+
* - Add more interesting algorithms ;-)
|
79
|
+
*
|
80
|
+
* Postponed TODO (investigated, and a big `but' was found):
|
81
|
+
*
|
82
|
+
* - A linear approximate set median algorithm:
|
83
|
+
* P. Indyk: Sublinear time algorithms for metric space problems,
|
84
|
+
* STOC 1999, http://citeseer.nj.nec.com/indyk00sublinear.html.
|
85
|
+
* BUT: The algorithm seems to be advantageous only in the case of very
|
86
|
+
* large sets -- if my estimates are correct (the article itself is quite
|
87
|
+
* `asymptotic'), say 10^5 at least. On smaller sets either one would get
|
88
|
+
* only an extremely rough median estimate, or the number of distance
|
89
|
+
* computations would be in fact higher than in the dumb O(n^2) algorithm.
|
90
|
+
*
|
91
|
+
* - Improve setmedian() speed with triangular inequality, see:
|
92
|
+
* Juan, A., E. Vidal: An Algorithm for Fast Median Search,
|
93
|
+
* 1997, http://citeseer.nj.nec.com/article/juan97algorithm.html
|
94
|
+
* BUT: It doesn't seem to help much in spaces of high dimension (see the
|
95
|
+
* discussion and graphs in the article itself), a few percents at most,
|
96
|
+
* and strings behave like a space with a very high dimension (locally), so
|
97
|
+
* who knows, it probably wouldn't help much.
|
98
|
+
*
|
99
|
+
**/
|
100
|
+
|
101
|
+
#ifndef _GNU_SOURCE
|
102
|
+
#define _GNU_SOURCE
|
103
|
+
#endif
|
104
|
+
|
105
|
+
#include <string.h>
|
106
|
+
#include <math.h>
|
107
|
+
/* for debugging */
|
108
|
+
#include <stdio.h>
|
109
|
+
|
110
|
+
#include <assert.h>
|
111
|
+
#include "levenshtein.h"
|
112
|
+
|
113
|
+
/**
|
114
|
+
* lev_edit_distance:
|
115
|
+
* @len1: The length of @string1.
|
116
|
+
* @string1: A sequence of bytes of length @len1, may contain NUL characters.
|
117
|
+
* @len2: The length of @string2.
|
118
|
+
* @string2: A sequence of bytes of length @len2, may contain NUL characters.
|
119
|
+
* @xcost: If nonzero, the replace operation has weight 2, otherwise all
|
120
|
+
* edit operations have equal weights of 1.
|
121
|
+
*
|
122
|
+
* Computes Levenshtein edit distance of two strings.
|
123
|
+
*
|
124
|
+
* Returns: The edit distance.
|
125
|
+
**/
|
126
|
+
size_t
|
127
|
+
lev_edit_distance(size_t len1, const lev_byte *string1,
|
128
|
+
size_t len2, const lev_byte *string2,
|
129
|
+
int xcost)
|
130
|
+
{
|
131
|
+
size_t i;
|
132
|
+
size_t *row; /* we only need to keep one row of costs */
|
133
|
+
size_t *end;
|
134
|
+
size_t half;
|
135
|
+
|
136
|
+
/* strip common prefix */
|
137
|
+
while (len1 > 0 && len2 > 0 && *string1 == *string2)
|
138
|
+
{
|
139
|
+
len1--;
|
140
|
+
len2--;
|
141
|
+
string1++;
|
142
|
+
string2++;
|
143
|
+
}
|
144
|
+
|
145
|
+
/* strip common suffix */
|
146
|
+
while (len1 > 0 && len2 > 0 && string1[len1 - 1] == string2[len2 - 1])
|
147
|
+
{
|
148
|
+
len1--;
|
149
|
+
len2--;
|
150
|
+
}
|
151
|
+
|
152
|
+
/* catch trivial cases */
|
153
|
+
if (len1 == 0)
|
154
|
+
return len2;
|
155
|
+
if (len2 == 0)
|
156
|
+
return len1;
|
157
|
+
|
158
|
+
/* make the inner cycle (i.e. string2) the longer one */
|
159
|
+
if (len1 > len2)
|
160
|
+
{
|
161
|
+
size_t nx = len1;
|
162
|
+
const lev_byte *sx = string1;
|
163
|
+
len1 = len2;
|
164
|
+
len2 = nx;
|
165
|
+
string1 = string2;
|
166
|
+
string2 = sx;
|
167
|
+
}
|
168
|
+
/* check len1 == 1 separately */
|
169
|
+
if (len1 == 1)
|
170
|
+
{
|
171
|
+
if (xcost)
|
172
|
+
return len2 + 1 - 2 * (memchr(string2, *string1, len2) != NULL);
|
173
|
+
else
|
174
|
+
return len2 - (memchr(string2, *string1, len2) != NULL);
|
175
|
+
}
|
176
|
+
len1++;
|
177
|
+
len2++;
|
178
|
+
half = len1 >> 1;
|
179
|
+
|
180
|
+
/* initialize first row */
|
181
|
+
row = (size_t *)malloc(len2 * sizeof(size_t));
|
182
|
+
if (!row)
|
183
|
+
return (size_t)(-1);
|
184
|
+
end = row + len2 - 1;
|
185
|
+
for (i = 0; i < len2 - (xcost ? 0 : half); i++)
|
186
|
+
row[i] = i;
|
187
|
+
|
188
|
+
/* go through the matrix and compute the costs. yes, this is an extremely
|
189
|
+
* obfuscated version, but also extremely memory-conservative and relatively
|
190
|
+
* fast. */
|
191
|
+
if (xcost)
|
192
|
+
{
|
193
|
+
for (i = 1; i < len1; i++)
|
194
|
+
{
|
195
|
+
size_t *p = row + 1;
|
196
|
+
const lev_byte char1 = string1[i - 1];
|
197
|
+
const lev_byte *char2p = string2;
|
198
|
+
size_t D = i;
|
199
|
+
size_t x = i;
|
200
|
+
while (p <= end)
|
201
|
+
{
|
202
|
+
if (char1 == *(char2p++))
|
203
|
+
x = --D;
|
204
|
+
else
|
205
|
+
x++;
|
206
|
+
D = *p;
|
207
|
+
D++;
|
208
|
+
if (x > D)
|
209
|
+
x = D;
|
210
|
+
*(p++) = x;
|
211
|
+
}
|
212
|
+
}
|
213
|
+
}
|
214
|
+
else
|
215
|
+
{
|
216
|
+
/* in this case we don't have to scan two corner triangles (of size len1/2)
|
217
|
+
* in the matrix because no best path can go thought them. note this
|
218
|
+
* breaks when len1 == len2 == 2 so the memchr() special case above is
|
219
|
+
* necessary */
|
220
|
+
row[0] = len1 - half - 1;
|
221
|
+
for (i = 1; i < len1; i++)
|
222
|
+
{
|
223
|
+
size_t *p;
|
224
|
+
const lev_byte char1 = string1[i - 1];
|
225
|
+
const lev_byte *char2p;
|
226
|
+
size_t D, x;
|
227
|
+
/* skip the upper triangle */
|
228
|
+
if (i >= len1 - half)
|
229
|
+
{
|
230
|
+
size_t offset = i - (len1 - half);
|
231
|
+
size_t c3;
|
232
|
+
|
233
|
+
char2p = string2 + offset;
|
234
|
+
p = row + offset;
|
235
|
+
c3 = *(p++) + (char1 != *(char2p++));
|
236
|
+
x = *p;
|
237
|
+
x++;
|
238
|
+
D = x;
|
239
|
+
if (x > c3)
|
240
|
+
x = c3;
|
241
|
+
*(p++) = x;
|
242
|
+
}
|
243
|
+
else
|
244
|
+
{
|
245
|
+
p = row + 1;
|
246
|
+
char2p = string2;
|
247
|
+
D = x = i;
|
248
|
+
}
|
249
|
+
/* skip the lower triangle */
|
250
|
+
if (i <= half + 1)
|
251
|
+
end = row + len2 + i - half - 2;
|
252
|
+
/* main */
|
253
|
+
while (p <= end)
|
254
|
+
{
|
255
|
+
size_t c3 = --D + (char1 != *(char2p++));
|
256
|
+
x++;
|
257
|
+
if (x > c3)
|
258
|
+
x = c3;
|
259
|
+
D = *p;
|
260
|
+
D++;
|
261
|
+
if (x > D)
|
262
|
+
x = D;
|
263
|
+
*(p++) = x;
|
264
|
+
}
|
265
|
+
/* lower triangle sentinel */
|
266
|
+
if (i <= half)
|
267
|
+
{
|
268
|
+
size_t c3 = --D + (char1 != *char2p);
|
269
|
+
x++;
|
270
|
+
if (x > c3)
|
271
|
+
x = c3;
|
272
|
+
*p = x;
|
273
|
+
}
|
274
|
+
}
|
275
|
+
}
|
276
|
+
|
277
|
+
i = *end;
|
278
|
+
free(row);
|
279
|
+
return i;
|
280
|
+
}
|
281
|
+
|
282
|
+
/**
|
283
|
+
* editops_from_cost_matrix:
|
284
|
+
* @len1: The length of @string1.
|
285
|
+
* @string1: A string of length @len1, may contain NUL characters.
|
286
|
+
* @o1: The offset where the matrix starts from the start of @string1.
|
287
|
+
* @len2: The length of @string2.
|
288
|
+
* @string2: A string of length @len2, may contain NUL characters.
|
289
|
+
* @o2: The offset where the matrix starts from the start of @string2.
|
290
|
+
* @matrix: The cost matrix.
|
291
|
+
* @n: Where the number of edit operations should be stored.
|
292
|
+
*
|
293
|
+
* Reconstructs the optimal edit sequence from the cost matrix @matrix.
|
294
|
+
*
|
295
|
+
* The matrix is freed.
|
296
|
+
*
|
297
|
+
* Returns: The optimal edit sequence, as a newly allocated array of
|
298
|
+
* elementary edit operations, it length is stored in @n.
|
299
|
+
**/
|
300
|
+
static LevEditOp *
|
301
|
+
editops_from_cost_matrix(size_t len1, const lev_byte *string1, size_t off1,
|
302
|
+
size_t len2, const lev_byte *string2, size_t off2,
|
303
|
+
size_t *matrix, size_t *n)
|
304
|
+
{
|
305
|
+
size_t *p;
|
306
|
+
size_t i, j, pos;
|
307
|
+
LevEditOp *ops;
|
308
|
+
int dir = 0;
|
309
|
+
|
310
|
+
pos = *n = matrix[len1 * len2 - 1];
|
311
|
+
if (!*n)
|
312
|
+
{
|
313
|
+
free(matrix);
|
314
|
+
return NULL;
|
315
|
+
}
|
316
|
+
ops = (LevEditOp *)malloc((*n) * sizeof(LevEditOp));
|
317
|
+
if (!ops)
|
318
|
+
{
|
319
|
+
free(matrix);
|
320
|
+
*n = (size_t)(-1);
|
321
|
+
return NULL;
|
322
|
+
}
|
323
|
+
i = len1 - 1;
|
324
|
+
j = len2 - 1;
|
325
|
+
p = matrix + len1 * len2 - 1;
|
326
|
+
while (i || j)
|
327
|
+
{
|
328
|
+
/* prefer contiuning in the same direction */
|
329
|
+
if (dir < 0 && j && *p == *(p - 1) + 1)
|
330
|
+
{
|
331
|
+
pos--;
|
332
|
+
ops[pos].type = LEV_EDIT_INSERT;
|
333
|
+
ops[pos].spos = i + off1;
|
334
|
+
ops[pos].dpos = --j + off2;
|
335
|
+
p--;
|
336
|
+
continue;
|
337
|
+
}
|
338
|
+
if (dir > 0 && i && *p == *(p - len2) + 1)
|
339
|
+
{
|
340
|
+
pos--;
|
341
|
+
ops[pos].type = LEV_EDIT_DELETE;
|
342
|
+
ops[pos].spos = --i + off1;
|
343
|
+
ops[pos].dpos = j + off2;
|
344
|
+
p -= len2;
|
345
|
+
continue;
|
346
|
+
}
|
347
|
+
if (i && j && *p == *(p - len2 - 1) && string1[i - 1] == string2[j - 1])
|
348
|
+
{
|
349
|
+
/* don't be stupid like difflib, don't store LEV_EDIT_KEEP */
|
350
|
+
i--;
|
351
|
+
j--;
|
352
|
+
p -= len2 + 1;
|
353
|
+
dir = 0;
|
354
|
+
continue;
|
355
|
+
}
|
356
|
+
if (i && j && *p == *(p - len2 - 1) + 1)
|
357
|
+
{
|
358
|
+
pos--;
|
359
|
+
ops[pos].type = LEV_EDIT_REPLACE;
|
360
|
+
ops[pos].spos = --i + off1;
|
361
|
+
ops[pos].dpos = --j + off2;
|
362
|
+
p -= len2 + 1;
|
363
|
+
dir = 0;
|
364
|
+
continue;
|
365
|
+
}
|
366
|
+
/* we cant't turn directly from -1 to 1, in this case it would be better
|
367
|
+
* to go diagonally, but check it (dir == 0) */
|
368
|
+
if (dir == 0 && j && *p == *(p - 1) + 1)
|
369
|
+
{
|
370
|
+
pos--;
|
371
|
+
ops[pos].type = LEV_EDIT_INSERT;
|
372
|
+
ops[pos].spos = i + off1;
|
373
|
+
ops[pos].dpos = --j + off2;
|
374
|
+
p--;
|
375
|
+
dir = -1;
|
376
|
+
continue;
|
377
|
+
}
|
378
|
+
if (dir == 0 && i && *p == *(p - len2) + 1)
|
379
|
+
{
|
380
|
+
pos--;
|
381
|
+
ops[pos].type = LEV_EDIT_DELETE;
|
382
|
+
ops[pos].spos = --i + off1;
|
383
|
+
ops[pos].dpos = j + off2;
|
384
|
+
p -= len2;
|
385
|
+
dir = 1;
|
386
|
+
continue;
|
387
|
+
}
|
388
|
+
/* coredump right now, later might be too late ;-) */
|
389
|
+
assert("lost in the cost matrix" == NULL);
|
390
|
+
}
|
391
|
+
free(matrix);
|
392
|
+
|
393
|
+
return ops;
|
394
|
+
}
|
395
|
+
|
396
|
+
/**
|
397
|
+
* lev_editops_find:
|
398
|
+
* @len1: The length of @string1.
|
399
|
+
* @string1: A string of length @len1, may contain NUL characters.
|
400
|
+
* @len2: The length of @string2.
|
401
|
+
* @string2: A string of length @len2, may contain NUL characters.
|
402
|
+
* @n: Where the number of edit operations should be stored.
|
403
|
+
*
|
404
|
+
* Find an optimal edit sequence from @string1 to @string2.
|
405
|
+
*
|
406
|
+
* When there's more than one optimal sequence, a one is arbitrarily (though
|
407
|
+
* deterministically) chosen.
|
408
|
+
*
|
409
|
+
* Returns: The optimal edit sequence, as a newly allocated array of
|
410
|
+
* elementary edit operations, it length is stored in @n.
|
411
|
+
* It is normalized, i.e., keep operations are not included.
|
412
|
+
**/
|
413
|
+
LevEditOp *
|
414
|
+
lev_editops_find(size_t len1, const lev_byte *string1,
|
415
|
+
size_t len2, const lev_byte *string2,
|
416
|
+
size_t *n)
|
417
|
+
{
|
418
|
+
size_t len1o, len2o;
|
419
|
+
size_t i;
|
420
|
+
size_t *matrix; /* cost matrix */
|
421
|
+
|
422
|
+
/* strip common prefix */
|
423
|
+
len1o = 0;
|
424
|
+
while (len1 > 0 && len2 > 0 && *string1 == *string2)
|
425
|
+
{
|
426
|
+
len1--;
|
427
|
+
len2--;
|
428
|
+
string1++;
|
429
|
+
string2++;
|
430
|
+
len1o++;
|
431
|
+
}
|
432
|
+
len2o = len1o;
|
433
|
+
|
434
|
+
/* strip common suffix */
|
435
|
+
while (len1 > 0 && len2 > 0 && string1[len1 - 1] == string2[len2 - 1])
|
436
|
+
{
|
437
|
+
len1--;
|
438
|
+
len2--;
|
439
|
+
}
|
440
|
+
len1++;
|
441
|
+
len2++;
|
442
|
+
|
443
|
+
/* initalize first row and column */
|
444
|
+
matrix = (size_t *)malloc(len1 * len2 * sizeof(size_t));
|
445
|
+
if (!matrix)
|
446
|
+
{
|
447
|
+
*n = (size_t)(-1);
|
448
|
+
return NULL;
|
449
|
+
}
|
450
|
+
for (i = 0; i < len2; i++)
|
451
|
+
matrix[i] = i;
|
452
|
+
for (i = 1; i < len1; i++)
|
453
|
+
matrix[len2 * i] = i;
|
454
|
+
|
455
|
+
/* find the costs and fill the matrix */
|
456
|
+
for (i = 1; i < len1; i++)
|
457
|
+
{
|
458
|
+
size_t *prev = matrix + (i - 1) * len2;
|
459
|
+
size_t *p = matrix + i * len2;
|
460
|
+
size_t *end = p + len2 - 1;
|
461
|
+
const lev_byte char1 = string1[i - 1];
|
462
|
+
const lev_byte *char2p = string2;
|
463
|
+
size_t x = i;
|
464
|
+
p++;
|
465
|
+
while (p <= end)
|
466
|
+
{
|
467
|
+
size_t c3 = *(prev++) + (char1 != *(char2p++));
|
468
|
+
x++;
|
469
|
+
if (x > c3)
|
470
|
+
x = c3;
|
471
|
+
c3 = *prev + 1;
|
472
|
+
if (x > c3)
|
473
|
+
x = c3;
|
474
|
+
*(p++) = x;
|
475
|
+
}
|
476
|
+
}
|
477
|
+
|
478
|
+
/* find the way back */
|
479
|
+
return editops_from_cost_matrix(len1, string1, len1o,
|
480
|
+
len2, string2, len2o,
|
481
|
+
matrix, n);
|
482
|
+
}
|
483
|
+
|
484
|
+
/**
|
485
|
+
* lev_u_edit_distance:
|
486
|
+
* @len1: The length of @string1.
|
487
|
+
* @string1: A sequence of Unicode characters of length @len1, may contain NUL
|
488
|
+
* characters.
|
489
|
+
* @len2: The length of @string2.
|
490
|
+
* @string2: A sequence of Unicode characters of length @len2, may contain NUL
|
491
|
+
* characters.
|
492
|
+
* @xcost: If nonzero, the replace operation has weight 2, otherwise all
|
493
|
+
* edit operations have equal weights of 1.
|
494
|
+
*
|
495
|
+
* Computes Levenshtein edit distance of two Unicode strings.
|
496
|
+
*
|
497
|
+
* Returns: The edit distance.
|
498
|
+
**/
|
499
|
+
size_t
|
500
|
+
lev_u_edit_distance(size_t len1, const lev_wchar *string1,
|
501
|
+
size_t len2, const lev_wchar *string2,
|
502
|
+
int xcost)
|
503
|
+
{
|
504
|
+
size_t i;
|
505
|
+
size_t *row; /* we only need to keep one row of costs */
|
506
|
+
size_t *end;
|
507
|
+
size_t half;
|
508
|
+
|
509
|
+
/* strip common prefix */
|
510
|
+
while (len1 > 0 && len2 > 0 && *string1 == *string2)
|
511
|
+
{
|
512
|
+
len1--;
|
513
|
+
len2--;
|
514
|
+
string1++;
|
515
|
+
string2++;
|
516
|
+
}
|
517
|
+
|
518
|
+
/* strip common suffix */
|
519
|
+
while (len1 > 0 && len2 > 0 && string1[len1 - 1] == string2[len2 - 1])
|
520
|
+
{
|
521
|
+
len1--;
|
522
|
+
len2--;
|
523
|
+
}
|
524
|
+
|
525
|
+
/* catch trivial cases */
|
526
|
+
if (len1 == 0)
|
527
|
+
return len2;
|
528
|
+
if (len2 == 0)
|
529
|
+
return len1;
|
530
|
+
|
531
|
+
/* make the inner cycle (i.e. string2) the longer one */
|
532
|
+
if (len1 > len2)
|
533
|
+
{
|
534
|
+
size_t nx = len1;
|
535
|
+
const lev_wchar *sx = string1;
|
536
|
+
len1 = len2;
|
537
|
+
len2 = nx;
|
538
|
+
string1 = string2;
|
539
|
+
string2 = sx;
|
540
|
+
}
|
541
|
+
/* check len1 == 1 separately */
|
542
|
+
if (len1 == 1)
|
543
|
+
{
|
544
|
+
lev_wchar z = *string1;
|
545
|
+
const lev_wchar *p = string2;
|
546
|
+
for (i = len2; i; i--)
|
547
|
+
{
|
548
|
+
if (*(p++) == z)
|
549
|
+
return len2 - 1;
|
550
|
+
}
|
551
|
+
return len2 + (xcost != 0);
|
552
|
+
}
|
553
|
+
len1++;
|
554
|
+
len2++;
|
555
|
+
half = len1 >> 1;
|
556
|
+
|
557
|
+
/* initalize first row */
|
558
|
+
row = (size_t *)malloc(len2 * sizeof(size_t));
|
559
|
+
if (!row)
|
560
|
+
return (size_t)(-1);
|
561
|
+
end = row + len2 - 1;
|
562
|
+
for (i = 0; i < len2 - (xcost ? 0 : half); i++)
|
563
|
+
row[i] = i;
|
564
|
+
|
565
|
+
/* go through the matrix and compute the costs. yes, this is an extremely
|
566
|
+
* obfuscated version, but also extremely memory-conservative and relatively
|
567
|
+
* fast. */
|
568
|
+
if (xcost)
|
569
|
+
{
|
570
|
+
for (i = 1; i < len1; i++)
|
571
|
+
{
|
572
|
+
size_t *p = row + 1;
|
573
|
+
const lev_wchar char1 = string1[i - 1];
|
574
|
+
const lev_wchar *char2p = string2;
|
575
|
+
size_t D = i - 1;
|
576
|
+
size_t x = i;
|
577
|
+
while (p <= end)
|
578
|
+
{
|
579
|
+
if (char1 == *(char2p++))
|
580
|
+
x = D;
|
581
|
+
else
|
582
|
+
x++;
|
583
|
+
D = *p;
|
584
|
+
if (x > D + 1)
|
585
|
+
x = D + 1;
|
586
|
+
*(p++) = x;
|
587
|
+
}
|
588
|
+
}
|
589
|
+
}
|
590
|
+
else
|
591
|
+
{
|
592
|
+
/* in this case we don't have to scan two corner triangles (of size len1/2)
|
593
|
+
* in the matrix because no best path can go throught them. note this
|
594
|
+
* breaks when len1 == len2 == 2 so the memchr() special case above is
|
595
|
+
* necessary */
|
596
|
+
row[0] = len1 - half - 1;
|
597
|
+
for (i = 1; i < len1; i++)
|
598
|
+
{
|
599
|
+
size_t *p;
|
600
|
+
const lev_wchar char1 = string1[i - 1];
|
601
|
+
const lev_wchar *char2p;
|
602
|
+
size_t D, x;
|
603
|
+
/* skip the upper triangle */
|
604
|
+
if (i >= len1 - half)
|
605
|
+
{
|
606
|
+
size_t offset = i - (len1 - half);
|
607
|
+
size_t c3;
|
608
|
+
|
609
|
+
char2p = string2 + offset;
|
610
|
+
p = row + offset;
|
611
|
+
c3 = *(p++) + (char1 != *(char2p++));
|
612
|
+
x = *p;
|
613
|
+
x++;
|
614
|
+
D = x;
|
615
|
+
if (x > c3)
|
616
|
+
x = c3;
|
617
|
+
*(p++) = x;
|
618
|
+
}
|
619
|
+
else
|
620
|
+
{
|
621
|
+
p = row + 1;
|
622
|
+
char2p = string2;
|
623
|
+
D = x = i;
|
624
|
+
}
|
625
|
+
/* skip the lower triangle */
|
626
|
+
if (i <= half + 1)
|
627
|
+
end = row + len2 + i - half - 2;
|
628
|
+
/* main */
|
629
|
+
while (p <= end)
|
630
|
+
{
|
631
|
+
size_t c3 = --D + (char1 != *(char2p++));
|
632
|
+
x++;
|
633
|
+
if (x > c3)
|
634
|
+
x = c3;
|
635
|
+
D = *p;
|
636
|
+
D++;
|
637
|
+
if (x > D)
|
638
|
+
x = D;
|
639
|
+
*(p++) = x;
|
640
|
+
}
|
641
|
+
/* lower triangle sentinel */
|
642
|
+
if (i <= half)
|
643
|
+
{
|
644
|
+
size_t c3 = --D + (char1 != *char2p);
|
645
|
+
x++;
|
646
|
+
if (x > c3)
|
647
|
+
x = c3;
|
648
|
+
*p = x;
|
649
|
+
}
|
650
|
+
}
|
651
|
+
}
|
652
|
+
|
653
|
+
i = *end;
|
654
|
+
free(row);
|
655
|
+
return i;
|
656
|
+
}
|
657
|
+
|
658
|
+
/**
|
659
|
+
* lev_editops_to_opcodes:
|
660
|
+
* @n: The size of @ops.
|
661
|
+
* @ops: An array of elementary edit operations.
|
662
|
+
* @nb: Where the number of difflib block operation codes should be stored.
|
663
|
+
* @len1: The length of the source string.
|
664
|
+
* @len2: The length of the destination string.
|
665
|
+
*
|
666
|
+
* Converts elementary edit operations to difflib block operation codes.
|
667
|
+
*
|
668
|
+
* Note the string lengths are necessary since difflib doesn't allow omitting
|
669
|
+
* keep operations.
|
670
|
+
*
|
671
|
+
* Returns: The converted block operation codes, as a newly allocated array;
|
672
|
+
* its length is stored in @nb.
|
673
|
+
**/
|
674
|
+
LevOpCode *
|
675
|
+
lev_editops_to_opcodes(size_t n, const LevEditOp *ops, size_t *nb,
|
676
|
+
size_t len1, size_t len2)
|
677
|
+
{
|
678
|
+
size_t nbl, i, spos, dpos;
|
679
|
+
const LevEditOp *o;
|
680
|
+
LevOpCode *bops, *b;
|
681
|
+
LevEditType type;
|
682
|
+
|
683
|
+
/* compute the number of blocks */
|
684
|
+
nbl = 0;
|
685
|
+
o = ops;
|
686
|
+
spos = dpos = 0;
|
687
|
+
type = LEV_EDIT_KEEP;
|
688
|
+
for (i = n; i;)
|
689
|
+
{
|
690
|
+
/* simply pretend there are no keep blocks */
|
691
|
+
while (o->type == LEV_EDIT_KEEP && --i)
|
692
|
+
o++;
|
693
|
+
if (!i)
|
694
|
+
break;
|
695
|
+
if (spos < o->spos || dpos < o->dpos)
|
696
|
+
{
|
697
|
+
nbl++;
|
698
|
+
spos = o->spos;
|
699
|
+
dpos = o->dpos;
|
700
|
+
}
|
701
|
+
nbl++;
|
702
|
+
type = o->type;
|
703
|
+
switch (type)
|
704
|
+
{
|
705
|
+
case LEV_EDIT_REPLACE:
|
706
|
+
do
|
707
|
+
{
|
708
|
+
spos++;
|
709
|
+
dpos++;
|
710
|
+
i--;
|
711
|
+
o++;
|
712
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
713
|
+
break;
|
714
|
+
|
715
|
+
case LEV_EDIT_DELETE:
|
716
|
+
do
|
717
|
+
{
|
718
|
+
spos++;
|
719
|
+
i--;
|
720
|
+
o++;
|
721
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
722
|
+
break;
|
723
|
+
|
724
|
+
case LEV_EDIT_INSERT:
|
725
|
+
do
|
726
|
+
{
|
727
|
+
dpos++;
|
728
|
+
i--;
|
729
|
+
o++;
|
730
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
731
|
+
break;
|
732
|
+
|
733
|
+
default:
|
734
|
+
break;
|
735
|
+
}
|
736
|
+
}
|
737
|
+
if (spos < len1 || dpos < len2)
|
738
|
+
nbl++;
|
739
|
+
|
740
|
+
/* convert */
|
741
|
+
b = bops = (LevOpCode *)malloc(nbl * sizeof(LevOpCode));
|
742
|
+
if (!bops)
|
743
|
+
{
|
744
|
+
*nb = (size_t)(-1);
|
745
|
+
return NULL;
|
746
|
+
}
|
747
|
+
o = ops;
|
748
|
+
spos = dpos = 0;
|
749
|
+
type = LEV_EDIT_KEEP;
|
750
|
+
for (i = n; i;)
|
751
|
+
{
|
752
|
+
/* simply pretend there are no keep blocks */
|
753
|
+
while (o->type == LEV_EDIT_KEEP && --i)
|
754
|
+
o++;
|
755
|
+
if (!i)
|
756
|
+
break;
|
757
|
+
b->sbeg = spos;
|
758
|
+
b->dbeg = dpos;
|
759
|
+
if (spos < o->spos || dpos < o->dpos)
|
760
|
+
{
|
761
|
+
b->type = LEV_EDIT_KEEP;
|
762
|
+
spos = b->send = o->spos;
|
763
|
+
dpos = b->dend = o->dpos;
|
764
|
+
b++;
|
765
|
+
b->sbeg = spos;
|
766
|
+
b->dbeg = dpos;
|
767
|
+
}
|
768
|
+
type = o->type;
|
769
|
+
switch (type)
|
770
|
+
{
|
771
|
+
case LEV_EDIT_REPLACE:
|
772
|
+
do
|
773
|
+
{
|
774
|
+
spos++;
|
775
|
+
dpos++;
|
776
|
+
i--;
|
777
|
+
o++;
|
778
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
779
|
+
break;
|
780
|
+
|
781
|
+
case LEV_EDIT_DELETE:
|
782
|
+
do
|
783
|
+
{
|
784
|
+
spos++;
|
785
|
+
i--;
|
786
|
+
o++;
|
787
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
788
|
+
break;
|
789
|
+
|
790
|
+
case LEV_EDIT_INSERT:
|
791
|
+
do
|
792
|
+
{
|
793
|
+
dpos++;
|
794
|
+
i--;
|
795
|
+
o++;
|
796
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
797
|
+
break;
|
798
|
+
|
799
|
+
default:
|
800
|
+
break;
|
801
|
+
}
|
802
|
+
b->type = type;
|
803
|
+
b->send = spos;
|
804
|
+
b->dend = dpos;
|
805
|
+
b++;
|
806
|
+
}
|
807
|
+
if (spos < len1 || dpos < len2)
|
808
|
+
{
|
809
|
+
assert(len1 - spos == len2 - dpos);
|
810
|
+
b->type = LEV_EDIT_KEEP;
|
811
|
+
b->sbeg = spos;
|
812
|
+
b->dbeg = dpos;
|
813
|
+
b->send = len1;
|
814
|
+
b->dend = len2;
|
815
|
+
b++;
|
816
|
+
}
|
817
|
+
assert((size_t)(b - bops) == nbl);
|
818
|
+
|
819
|
+
*nb = nbl;
|
820
|
+
return bops;
|
821
|
+
}
|
822
|
+
|
823
|
+
/**
|
824
|
+
* lev_opcodes_matching_blocks:
|
825
|
+
* @len1: The length of the source string.
|
826
|
+
* @len2: The length of the destination string.
|
827
|
+
* @nb: The size of @bops.
|
828
|
+
* @bops: An array of difflib block edit operation codes.
|
829
|
+
* @nmblocks: Where the number of matching block should be stored.
|
830
|
+
*
|
831
|
+
* Computes the matching block corresponding to an optimal edit @bops.
|
832
|
+
*
|
833
|
+
* Returns: The matching blocks as a newly allocated array, it length is
|
834
|
+
* stored in @nmblocks.
|
835
|
+
**/
|
836
|
+
LevMatchingBlock *
|
837
|
+
lev_opcodes_matching_blocks(size_t len1,
|
838
|
+
__attribute__((unused)) size_t len2,
|
839
|
+
size_t nb,
|
840
|
+
const LevOpCode *bops,
|
841
|
+
size_t *nmblocks)
|
842
|
+
{
|
843
|
+
size_t nmb, i;
|
844
|
+
const LevOpCode *b;
|
845
|
+
LevMatchingBlock *mblocks, *mb;
|
846
|
+
|
847
|
+
/* compute the number of matching blocks */
|
848
|
+
nmb = 0;
|
849
|
+
b = bops;
|
850
|
+
for (i = nb; i; i--, b++)
|
851
|
+
{
|
852
|
+
if (b->type == LEV_EDIT_KEEP)
|
853
|
+
{
|
854
|
+
nmb++;
|
855
|
+
/* adjacent KEEP blocks -- we never produce it, but... */
|
856
|
+
while (i && b->type == LEV_EDIT_KEEP)
|
857
|
+
{
|
858
|
+
i--;
|
859
|
+
b++;
|
860
|
+
}
|
861
|
+
if (!i)
|
862
|
+
break;
|
863
|
+
}
|
864
|
+
}
|
865
|
+
|
866
|
+
/* convert */
|
867
|
+
mb = mblocks = (LevMatchingBlock *)malloc(nmb * sizeof(LevOpCode));
|
868
|
+
if (!mblocks)
|
869
|
+
{
|
870
|
+
*nmblocks = (size_t)(-1);
|
871
|
+
return NULL;
|
872
|
+
}
|
873
|
+
b = bops;
|
874
|
+
for (i = nb; i; i--, b++)
|
875
|
+
{
|
876
|
+
if (b->type == LEV_EDIT_KEEP)
|
877
|
+
{
|
878
|
+
mb->spos = b->sbeg;
|
879
|
+
mb->dpos = b->dbeg;
|
880
|
+
/* adjacent KEEP blocks -- we never produce it, but... */
|
881
|
+
while (i && b->type == LEV_EDIT_KEEP)
|
882
|
+
{
|
883
|
+
i--;
|
884
|
+
b++;
|
885
|
+
}
|
886
|
+
if (!i)
|
887
|
+
{
|
888
|
+
mb->len = len1 - mb->spos;
|
889
|
+
mb++;
|
890
|
+
break;
|
891
|
+
}
|
892
|
+
mb->len = b->sbeg - mb->spos;
|
893
|
+
mb++;
|
894
|
+
}
|
895
|
+
}
|
896
|
+
assert((size_t)(mb - mblocks) == nmb);
|
897
|
+
|
898
|
+
*nmblocks = nmb;
|
899
|
+
return mblocks;
|
900
|
+
}
|
901
|
+
|
902
|
+
/**
|
903
|
+
* lev_editops_matching_blocks:
|
904
|
+
* @len1: The length of the source string.
|
905
|
+
* @len2: The length of the destination string.
|
906
|
+
* @n: The size of @ops.
|
907
|
+
* @ops: An array of elementary edit operations.
|
908
|
+
* @nmblocks: Where the number of matching block should be stored.
|
909
|
+
*
|
910
|
+
* Computes the matching block corresponding to an optimal edit @ops.
|
911
|
+
*
|
912
|
+
* Returns: The matching blocks as a newly allocated array, it length is
|
913
|
+
* stored in @nmblocks.
|
914
|
+
**/
|
915
|
+
LevMatchingBlock *
|
916
|
+
lev_editops_matching_blocks(size_t len1,
|
917
|
+
size_t len2,
|
918
|
+
size_t n,
|
919
|
+
const LevEditOp *ops,
|
920
|
+
size_t *nmblocks)
|
921
|
+
{
|
922
|
+
size_t nmb, i, spos, dpos;
|
923
|
+
LevEditType type;
|
924
|
+
const LevEditOp *o;
|
925
|
+
LevMatchingBlock *mblocks, *mb;
|
926
|
+
|
927
|
+
/* compute the number of matching blocks */
|
928
|
+
nmb = 0;
|
929
|
+
o = ops;
|
930
|
+
spos = dpos = 0;
|
931
|
+
type = LEV_EDIT_KEEP;
|
932
|
+
for (i = n; i;)
|
933
|
+
{
|
934
|
+
/* simply pretend there are no keep blocks */
|
935
|
+
while (o->type == LEV_EDIT_KEEP && --i)
|
936
|
+
o++;
|
937
|
+
if (!i)
|
938
|
+
break;
|
939
|
+
if (spos < o->spos || dpos < o->dpos)
|
940
|
+
{
|
941
|
+
nmb++;
|
942
|
+
spos = o->spos;
|
943
|
+
dpos = o->dpos;
|
944
|
+
}
|
945
|
+
type = o->type;
|
946
|
+
switch (type)
|
947
|
+
{
|
948
|
+
case LEV_EDIT_REPLACE:
|
949
|
+
do
|
950
|
+
{
|
951
|
+
spos++;
|
952
|
+
dpos++;
|
953
|
+
i--;
|
954
|
+
o++;
|
955
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
956
|
+
break;
|
957
|
+
|
958
|
+
case LEV_EDIT_DELETE:
|
959
|
+
do
|
960
|
+
{
|
961
|
+
spos++;
|
962
|
+
i--;
|
963
|
+
o++;
|
964
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
965
|
+
break;
|
966
|
+
|
967
|
+
case LEV_EDIT_INSERT:
|
968
|
+
do
|
969
|
+
{
|
970
|
+
dpos++;
|
971
|
+
i--;
|
972
|
+
o++;
|
973
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
974
|
+
break;
|
975
|
+
|
976
|
+
default:
|
977
|
+
break;
|
978
|
+
}
|
979
|
+
}
|
980
|
+
if (spos < len1 || dpos < len2)
|
981
|
+
nmb++;
|
982
|
+
|
983
|
+
/* fill the info */
|
984
|
+
mb = mblocks = (LevMatchingBlock *)malloc(nmb * sizeof(LevOpCode));
|
985
|
+
if (!mblocks)
|
986
|
+
{
|
987
|
+
*nmblocks = (size_t)(-1);
|
988
|
+
return NULL;
|
989
|
+
}
|
990
|
+
o = ops;
|
991
|
+
spos = dpos = 0;
|
992
|
+
type = LEV_EDIT_KEEP;
|
993
|
+
for (i = n; i;)
|
994
|
+
{
|
995
|
+
/* simply pretend there are no keep blocks */
|
996
|
+
while (o->type == LEV_EDIT_KEEP && --i)
|
997
|
+
o++;
|
998
|
+
if (!i)
|
999
|
+
break;
|
1000
|
+
if (spos < o->spos || dpos < o->dpos)
|
1001
|
+
{
|
1002
|
+
mb->spos = spos;
|
1003
|
+
mb->dpos = dpos;
|
1004
|
+
mb->len = o->spos - spos;
|
1005
|
+
spos = o->spos;
|
1006
|
+
dpos = o->dpos;
|
1007
|
+
mb++;
|
1008
|
+
}
|
1009
|
+
type = o->type;
|
1010
|
+
switch (type)
|
1011
|
+
{
|
1012
|
+
case LEV_EDIT_REPLACE:
|
1013
|
+
do
|
1014
|
+
{
|
1015
|
+
spos++;
|
1016
|
+
dpos++;
|
1017
|
+
i--;
|
1018
|
+
o++;
|
1019
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
1020
|
+
break;
|
1021
|
+
|
1022
|
+
case LEV_EDIT_DELETE:
|
1023
|
+
do
|
1024
|
+
{
|
1025
|
+
spos++;
|
1026
|
+
i--;
|
1027
|
+
o++;
|
1028
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
1029
|
+
break;
|
1030
|
+
|
1031
|
+
case LEV_EDIT_INSERT:
|
1032
|
+
do
|
1033
|
+
{
|
1034
|
+
dpos++;
|
1035
|
+
i--;
|
1036
|
+
o++;
|
1037
|
+
} while (i && o->type == type && spos == o->spos && dpos == o->dpos);
|
1038
|
+
break;
|
1039
|
+
|
1040
|
+
default:
|
1041
|
+
break;
|
1042
|
+
}
|
1043
|
+
}
|
1044
|
+
if (spos < len1 || dpos < len2)
|
1045
|
+
{
|
1046
|
+
assert(len1 - spos == len2 - dpos);
|
1047
|
+
mb->spos = spos;
|
1048
|
+
mb->dpos = dpos;
|
1049
|
+
mb->len = len1 - spos;
|
1050
|
+
mb++;
|
1051
|
+
}
|
1052
|
+
assert((size_t)(mb - mblocks) == nmb);
|
1053
|
+
|
1054
|
+
*nmblocks = nmb;
|
1055
|
+
return mblocks;
|
1056
|
+
}
|