mini-levenshtein 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1056 @@
1
+ /*
2
+ * This file has been altered to better fit fuzzywuzzy.
3
+ * To see all changes made, please diff this file with
4
+ * <https://github.com/Tmplt/python-Levenshtein/blob/master/Levenshtein.c>
5
+ *
6
+ * Summary:
7
+ * - stripped all python-related code and data types;
8
+ * - fixed some spelling errors.
9
+ */
10
+
11
+ /*
12
+ * Levenshtein.c
13
+ * @(#) $Id: Levenshtein.c,v 1.41 2005/01/13 20:05:36 yeti Exp $
14
+ * Python extension computing Levenshtein distances, string similarities,
15
+ * median strings and other goodies.
16
+ *
17
+ * Copyright (C) 2002-2003 David Necas (Yeti) <yeti@physics.muni.cz>.
18
+ *
19
+ * The Taus113 random generator:
20
+ * Copyright (C) 2002 Atakan Gurkan
21
+ * Copyright (C) 1996, 1997, 1998, 1999, 2000 James Theiler, Brian Gough
22
+ * (see below for more)
23
+ *
24
+ * This program is free software; you can redistribute it and/or modify it
25
+ * under the terms of the GNU General Public License as published by the Free
26
+ * Software Foundation; either version 2 of the License, or (at your option)
27
+ * any later version.
28
+ *
29
+ * This program is distributed in the hope that it will be useful, but WITHOUT
30
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
31
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
32
+ * more details.
33
+ *
34
+ * You should have received a copy of the GNU General Public License along
35
+ * with this program; if not, write to the Free Software Foundation, Inc.,
36
+ * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
37
+ **/
38
+
39
+ /**
40
+ * TODO:
41
+ *
42
+ * - Implement weighted string averaging, see:
43
+ * H. Bunke et. al.: On the Weighted Mean of a Pair of Strings,
44
+ * Pattern Analysis and Applications 2002, 5(1): 23-30.
45
+ * X. Jiang et. al.: Dynamic Computations of Generalized Median Strings,
46
+ * Pattern Analysis and Applications 2002, ???.
47
+ * The latter also contains an interesting median-search algorithm.
48
+ *
49
+ * - Deal with stray symbols in greedy median() and median_improve().
50
+ * There are two possibilities:
51
+ * (i) Remember which strings contain which symbols. This allows certain
52
+ * small optimizations when processing them.
53
+ * (ii) Use some overall heuristics to find symbols which aren't worth
54
+ * trying. This is very appealing, but hard to do properly
55
+ * (requires some inequality strong enough to allow practical exclusion
56
+ * of certain symbols -- at certain positions)
57
+ *
58
+ * - Editops should be an object that only *looks* like a list (which means
59
+ * it is a list in duck typing) to avoid never-ending conversions from
60
+ * Python lists to LevEditOp arrays and back
61
+ *
62
+ * - Optimize munkers_blackman(), it's pretty dumb (no memory of visited
63
+ * columns/rows)
64
+ *
65
+ * - Make it really usable as a C library (needs some wrappers, headers, ...,
66
+ * and maybe even documentation ;-)
67
+ *
68
+ * - Add interface to various interesting auxiliary results, namely
69
+ * set and sequence distance (only ratio is exported), the map from
70
+ * munkers_blackman() itself, ...
71
+ *
72
+ * - Generalizations:
73
+ * - character weight matrix/function
74
+ * - arbitrary edit operation costs, decomposable edit operations
75
+ *
76
+ * - Create a test suite
77
+ *
78
+ * - Add more interesting algorithms ;-)
79
+ *
80
+ * Postponed TODO (investigated, and a big `but' was found):
81
+ *
82
+ * - A linear approximate set median algorithm:
83
+ * P. Indyk: Sublinear time algorithms for metric space problems,
84
+ * STOC 1999, http://citeseer.nj.nec.com/indyk00sublinear.html.
85
+ * BUT: The algorithm seems to be advantageous only in the case of very
86
+ * large sets -- if my estimates are correct (the article itself is quite
87
+ * `asymptotic'), say 10^5 at least. On smaller sets either one would get
88
+ * only an extremely rough median estimate, or the number of distance
89
+ * computations would be in fact higher than in the dumb O(n^2) algorithm.
90
+ *
91
+ * - Improve setmedian() speed with triangular inequality, see:
92
+ * Juan, A., E. Vidal: An Algorithm for Fast Median Search,
93
+ * 1997, http://citeseer.nj.nec.com/article/juan97algorithm.html
94
+ * BUT: It doesn't seem to help much in spaces of high dimension (see the
95
+ * discussion and graphs in the article itself), a few percents at most,
96
+ * and strings behave like a space with a very high dimension (locally), so
97
+ * who knows, it probably wouldn't help much.
98
+ *
99
+ **/
100
+
101
+ #ifndef _GNU_SOURCE
102
+ #define _GNU_SOURCE
103
+ #endif
104
+
105
+ #include <string.h>
106
+ #include <math.h>
107
+ /* for debugging */
108
+ #include <stdio.h>
109
+
110
+ #include <assert.h>
111
+ #include "levenshtein.h"
112
+
113
+ /**
114
+ * lev_edit_distance:
115
+ * @len1: The length of @string1.
116
+ * @string1: A sequence of bytes of length @len1, may contain NUL characters.
117
+ * @len2: The length of @string2.
118
+ * @string2: A sequence of bytes of length @len2, may contain NUL characters.
119
+ * @xcost: If nonzero, the replace operation has weight 2, otherwise all
120
+ * edit operations have equal weights of 1.
121
+ *
122
+ * Computes Levenshtein edit distance of two strings.
123
+ *
124
+ * Returns: The edit distance.
125
+ **/
126
+ size_t
127
+ lev_edit_distance(size_t len1, const lev_byte *string1,
128
+ size_t len2, const lev_byte *string2,
129
+ int xcost)
130
+ {
131
+ size_t i;
132
+ size_t *row; /* we only need to keep one row of costs */
133
+ size_t *end;
134
+ size_t half;
135
+
136
+ /* strip common prefix */
137
+ while (len1 > 0 && len2 > 0 && *string1 == *string2)
138
+ {
139
+ len1--;
140
+ len2--;
141
+ string1++;
142
+ string2++;
143
+ }
144
+
145
+ /* strip common suffix */
146
+ while (len1 > 0 && len2 > 0 && string1[len1 - 1] == string2[len2 - 1])
147
+ {
148
+ len1--;
149
+ len2--;
150
+ }
151
+
152
+ /* catch trivial cases */
153
+ if (len1 == 0)
154
+ return len2;
155
+ if (len2 == 0)
156
+ return len1;
157
+
158
+ /* make the inner cycle (i.e. string2) the longer one */
159
+ if (len1 > len2)
160
+ {
161
+ size_t nx = len1;
162
+ const lev_byte *sx = string1;
163
+ len1 = len2;
164
+ len2 = nx;
165
+ string1 = string2;
166
+ string2 = sx;
167
+ }
168
+ /* check len1 == 1 separately */
169
+ if (len1 == 1)
170
+ {
171
+ if (xcost)
172
+ return len2 + 1 - 2 * (memchr(string2, *string1, len2) != NULL);
173
+ else
174
+ return len2 - (memchr(string2, *string1, len2) != NULL);
175
+ }
176
+ len1++;
177
+ len2++;
178
+ half = len1 >> 1;
179
+
180
+ /* initialize first row */
181
+ row = (size_t *)malloc(len2 * sizeof(size_t));
182
+ if (!row)
183
+ return (size_t)(-1);
184
+ end = row + len2 - 1;
185
+ for (i = 0; i < len2 - (xcost ? 0 : half); i++)
186
+ row[i] = i;
187
+
188
+ /* go through the matrix and compute the costs. yes, this is an extremely
189
+ * obfuscated version, but also extremely memory-conservative and relatively
190
+ * fast. */
191
+ if (xcost)
192
+ {
193
+ for (i = 1; i < len1; i++)
194
+ {
195
+ size_t *p = row + 1;
196
+ const lev_byte char1 = string1[i - 1];
197
+ const lev_byte *char2p = string2;
198
+ size_t D = i;
199
+ size_t x = i;
200
+ while (p <= end)
201
+ {
202
+ if (char1 == *(char2p++))
203
+ x = --D;
204
+ else
205
+ x++;
206
+ D = *p;
207
+ D++;
208
+ if (x > D)
209
+ x = D;
210
+ *(p++) = x;
211
+ }
212
+ }
213
+ }
214
+ else
215
+ {
216
+ /* in this case we don't have to scan two corner triangles (of size len1/2)
217
+ * in the matrix because no best path can go thought them. note this
218
+ * breaks when len1 == len2 == 2 so the memchr() special case above is
219
+ * necessary */
220
+ row[0] = len1 - half - 1;
221
+ for (i = 1; i < len1; i++)
222
+ {
223
+ size_t *p;
224
+ const lev_byte char1 = string1[i - 1];
225
+ const lev_byte *char2p;
226
+ size_t D, x;
227
+ /* skip the upper triangle */
228
+ if (i >= len1 - half)
229
+ {
230
+ size_t offset = i - (len1 - half);
231
+ size_t c3;
232
+
233
+ char2p = string2 + offset;
234
+ p = row + offset;
235
+ c3 = *(p++) + (char1 != *(char2p++));
236
+ x = *p;
237
+ x++;
238
+ D = x;
239
+ if (x > c3)
240
+ x = c3;
241
+ *(p++) = x;
242
+ }
243
+ else
244
+ {
245
+ p = row + 1;
246
+ char2p = string2;
247
+ D = x = i;
248
+ }
249
+ /* skip the lower triangle */
250
+ if (i <= half + 1)
251
+ end = row + len2 + i - half - 2;
252
+ /* main */
253
+ while (p <= end)
254
+ {
255
+ size_t c3 = --D + (char1 != *(char2p++));
256
+ x++;
257
+ if (x > c3)
258
+ x = c3;
259
+ D = *p;
260
+ D++;
261
+ if (x > D)
262
+ x = D;
263
+ *(p++) = x;
264
+ }
265
+ /* lower triangle sentinel */
266
+ if (i <= half)
267
+ {
268
+ size_t c3 = --D + (char1 != *char2p);
269
+ x++;
270
+ if (x > c3)
271
+ x = c3;
272
+ *p = x;
273
+ }
274
+ }
275
+ }
276
+
277
+ i = *end;
278
+ free(row);
279
+ return i;
280
+ }
281
+
282
+ /**
283
+ * editops_from_cost_matrix:
284
+ * @len1: The length of @string1.
285
+ * @string1: A string of length @len1, may contain NUL characters.
286
+ * @o1: The offset where the matrix starts from the start of @string1.
287
+ * @len2: The length of @string2.
288
+ * @string2: A string of length @len2, may contain NUL characters.
289
+ * @o2: The offset where the matrix starts from the start of @string2.
290
+ * @matrix: The cost matrix.
291
+ * @n: Where the number of edit operations should be stored.
292
+ *
293
+ * Reconstructs the optimal edit sequence from the cost matrix @matrix.
294
+ *
295
+ * The matrix is freed.
296
+ *
297
+ * Returns: The optimal edit sequence, as a newly allocated array of
298
+ * elementary edit operations, it length is stored in @n.
299
+ **/
300
+ static LevEditOp *
301
+ editops_from_cost_matrix(size_t len1, const lev_byte *string1, size_t off1,
302
+ size_t len2, const lev_byte *string2, size_t off2,
303
+ size_t *matrix, size_t *n)
304
+ {
305
+ size_t *p;
306
+ size_t i, j, pos;
307
+ LevEditOp *ops;
308
+ int dir = 0;
309
+
310
+ pos = *n = matrix[len1 * len2 - 1];
311
+ if (!*n)
312
+ {
313
+ free(matrix);
314
+ return NULL;
315
+ }
316
+ ops = (LevEditOp *)malloc((*n) * sizeof(LevEditOp));
317
+ if (!ops)
318
+ {
319
+ free(matrix);
320
+ *n = (size_t)(-1);
321
+ return NULL;
322
+ }
323
+ i = len1 - 1;
324
+ j = len2 - 1;
325
+ p = matrix + len1 * len2 - 1;
326
+ while (i || j)
327
+ {
328
+ /* prefer contiuning in the same direction */
329
+ if (dir < 0 && j && *p == *(p - 1) + 1)
330
+ {
331
+ pos--;
332
+ ops[pos].type = LEV_EDIT_INSERT;
333
+ ops[pos].spos = i + off1;
334
+ ops[pos].dpos = --j + off2;
335
+ p--;
336
+ continue;
337
+ }
338
+ if (dir > 0 && i && *p == *(p - len2) + 1)
339
+ {
340
+ pos--;
341
+ ops[pos].type = LEV_EDIT_DELETE;
342
+ ops[pos].spos = --i + off1;
343
+ ops[pos].dpos = j + off2;
344
+ p -= len2;
345
+ continue;
346
+ }
347
+ if (i && j && *p == *(p - len2 - 1) && string1[i - 1] == string2[j - 1])
348
+ {
349
+ /* don't be stupid like difflib, don't store LEV_EDIT_KEEP */
350
+ i--;
351
+ j--;
352
+ p -= len2 + 1;
353
+ dir = 0;
354
+ continue;
355
+ }
356
+ if (i && j && *p == *(p - len2 - 1) + 1)
357
+ {
358
+ pos--;
359
+ ops[pos].type = LEV_EDIT_REPLACE;
360
+ ops[pos].spos = --i + off1;
361
+ ops[pos].dpos = --j + off2;
362
+ p -= len2 + 1;
363
+ dir = 0;
364
+ continue;
365
+ }
366
+ /* we cant't turn directly from -1 to 1, in this case it would be better
367
+ * to go diagonally, but check it (dir == 0) */
368
+ if (dir == 0 && j && *p == *(p - 1) + 1)
369
+ {
370
+ pos--;
371
+ ops[pos].type = LEV_EDIT_INSERT;
372
+ ops[pos].spos = i + off1;
373
+ ops[pos].dpos = --j + off2;
374
+ p--;
375
+ dir = -1;
376
+ continue;
377
+ }
378
+ if (dir == 0 && i && *p == *(p - len2) + 1)
379
+ {
380
+ pos--;
381
+ ops[pos].type = LEV_EDIT_DELETE;
382
+ ops[pos].spos = --i + off1;
383
+ ops[pos].dpos = j + off2;
384
+ p -= len2;
385
+ dir = 1;
386
+ continue;
387
+ }
388
+ /* coredump right now, later might be too late ;-) */
389
+ assert("lost in the cost matrix" == NULL);
390
+ }
391
+ free(matrix);
392
+
393
+ return ops;
394
+ }
395
+
396
+ /**
397
+ * lev_editops_find:
398
+ * @len1: The length of @string1.
399
+ * @string1: A string of length @len1, may contain NUL characters.
400
+ * @len2: The length of @string2.
401
+ * @string2: A string of length @len2, may contain NUL characters.
402
+ * @n: Where the number of edit operations should be stored.
403
+ *
404
+ * Find an optimal edit sequence from @string1 to @string2.
405
+ *
406
+ * When there's more than one optimal sequence, a one is arbitrarily (though
407
+ * deterministically) chosen.
408
+ *
409
+ * Returns: The optimal edit sequence, as a newly allocated array of
410
+ * elementary edit operations, it length is stored in @n.
411
+ * It is normalized, i.e., keep operations are not included.
412
+ **/
413
+ LevEditOp *
414
+ lev_editops_find(size_t len1, const lev_byte *string1,
415
+ size_t len2, const lev_byte *string2,
416
+ size_t *n)
417
+ {
418
+ size_t len1o, len2o;
419
+ size_t i;
420
+ size_t *matrix; /* cost matrix */
421
+
422
+ /* strip common prefix */
423
+ len1o = 0;
424
+ while (len1 > 0 && len2 > 0 && *string1 == *string2)
425
+ {
426
+ len1--;
427
+ len2--;
428
+ string1++;
429
+ string2++;
430
+ len1o++;
431
+ }
432
+ len2o = len1o;
433
+
434
+ /* strip common suffix */
435
+ while (len1 > 0 && len2 > 0 && string1[len1 - 1] == string2[len2 - 1])
436
+ {
437
+ len1--;
438
+ len2--;
439
+ }
440
+ len1++;
441
+ len2++;
442
+
443
+ /* initalize first row and column */
444
+ matrix = (size_t *)malloc(len1 * len2 * sizeof(size_t));
445
+ if (!matrix)
446
+ {
447
+ *n = (size_t)(-1);
448
+ return NULL;
449
+ }
450
+ for (i = 0; i < len2; i++)
451
+ matrix[i] = i;
452
+ for (i = 1; i < len1; i++)
453
+ matrix[len2 * i] = i;
454
+
455
+ /* find the costs and fill the matrix */
456
+ for (i = 1; i < len1; i++)
457
+ {
458
+ size_t *prev = matrix + (i - 1) * len2;
459
+ size_t *p = matrix + i * len2;
460
+ size_t *end = p + len2 - 1;
461
+ const lev_byte char1 = string1[i - 1];
462
+ const lev_byte *char2p = string2;
463
+ size_t x = i;
464
+ p++;
465
+ while (p <= end)
466
+ {
467
+ size_t c3 = *(prev++) + (char1 != *(char2p++));
468
+ x++;
469
+ if (x > c3)
470
+ x = c3;
471
+ c3 = *prev + 1;
472
+ if (x > c3)
473
+ x = c3;
474
+ *(p++) = x;
475
+ }
476
+ }
477
+
478
+ /* find the way back */
479
+ return editops_from_cost_matrix(len1, string1, len1o,
480
+ len2, string2, len2o,
481
+ matrix, n);
482
+ }
483
+
484
+ /**
485
+ * lev_u_edit_distance:
486
+ * @len1: The length of @string1.
487
+ * @string1: A sequence of Unicode characters of length @len1, may contain NUL
488
+ * characters.
489
+ * @len2: The length of @string2.
490
+ * @string2: A sequence of Unicode characters of length @len2, may contain NUL
491
+ * characters.
492
+ * @xcost: If nonzero, the replace operation has weight 2, otherwise all
493
+ * edit operations have equal weights of 1.
494
+ *
495
+ * Computes Levenshtein edit distance of two Unicode strings.
496
+ *
497
+ * Returns: The edit distance.
498
+ **/
499
+ size_t
500
+ lev_u_edit_distance(size_t len1, const lev_wchar *string1,
501
+ size_t len2, const lev_wchar *string2,
502
+ int xcost)
503
+ {
504
+ size_t i;
505
+ size_t *row; /* we only need to keep one row of costs */
506
+ size_t *end;
507
+ size_t half;
508
+
509
+ /* strip common prefix */
510
+ while (len1 > 0 && len2 > 0 && *string1 == *string2)
511
+ {
512
+ len1--;
513
+ len2--;
514
+ string1++;
515
+ string2++;
516
+ }
517
+
518
+ /* strip common suffix */
519
+ while (len1 > 0 && len2 > 0 && string1[len1 - 1] == string2[len2 - 1])
520
+ {
521
+ len1--;
522
+ len2--;
523
+ }
524
+
525
+ /* catch trivial cases */
526
+ if (len1 == 0)
527
+ return len2;
528
+ if (len2 == 0)
529
+ return len1;
530
+
531
+ /* make the inner cycle (i.e. string2) the longer one */
532
+ if (len1 > len2)
533
+ {
534
+ size_t nx = len1;
535
+ const lev_wchar *sx = string1;
536
+ len1 = len2;
537
+ len2 = nx;
538
+ string1 = string2;
539
+ string2 = sx;
540
+ }
541
+ /* check len1 == 1 separately */
542
+ if (len1 == 1)
543
+ {
544
+ lev_wchar z = *string1;
545
+ const lev_wchar *p = string2;
546
+ for (i = len2; i; i--)
547
+ {
548
+ if (*(p++) == z)
549
+ return len2 - 1;
550
+ }
551
+ return len2 + (xcost != 0);
552
+ }
553
+ len1++;
554
+ len2++;
555
+ half = len1 >> 1;
556
+
557
+ /* initalize first row */
558
+ row = (size_t *)malloc(len2 * sizeof(size_t));
559
+ if (!row)
560
+ return (size_t)(-1);
561
+ end = row + len2 - 1;
562
+ for (i = 0; i < len2 - (xcost ? 0 : half); i++)
563
+ row[i] = i;
564
+
565
+ /* go through the matrix and compute the costs. yes, this is an extremely
566
+ * obfuscated version, but also extremely memory-conservative and relatively
567
+ * fast. */
568
+ if (xcost)
569
+ {
570
+ for (i = 1; i < len1; i++)
571
+ {
572
+ size_t *p = row + 1;
573
+ const lev_wchar char1 = string1[i - 1];
574
+ const lev_wchar *char2p = string2;
575
+ size_t D = i - 1;
576
+ size_t x = i;
577
+ while (p <= end)
578
+ {
579
+ if (char1 == *(char2p++))
580
+ x = D;
581
+ else
582
+ x++;
583
+ D = *p;
584
+ if (x > D + 1)
585
+ x = D + 1;
586
+ *(p++) = x;
587
+ }
588
+ }
589
+ }
590
+ else
591
+ {
592
+ /* in this case we don't have to scan two corner triangles (of size len1/2)
593
+ * in the matrix because no best path can go throught them. note this
594
+ * breaks when len1 == len2 == 2 so the memchr() special case above is
595
+ * necessary */
596
+ row[0] = len1 - half - 1;
597
+ for (i = 1; i < len1; i++)
598
+ {
599
+ size_t *p;
600
+ const lev_wchar char1 = string1[i - 1];
601
+ const lev_wchar *char2p;
602
+ size_t D, x;
603
+ /* skip the upper triangle */
604
+ if (i >= len1 - half)
605
+ {
606
+ size_t offset = i - (len1 - half);
607
+ size_t c3;
608
+
609
+ char2p = string2 + offset;
610
+ p = row + offset;
611
+ c3 = *(p++) + (char1 != *(char2p++));
612
+ x = *p;
613
+ x++;
614
+ D = x;
615
+ if (x > c3)
616
+ x = c3;
617
+ *(p++) = x;
618
+ }
619
+ else
620
+ {
621
+ p = row + 1;
622
+ char2p = string2;
623
+ D = x = i;
624
+ }
625
+ /* skip the lower triangle */
626
+ if (i <= half + 1)
627
+ end = row + len2 + i - half - 2;
628
+ /* main */
629
+ while (p <= end)
630
+ {
631
+ size_t c3 = --D + (char1 != *(char2p++));
632
+ x++;
633
+ if (x > c3)
634
+ x = c3;
635
+ D = *p;
636
+ D++;
637
+ if (x > D)
638
+ x = D;
639
+ *(p++) = x;
640
+ }
641
+ /* lower triangle sentinel */
642
+ if (i <= half)
643
+ {
644
+ size_t c3 = --D + (char1 != *char2p);
645
+ x++;
646
+ if (x > c3)
647
+ x = c3;
648
+ *p = x;
649
+ }
650
+ }
651
+ }
652
+
653
+ i = *end;
654
+ free(row);
655
+ return i;
656
+ }
657
+
658
+ /**
659
+ * lev_editops_to_opcodes:
660
+ * @n: The size of @ops.
661
+ * @ops: An array of elementary edit operations.
662
+ * @nb: Where the number of difflib block operation codes should be stored.
663
+ * @len1: The length of the source string.
664
+ * @len2: The length of the destination string.
665
+ *
666
+ * Converts elementary edit operations to difflib block operation codes.
667
+ *
668
+ * Note the string lengths are necessary since difflib doesn't allow omitting
669
+ * keep operations.
670
+ *
671
+ * Returns: The converted block operation codes, as a newly allocated array;
672
+ * its length is stored in @nb.
673
+ **/
674
+ LevOpCode *
675
+ lev_editops_to_opcodes(size_t n, const LevEditOp *ops, size_t *nb,
676
+ size_t len1, size_t len2)
677
+ {
678
+ size_t nbl, i, spos, dpos;
679
+ const LevEditOp *o;
680
+ LevOpCode *bops, *b;
681
+ LevEditType type;
682
+
683
+ /* compute the number of blocks */
684
+ nbl = 0;
685
+ o = ops;
686
+ spos = dpos = 0;
687
+ type = LEV_EDIT_KEEP;
688
+ for (i = n; i;)
689
+ {
690
+ /* simply pretend there are no keep blocks */
691
+ while (o->type == LEV_EDIT_KEEP && --i)
692
+ o++;
693
+ if (!i)
694
+ break;
695
+ if (spos < o->spos || dpos < o->dpos)
696
+ {
697
+ nbl++;
698
+ spos = o->spos;
699
+ dpos = o->dpos;
700
+ }
701
+ nbl++;
702
+ type = o->type;
703
+ switch (type)
704
+ {
705
+ case LEV_EDIT_REPLACE:
706
+ do
707
+ {
708
+ spos++;
709
+ dpos++;
710
+ i--;
711
+ o++;
712
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
713
+ break;
714
+
715
+ case LEV_EDIT_DELETE:
716
+ do
717
+ {
718
+ spos++;
719
+ i--;
720
+ o++;
721
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
722
+ break;
723
+
724
+ case LEV_EDIT_INSERT:
725
+ do
726
+ {
727
+ dpos++;
728
+ i--;
729
+ o++;
730
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
731
+ break;
732
+
733
+ default:
734
+ break;
735
+ }
736
+ }
737
+ if (spos < len1 || dpos < len2)
738
+ nbl++;
739
+
740
+ /* convert */
741
+ b = bops = (LevOpCode *)malloc(nbl * sizeof(LevOpCode));
742
+ if (!bops)
743
+ {
744
+ *nb = (size_t)(-1);
745
+ return NULL;
746
+ }
747
+ o = ops;
748
+ spos = dpos = 0;
749
+ type = LEV_EDIT_KEEP;
750
+ for (i = n; i;)
751
+ {
752
+ /* simply pretend there are no keep blocks */
753
+ while (o->type == LEV_EDIT_KEEP && --i)
754
+ o++;
755
+ if (!i)
756
+ break;
757
+ b->sbeg = spos;
758
+ b->dbeg = dpos;
759
+ if (spos < o->spos || dpos < o->dpos)
760
+ {
761
+ b->type = LEV_EDIT_KEEP;
762
+ spos = b->send = o->spos;
763
+ dpos = b->dend = o->dpos;
764
+ b++;
765
+ b->sbeg = spos;
766
+ b->dbeg = dpos;
767
+ }
768
+ type = o->type;
769
+ switch (type)
770
+ {
771
+ case LEV_EDIT_REPLACE:
772
+ do
773
+ {
774
+ spos++;
775
+ dpos++;
776
+ i--;
777
+ o++;
778
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
779
+ break;
780
+
781
+ case LEV_EDIT_DELETE:
782
+ do
783
+ {
784
+ spos++;
785
+ i--;
786
+ o++;
787
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
788
+ break;
789
+
790
+ case LEV_EDIT_INSERT:
791
+ do
792
+ {
793
+ dpos++;
794
+ i--;
795
+ o++;
796
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
797
+ break;
798
+
799
+ default:
800
+ break;
801
+ }
802
+ b->type = type;
803
+ b->send = spos;
804
+ b->dend = dpos;
805
+ b++;
806
+ }
807
+ if (spos < len1 || dpos < len2)
808
+ {
809
+ assert(len1 - spos == len2 - dpos);
810
+ b->type = LEV_EDIT_KEEP;
811
+ b->sbeg = spos;
812
+ b->dbeg = dpos;
813
+ b->send = len1;
814
+ b->dend = len2;
815
+ b++;
816
+ }
817
+ assert((size_t)(b - bops) == nbl);
818
+
819
+ *nb = nbl;
820
+ return bops;
821
+ }
822
+
823
+ /**
824
+ * lev_opcodes_matching_blocks:
825
+ * @len1: The length of the source string.
826
+ * @len2: The length of the destination string.
827
+ * @nb: The size of @bops.
828
+ * @bops: An array of difflib block edit operation codes.
829
+ * @nmblocks: Where the number of matching block should be stored.
830
+ *
831
+ * Computes the matching block corresponding to an optimal edit @bops.
832
+ *
833
+ * Returns: The matching blocks as a newly allocated array, it length is
834
+ * stored in @nmblocks.
835
+ **/
836
+ LevMatchingBlock *
837
+ lev_opcodes_matching_blocks(size_t len1,
838
+ __attribute__((unused)) size_t len2,
839
+ size_t nb,
840
+ const LevOpCode *bops,
841
+ size_t *nmblocks)
842
+ {
843
+ size_t nmb, i;
844
+ const LevOpCode *b;
845
+ LevMatchingBlock *mblocks, *mb;
846
+
847
+ /* compute the number of matching blocks */
848
+ nmb = 0;
849
+ b = bops;
850
+ for (i = nb; i; i--, b++)
851
+ {
852
+ if (b->type == LEV_EDIT_KEEP)
853
+ {
854
+ nmb++;
855
+ /* adjacent KEEP blocks -- we never produce it, but... */
856
+ while (i && b->type == LEV_EDIT_KEEP)
857
+ {
858
+ i--;
859
+ b++;
860
+ }
861
+ if (!i)
862
+ break;
863
+ }
864
+ }
865
+
866
+ /* convert */
867
+ mb = mblocks = (LevMatchingBlock *)malloc(nmb * sizeof(LevOpCode));
868
+ if (!mblocks)
869
+ {
870
+ *nmblocks = (size_t)(-1);
871
+ return NULL;
872
+ }
873
+ b = bops;
874
+ for (i = nb; i; i--, b++)
875
+ {
876
+ if (b->type == LEV_EDIT_KEEP)
877
+ {
878
+ mb->spos = b->sbeg;
879
+ mb->dpos = b->dbeg;
880
+ /* adjacent KEEP blocks -- we never produce it, but... */
881
+ while (i && b->type == LEV_EDIT_KEEP)
882
+ {
883
+ i--;
884
+ b++;
885
+ }
886
+ if (!i)
887
+ {
888
+ mb->len = len1 - mb->spos;
889
+ mb++;
890
+ break;
891
+ }
892
+ mb->len = b->sbeg - mb->spos;
893
+ mb++;
894
+ }
895
+ }
896
+ assert((size_t)(mb - mblocks) == nmb);
897
+
898
+ *nmblocks = nmb;
899
+ return mblocks;
900
+ }
901
+
902
+ /**
903
+ * lev_editops_matching_blocks:
904
+ * @len1: The length of the source string.
905
+ * @len2: The length of the destination string.
906
+ * @n: The size of @ops.
907
+ * @ops: An array of elementary edit operations.
908
+ * @nmblocks: Where the number of matching block should be stored.
909
+ *
910
+ * Computes the matching block corresponding to an optimal edit @ops.
911
+ *
912
+ * Returns: The matching blocks as a newly allocated array, it length is
913
+ * stored in @nmblocks.
914
+ **/
915
+ LevMatchingBlock *
916
+ lev_editops_matching_blocks(size_t len1,
917
+ size_t len2,
918
+ size_t n,
919
+ const LevEditOp *ops,
920
+ size_t *nmblocks)
921
+ {
922
+ size_t nmb, i, spos, dpos;
923
+ LevEditType type;
924
+ const LevEditOp *o;
925
+ LevMatchingBlock *mblocks, *mb;
926
+
927
+ /* compute the number of matching blocks */
928
+ nmb = 0;
929
+ o = ops;
930
+ spos = dpos = 0;
931
+ type = LEV_EDIT_KEEP;
932
+ for (i = n; i;)
933
+ {
934
+ /* simply pretend there are no keep blocks */
935
+ while (o->type == LEV_EDIT_KEEP && --i)
936
+ o++;
937
+ if (!i)
938
+ break;
939
+ if (spos < o->spos || dpos < o->dpos)
940
+ {
941
+ nmb++;
942
+ spos = o->spos;
943
+ dpos = o->dpos;
944
+ }
945
+ type = o->type;
946
+ switch (type)
947
+ {
948
+ case LEV_EDIT_REPLACE:
949
+ do
950
+ {
951
+ spos++;
952
+ dpos++;
953
+ i--;
954
+ o++;
955
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
956
+ break;
957
+
958
+ case LEV_EDIT_DELETE:
959
+ do
960
+ {
961
+ spos++;
962
+ i--;
963
+ o++;
964
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
965
+ break;
966
+
967
+ case LEV_EDIT_INSERT:
968
+ do
969
+ {
970
+ dpos++;
971
+ i--;
972
+ o++;
973
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
974
+ break;
975
+
976
+ default:
977
+ break;
978
+ }
979
+ }
980
+ if (spos < len1 || dpos < len2)
981
+ nmb++;
982
+
983
+ /* fill the info */
984
+ mb = mblocks = (LevMatchingBlock *)malloc(nmb * sizeof(LevOpCode));
985
+ if (!mblocks)
986
+ {
987
+ *nmblocks = (size_t)(-1);
988
+ return NULL;
989
+ }
990
+ o = ops;
991
+ spos = dpos = 0;
992
+ type = LEV_EDIT_KEEP;
993
+ for (i = n; i;)
994
+ {
995
+ /* simply pretend there are no keep blocks */
996
+ while (o->type == LEV_EDIT_KEEP && --i)
997
+ o++;
998
+ if (!i)
999
+ break;
1000
+ if (spos < o->spos || dpos < o->dpos)
1001
+ {
1002
+ mb->spos = spos;
1003
+ mb->dpos = dpos;
1004
+ mb->len = o->spos - spos;
1005
+ spos = o->spos;
1006
+ dpos = o->dpos;
1007
+ mb++;
1008
+ }
1009
+ type = o->type;
1010
+ switch (type)
1011
+ {
1012
+ case LEV_EDIT_REPLACE:
1013
+ do
1014
+ {
1015
+ spos++;
1016
+ dpos++;
1017
+ i--;
1018
+ o++;
1019
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
1020
+ break;
1021
+
1022
+ case LEV_EDIT_DELETE:
1023
+ do
1024
+ {
1025
+ spos++;
1026
+ i--;
1027
+ o++;
1028
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
1029
+ break;
1030
+
1031
+ case LEV_EDIT_INSERT:
1032
+ do
1033
+ {
1034
+ dpos++;
1035
+ i--;
1036
+ o++;
1037
+ } while (i && o->type == type && spos == o->spos && dpos == o->dpos);
1038
+ break;
1039
+
1040
+ default:
1041
+ break;
1042
+ }
1043
+ }
1044
+ if (spos < len1 || dpos < len2)
1045
+ {
1046
+ assert(len1 - spos == len2 - dpos);
1047
+ mb->spos = spos;
1048
+ mb->dpos = dpos;
1049
+ mb->len = len1 - spos;
1050
+ mb++;
1051
+ }
1052
+ assert((size_t)(mb - mblocks) == nmb);
1053
+
1054
+ *nmblocks = nmb;
1055
+ return mblocks;
1056
+ }