amatch 0.3.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGES +7 -0
- data/COPYING +203 -340
- data/README.md +124 -0
- data/Rakefile +9 -14
- data/amatch.gemspec +0 -0
- data/bin/{agrep.rb → agrep} +23 -9
- data/bin/dupfind +153 -0
- data/ext/amatch_ext.c +313 -91
- data/ext/pair.c +3 -1
- data/images/amatch_ext.png +0 -0
- data/lib/amatch/version.rb +1 -1
- data/tests/test_damerau_levenshtein.rb +93 -0
- metadata +33 -37
- data/.gitignore +0 -6
- data/.travis.yml +0 -10
- data/README.rdoc +0 -128
- data/VERSION +0 -1
data/ext/amatch_ext.c
CHANGED
@@ -3,24 +3,8 @@
|
|
3
3
|
#include <ctype.h>
|
4
4
|
#include "common.h"
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
*
|
9
|
-
* call-seq: pattern -> pattern string
|
10
|
-
*
|
11
|
-
* Returns the current pattern string of this instance.
|
12
|
-
*/
|
13
|
-
|
14
|
-
/*
|
15
|
-
* Document-method: pattern=
|
16
|
-
*
|
17
|
-
* call-seq: pattern=(pattern)
|
18
|
-
*
|
19
|
-
* Sets the current pattern string of this instance to <code>pattern</code>.
|
20
|
-
*/
|
21
|
-
|
22
|
-
|
23
|
-
static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
6
|
+
static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein,
|
7
|
+
rb_cDamerauLevenshtein, rb_cSellers, rb_cHamming,
|
24
8
|
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
|
25
9
|
rb_cJaro, rb_cJaroWinkler;
|
26
10
|
|
@@ -230,9 +214,11 @@ DEF_ITERATE_STRINGS(JaroWinkler)
|
|
230
214
|
*/
|
231
215
|
|
232
216
|
#define COMPUTE_LEVENSHTEIN_DISTANCE \
|
233
|
-
|
217
|
+
c = 0; \
|
218
|
+
p = 0; \
|
219
|
+
for (i = 1; i <= a_len; i++) { \
|
234
220
|
c = i % 2; /* current row */ \
|
235
|
-
p = (i
|
221
|
+
p = (i - 1) % 2; /* previous row */ \
|
236
222
|
v[c][0] = i; /* first column */ \
|
237
223
|
for (j = 1; j <= b_len; j++) { \
|
238
224
|
/* Bellman's principle of optimality: */ \
|
@@ -245,8 +231,6 @@ DEF_ITERATE_STRINGS(JaroWinkler)
|
|
245
231
|
} \
|
246
232
|
v[c][j] = weight; \
|
247
233
|
} \
|
248
|
-
p = c; \
|
249
|
-
c = (c + 1) % 2; \
|
250
234
|
}
|
251
235
|
|
252
236
|
static VALUE Levenshtein_match(General *amatch, VALUE string)
|
@@ -269,7 +253,7 @@ static VALUE Levenshtein_match(General *amatch, VALUE string)
|
|
269
253
|
|
270
254
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
271
255
|
|
272
|
-
result = INT2FIX(v[
|
256
|
+
result = INT2FIX(v[c][b_len]);
|
273
257
|
|
274
258
|
xfree(v[0]);
|
275
259
|
xfree(v[1]);
|
@@ -287,6 +271,7 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
|
287
271
|
|
288
272
|
Check_Type(string, T_STRING);
|
289
273
|
DONT_OPTIMIZE
|
274
|
+
|
290
275
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
291
276
|
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
292
277
|
v[0] = ALLOC_N(int, b_len + 1);
|
@@ -299,12 +284,14 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
|
299
284
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
300
285
|
|
301
286
|
if (b_len > a_len) {
|
302
|
-
result = rb_float_new(1.0 - ((double) v[
|
287
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
|
303
288
|
} else {
|
304
|
-
result = rb_float_new(1.0 - ((double) v[
|
289
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
|
305
290
|
}
|
291
|
+
|
306
292
|
xfree(v[0]);
|
307
293
|
xfree(v[1]);
|
294
|
+
|
308
295
|
return result;
|
309
296
|
}
|
310
297
|
|
@@ -327,26 +314,159 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
|
|
327
314
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
328
315
|
|
329
316
|
for (i = 0, min = a_len; i <= b_len; i++) {
|
330
|
-
if (v[
|
317
|
+
if (v[c][i] < min) min = v[c][i];
|
331
318
|
}
|
332
319
|
|
333
320
|
result = INT2FIX(min);
|
334
321
|
|
335
322
|
xfree(v[0]);
|
336
323
|
xfree(v[1]);
|
337
|
-
|
324
|
+
|
325
|
+
return result;
|
326
|
+
}
|
327
|
+
|
328
|
+
/*
|
329
|
+
* DamerauLevenshtein edit distances are computed here:
|
330
|
+
*/
|
331
|
+
|
332
|
+
#define COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE \
|
333
|
+
c = 0; \
|
334
|
+
p = 0; \
|
335
|
+
pp = 0; \
|
336
|
+
for (i = 1; i <= a_len; i++) { \
|
337
|
+
c = i % 3; /* current row */ \
|
338
|
+
p = (i - 1) % 3; /* previous row */ \
|
339
|
+
pp = (i - 2) % 3; /* previous previous row */ \
|
340
|
+
v[c][0] = i; /* first column */ \
|
341
|
+
for (j = 1; j <= b_len; j++) { \
|
342
|
+
/* Bellman's principle of optimality: */ \
|
343
|
+
weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
344
|
+
if (weight > v[p][j] + 1) { \
|
345
|
+
weight = v[p][j] + 1; \
|
346
|
+
} \
|
347
|
+
if (weight > v[c][j - 1] + 1) { \
|
348
|
+
weight = v[c][j - 1] + 1; \
|
349
|
+
} \
|
350
|
+
if (i > 2 && j > 2 && a_ptr[i - 1] == b_ptr[j - 2] && a_ptr[i - 2] == b_ptr[j - 1]) {\
|
351
|
+
if (weight > v[pp][j - 2]) { \
|
352
|
+
weight = v[pp][j - 2] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
353
|
+
} \
|
354
|
+
} \
|
355
|
+
v[c][j] = weight; \
|
356
|
+
} \
|
357
|
+
}
|
358
|
+
|
359
|
+
static VALUE DamerauLevenshtein_match(General *amatch, VALUE string)
|
360
|
+
{
|
361
|
+
VALUE result;
|
362
|
+
char *a_ptr, *b_ptr;
|
363
|
+
int a_len, b_len;
|
364
|
+
int *v[3], weight;
|
365
|
+
int i, j, c, p, pp;
|
366
|
+
|
367
|
+
Check_Type(string, T_STRING);
|
368
|
+
DONT_OPTIMIZE
|
369
|
+
|
370
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
371
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
372
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
373
|
+
for (i = 0; i <= b_len; i++) {
|
374
|
+
v[0][i] = i;
|
375
|
+
v[1][i] = i;
|
376
|
+
v[2][i] = i;
|
377
|
+
}
|
378
|
+
|
379
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
380
|
+
|
381
|
+
result = INT2FIX(v[c][b_len]);
|
382
|
+
|
383
|
+
xfree(v[0]);
|
384
|
+
xfree(v[1]);
|
385
|
+
xfree(v[2]);
|
386
|
+
|
338
387
|
return result;
|
339
388
|
}
|
340
389
|
|
390
|
+
static VALUE DamerauLevenshtein_similar(General *amatch, VALUE string)
|
391
|
+
{
|
392
|
+
VALUE result;
|
393
|
+
char *a_ptr, *b_ptr;
|
394
|
+
int a_len, b_len;
|
395
|
+
int *v[3], weight;
|
396
|
+
int i, j, c, p, pp;
|
397
|
+
|
398
|
+
Check_Type(string, T_STRING);
|
399
|
+
DONT_OPTIMIZE
|
400
|
+
|
401
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
402
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
403
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
404
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
405
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
406
|
+
for (i = 0; i <= b_len; i++) {
|
407
|
+
v[0][i] = i;
|
408
|
+
v[1][i] = i;
|
409
|
+
v[2][i] = i;
|
410
|
+
}
|
411
|
+
|
412
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
413
|
+
|
414
|
+
if (b_len > a_len) {
|
415
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
|
416
|
+
} else {
|
417
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
|
418
|
+
}
|
419
|
+
|
420
|
+
xfree(v[0]);
|
421
|
+
xfree(v[1]);
|
422
|
+
xfree(v[2]);
|
423
|
+
|
424
|
+
return result;
|
425
|
+
}
|
426
|
+
|
427
|
+
static VALUE DamerauLevenshtein_search(General *amatch, VALUE string)
|
428
|
+
{
|
429
|
+
VALUE result;
|
430
|
+
char *a_ptr, *b_ptr;
|
431
|
+
int a_len, b_len;
|
432
|
+
int *v[3], weight, min;
|
433
|
+
int i, j, c, p, pp;
|
434
|
+
|
435
|
+
Check_Type(string, T_STRING);
|
436
|
+
DONT_OPTIMIZE
|
437
|
+
|
438
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
439
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
440
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
441
|
+
MEMZERO(v[0], int, b_len + 1);
|
442
|
+
MEMZERO(v[1], int, b_len + 1);
|
443
|
+
MEMZERO(v[2], int, b_len + 1);
|
444
|
+
|
445
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
446
|
+
|
447
|
+
for (i = 0, min = a_len; i <= b_len; i++) {
|
448
|
+
if (v[c][i] < min) min = v[c][i];
|
449
|
+
}
|
450
|
+
|
451
|
+
result = INT2FIX(min);
|
452
|
+
|
453
|
+
xfree(v[0]);
|
454
|
+
xfree(v[1]);
|
455
|
+
xfree(v[2]);
|
456
|
+
|
457
|
+
return result;
|
458
|
+
}
|
341
459
|
|
342
460
|
/*
|
343
461
|
* Sellers edit distances are computed here:
|
344
462
|
*/
|
345
463
|
|
346
464
|
#define COMPUTE_SELLERS_DISTANCE \
|
347
|
-
|
465
|
+
c = 0; \
|
466
|
+
p = 0; \
|
467
|
+
for (i = 1; i <= a_len; i++) { \
|
348
468
|
c = i % 2; /* current row */ \
|
349
|
-
p = (i
|
469
|
+
p = (i - 1) % 2; /* previous row */ \
|
350
470
|
v[c][0] = i * amatch->deletion; /* first column */ \
|
351
471
|
for (j = 1; j <= b_len; j++) { \
|
352
472
|
/* Bellman's principle of optimality: */ \
|
@@ -361,7 +481,6 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
|
|
361
481
|
v[c][j] = weight; \
|
362
482
|
} \
|
363
483
|
p = c; \
|
364
|
-
c = (c + 1) % 2; \
|
365
484
|
}
|
366
485
|
|
367
486
|
static VALUE Sellers_match(Sellers *amatch, VALUE string)
|
@@ -411,9 +530,10 @@ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
|
|
411
530
|
max_weight = amatch->deletion;
|
412
531
|
}
|
413
532
|
}
|
414
|
-
|
533
|
+
|
415
534
|
Check_Type(string, T_STRING);
|
416
535
|
DONT_OPTIMIZE
|
536
|
+
|
417
537
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
418
538
|
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
419
539
|
v[0] = ALLOC_N(double, b_len + 1);
|
@@ -459,7 +579,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
459
579
|
result = rb_float_new(min);
|
460
580
|
xfree(v[0]);
|
461
581
|
xfree(v[1]);
|
462
|
-
|
582
|
+
|
463
583
|
return result;
|
464
584
|
}
|
465
585
|
|
@@ -470,34 +590,32 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
470
590
|
static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
|
471
591
|
{
|
472
592
|
double result;
|
473
|
-
VALUE tokens;
|
474
|
-
PairArray *pair_array;
|
475
|
-
|
593
|
+
VALUE string_tokens, tokens;
|
594
|
+
PairArray *pattern_pair_array, *pair_array;
|
595
|
+
|
476
596
|
Check_Type(string, T_STRING);
|
477
597
|
if (!NIL_P(regexp) || use_regexp) {
|
478
598
|
tokens = rb_funcall(
|
479
599
|
rb_str_new(amatch->pattern, amatch->pattern_len),
|
480
600
|
id_split, 1, regexp
|
481
601
|
);
|
482
|
-
|
483
|
-
amatch->pattern_pair_array = PairArray_new(tokens);
|
484
|
-
} else {
|
485
|
-
pair_array_reactivate(amatch->pattern_pair_array);
|
486
|
-
}
|
487
|
-
tokens = rb_funcall(string, id_split, 1, regexp);
|
488
|
-
pair_array = PairArray_new(tokens);
|
602
|
+
string_tokens = rb_funcall(string, id_split, 1, regexp);
|
489
603
|
} else {
|
490
604
|
VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
|
491
605
|
tokens = rb_ary_new4(1, &tmp);
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
606
|
+
string_tokens = rb_ary_new4(1, &string);
|
607
|
+
}
|
608
|
+
|
609
|
+
if (!amatch->pattern_pair_array) {
|
610
|
+
pattern_pair_array = PairArray_new(tokens);
|
611
|
+
amatch->pattern_pair_array = pattern_pair_array;
|
612
|
+
} else {
|
613
|
+
pattern_pair_array = amatch->pattern_pair_array;
|
614
|
+
pair_array_reactivate(amatch->pattern_pair_array);
|
499
615
|
}
|
500
|
-
|
616
|
+
pair_array = PairArray_new(string_tokens);
|
617
|
+
|
618
|
+
result = pair_array_match(pattern_pair_array, pair_array);
|
501
619
|
pair_array_destroy(pair_array);
|
502
620
|
return rb_float_new(result);
|
503
621
|
}
|
@@ -520,7 +638,7 @@ static VALUE Hamming_match(General *amatch, VALUE string)
|
|
520
638
|
char *a_ptr, *b_ptr;
|
521
639
|
int a_len, b_len;
|
522
640
|
int i, result;
|
523
|
-
|
641
|
+
|
524
642
|
Check_Type(string, T_STRING);
|
525
643
|
OPTIMIZE_TIME
|
526
644
|
COMPUTE_HAMMING_DISTANCE
|
@@ -532,7 +650,7 @@ static VALUE Hamming_similar(General *amatch, VALUE string)
|
|
532
650
|
char *a_ptr, *b_ptr;
|
533
651
|
int a_len, b_len;
|
534
652
|
int i, result;
|
535
|
-
|
653
|
+
|
536
654
|
Check_Type(string, T_STRING);
|
537
655
|
OPTIMIZE_TIME
|
538
656
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
@@ -572,7 +690,7 @@ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
|
|
572
690
|
char *a_ptr, *b_ptr;
|
573
691
|
int a_len, b_len;
|
574
692
|
int result, c, p, i, j, *l[2];
|
575
|
-
|
693
|
+
|
576
694
|
Check_Type(string, T_STRING);
|
577
695
|
OPTIMIZE_TIME
|
578
696
|
|
@@ -586,7 +704,7 @@ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
|
|
586
704
|
char *a_ptr, *b_ptr;
|
587
705
|
int a_len, b_len;
|
588
706
|
int result, c, p, i, j, *l[2];
|
589
|
-
|
707
|
+
|
590
708
|
Check_Type(string, T_STRING);
|
591
709
|
OPTIMIZE_TIME
|
592
710
|
|
@@ -626,7 +744,7 @@ static VALUE LongestSubstring_match(General *amatch, VALUE string)
|
|
626
744
|
char *a_ptr, *b_ptr;
|
627
745
|
int a_len, b_len;
|
628
746
|
int result, c, p, i, j, *l[2];
|
629
|
-
|
747
|
+
|
630
748
|
Check_Type(string, T_STRING);
|
631
749
|
OPTIMIZE_TIME
|
632
750
|
if (a_len == 0 || b_len == 0) return INT2FIX(0);
|
@@ -639,7 +757,7 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
|
|
639
757
|
char *a_ptr, *b_ptr;
|
640
758
|
int a_len, b_len;
|
641
759
|
int result, c, p, i, j, *l[2];
|
642
|
-
|
760
|
+
|
643
761
|
Check_Type(string, T_STRING);
|
644
762
|
OPTIMIZE_TIME
|
645
763
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
@@ -771,7 +889,7 @@ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
|
|
771
889
|
* Ruby API
|
772
890
|
*/
|
773
891
|
|
774
|
-
/*
|
892
|
+
/*
|
775
893
|
* Document-class: Amatch::Levenshtein
|
776
894
|
*
|
777
895
|
* The Levenshtein edit distance is defined as the minimal costs involved to
|
@@ -804,7 +922,7 @@ DEF_CONSTRUCTOR(Levenshtein, General)
|
|
804
922
|
|
805
923
|
/*
|
806
924
|
* call-seq: match(strings) -> results
|
807
|
-
*
|
925
|
+
*
|
808
926
|
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
809
927
|
* against <code>strings</code>. It returns the number of operations, the Levenshtein
|
810
928
|
* distance. <code>strings</code> has to be either a String or an Array of
|
@@ -812,14 +930,14 @@ DEF_CONSTRUCTOR(Levenshtein, General)
|
|
812
930
|
* Floats respectively.
|
813
931
|
*/
|
814
932
|
static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
815
|
-
{
|
933
|
+
{
|
816
934
|
GET_STRUCT(General)
|
817
935
|
return General_iterate_strings(amatch, strings, Levenshtein_match);
|
818
936
|
}
|
819
937
|
|
820
938
|
/*
|
821
939
|
* call-seq: similar(strings) -> results
|
822
|
-
*
|
940
|
+
*
|
823
941
|
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
824
942
|
* against <code>strings</code>, and compute a Levenshtein distance metric
|
825
943
|
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
@@ -828,14 +946,14 @@ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
|
828
946
|
* respectively.
|
829
947
|
*/
|
830
948
|
static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
|
831
|
-
{
|
949
|
+
{
|
832
950
|
GET_STRUCT(General)
|
833
951
|
return General_iterate_strings(amatch, strings, Levenshtein_similar);
|
834
952
|
}
|
835
953
|
|
836
954
|
/*
|
837
955
|
* call-seq: levenshtein_similar(strings) -> results
|
838
|
-
*
|
956
|
+
*
|
839
957
|
* If called on a String, this string is used as an Amatch::Levenshtein#pattern
|
840
958
|
* to match against <code>strings</code>. It returns a Levenshtein distance
|
841
959
|
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
@@ -851,7 +969,7 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
|
851
969
|
|
852
970
|
/*
|
853
971
|
* call-seq: search(strings) -> results
|
854
|
-
*
|
972
|
+
*
|
855
973
|
* searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
|
856
974
|
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
857
975
|
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
@@ -859,12 +977,105 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
|
859
977
|
* <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
|
860
978
|
*/
|
861
979
|
static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
|
862
|
-
{
|
980
|
+
{
|
863
981
|
GET_STRUCT(General)
|
864
982
|
return General_iterate_strings(amatch, strings, Levenshtein_search);
|
865
983
|
}
|
866
984
|
|
867
|
-
/*
|
985
|
+
/*
|
986
|
+
* Document-class: Amatch::DamerauLevenshtein
|
987
|
+
* XXX
|
988
|
+
* The DamerauLevenshtein edit distance is defined as the minimal costs
|
989
|
+
* involved to transform one string into another by using four elementary
|
990
|
+
* operations: deletion, insertion and substitution of a character, and transposition of two adjacent characters. To
|
991
|
+
* transform "water" into "wine", for instance, you have to substitute "a" ->
|
992
|
+
* "i": "witer", "t" -> "n": "winer" and delete "r": "wine". The edit distance
|
993
|
+
* between "water" and "wine" is 3, because you have to apply three
|
994
|
+
* operations. The edit distance between "wine" and "wine" is 0 of course: no
|
995
|
+
* operation is necessary for the transformation -- they're already the same
|
996
|
+
* string. It's easy to see that more similar strings have smaller edit
|
997
|
+
* distances than strings that differ a lot.
|
998
|
+
*/
|
999
|
+
|
1000
|
+
DEF_RB_FREE(DamerauLevenshtein, General)
|
1001
|
+
|
1002
|
+
/*
|
1003
|
+
* call-seq: new(pattern)
|
1004
|
+
* XXX
|
1005
|
+
* Creates a new Amatch::DamerauLevenshtein instance from <code>pattern</code>.
|
1006
|
+
*/
|
1007
|
+
static VALUE rb_DamerauLevenshtein_initialize(VALUE self, VALUE pattern)
|
1008
|
+
{
|
1009
|
+
GET_STRUCT(General)
|
1010
|
+
General_pattern_set(amatch, pattern);
|
1011
|
+
return self;
|
1012
|
+
}
|
1013
|
+
|
1014
|
+
DEF_CONSTRUCTOR(DamerauLevenshtein, General)
|
1015
|
+
|
1016
|
+
/*
|
1017
|
+
* call-seq: match(strings) -> results
|
1018
|
+
* XXX
|
1019
|
+
* Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
|
1020
|
+
* against <code>strings</code>. It returns the number of operations, the DamerauLevenshtein
|
1021
|
+
* distance. <code>strings</code> has to be either a String or an Array of
|
1022
|
+
* Strings. The returned <code>results</code> is either a Fixnum or an Array of
|
1023
|
+
* Fixnums respectively.
|
1024
|
+
*/
|
1025
|
+
static VALUE rb_DamerauLevenshtein_match(VALUE self, VALUE strings)
|
1026
|
+
{
|
1027
|
+
GET_STRUCT(General)
|
1028
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_match);
|
1029
|
+
}
|
1030
|
+
|
1031
|
+
/*
|
1032
|
+
* call-seq: similar(strings) -> results
|
1033
|
+
* XXX
|
1034
|
+
* Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
|
1035
|
+
* against <code>strings</code>, and compute a DamerauLevenshtein distance metric
|
1036
|
+
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1037
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
1038
|
+
* returned <code>results</code> is either a Float or an Array of Floats
|
1039
|
+
* respectively.
|
1040
|
+
*/
|
1041
|
+
static VALUE rb_DamerauLevenshtein_similar(VALUE self, VALUE strings)
|
1042
|
+
{
|
1043
|
+
GET_STRUCT(General)
|
1044
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_similar);
|
1045
|
+
}
|
1046
|
+
|
1047
|
+
/*
|
1048
|
+
* call-seq: damerau_levenshtein_similar(strings) -> results
|
1049
|
+
* XXX
|
1050
|
+
* If called on a String, this string is used as an Amatch::DamerauLevenshtein#pattern
|
1051
|
+
* to match against <code>strings</code>. It returns a DamerauLevenshtein distance
|
1052
|
+
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
1053
|
+
* match. <code>strings</code> has to be either a String or an Array of
|
1054
|
+
* Strings. The returned <code>results</code> is either a Float or an Array of
|
1055
|
+
* Floats respectively.
|
1056
|
+
*/
|
1057
|
+
static VALUE rb_str_damerau_levenshtein_similar(VALUE self, VALUE strings)
|
1058
|
+
{
|
1059
|
+
VALUE amatch = rb_DamerauLevenshtein_new(rb_cDamerauLevenshtein, self);
|
1060
|
+
return rb_DamerauLevenshtein_similar(amatch, strings);
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
/*
|
1064
|
+
* call-seq: search(strings) -> results
|
1065
|
+
* XXX
|
1066
|
+
* searches Amatch::DamerauLevenshtein#pattern in <code>strings</code> and returns the
|
1067
|
+
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
1068
|
+
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
1069
|
+
* to be either a String or an Array of Strings. The returned
|
1070
|
+
* <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
|
1071
|
+
*/
|
1072
|
+
static VALUE rb_DamerauLevenshtein_search(VALUE self, VALUE strings)
|
1073
|
+
{
|
1074
|
+
GET_STRUCT(General)
|
1075
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_search);
|
1076
|
+
}
|
1077
|
+
|
1078
|
+
/*
|
868
1079
|
* Document-class: Amatch::Sellers
|
869
1080
|
*
|
870
1081
|
* The Sellers edit distance is very similar to the Levenshtein edit distance.
|
@@ -983,14 +1194,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
|
|
983
1194
|
* Document-method: pattern=
|
984
1195
|
*
|
985
1196
|
* call-seq: pattern=(pattern)
|
986
|
-
*
|
1197
|
+
*
|
987
1198
|
* Sets the current pattern string of this Amatch::Sellers instance to
|
988
1199
|
* <code>pattern</code>.
|
989
1200
|
*/
|
990
1201
|
|
991
1202
|
/*
|
992
1203
|
* call-seq: match(strings) -> results
|
993
|
-
*
|
1204
|
+
*
|
994
1205
|
* Uses this Amatch::Sellers instance to match Sellers#pattern against
|
995
1206
|
* <code>strings</code>, while taking into account the given weights. It
|
996
1207
|
* returns the number of weighted character operations, the Sellers distance.
|
@@ -999,14 +1210,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
|
|
999
1210
|
* respectively.
|
1000
1211
|
*/
|
1001
1212
|
static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
1002
|
-
{
|
1213
|
+
{
|
1003
1214
|
GET_STRUCT(Sellers)
|
1004
1215
|
return Sellers_iterate_strings(amatch, strings, Sellers_match);
|
1005
1216
|
}
|
1006
1217
|
|
1007
1218
|
/*
|
1008
1219
|
* call-seq: similar(strings) -> results
|
1009
|
-
*
|
1220
|
+
*
|
1010
1221
|
* Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
|
1011
1222
|
* against <code>strings</code> (taking into account the given weights), and
|
1012
1223
|
* compute a Sellers distance metric number between 0.0 for very unsimilar
|
@@ -1016,7 +1227,7 @@ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
|
1016
1227
|
* respectively.
|
1017
1228
|
*/
|
1018
1229
|
static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
1019
|
-
{
|
1230
|
+
{
|
1020
1231
|
GET_STRUCT(Sellers)
|
1021
1232
|
return Sellers_iterate_strings(amatch, strings, Sellers_similar);
|
1022
1233
|
}
|
@@ -1031,12 +1242,12 @@ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
|
1031
1242
|
* <code>results</code> is either a Float or an Array of Floats respectively.
|
1032
1243
|
*/
|
1033
1244
|
static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
1034
|
-
{
|
1245
|
+
{
|
1035
1246
|
GET_STRUCT(Sellers)
|
1036
1247
|
return Sellers_iterate_strings(amatch, strings, Sellers_search);
|
1037
1248
|
}
|
1038
1249
|
|
1039
|
-
/*
|
1250
|
+
/*
|
1040
1251
|
* Document-class: Amatch::PairDistance
|
1041
1252
|
*
|
1042
1253
|
* The pair distance between two strings is based on the number of adjacent
|
@@ -1047,7 +1258,7 @@ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
|
1047
1258
|
* are more dissimilar. The advantage of considering adjacent characters, is to
|
1048
1259
|
* take account not only of the characters, but also of the character ordering
|
1049
1260
|
* in the original strings.
|
1050
|
-
*
|
1261
|
+
*
|
1051
1262
|
* This metric is very capable of finding similarities in natural languages.
|
1052
1263
|
* It is explained in more detail in Simon White's article "How to Strike a
|
1053
1264
|
* Match", located at this url:
|
@@ -1074,7 +1285,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
|
1074
1285
|
|
1075
1286
|
/*
|
1076
1287
|
* call-seq: match(strings, regexp = /\s+/) -> results
|
1077
|
-
*
|
1288
|
+
*
|
1078
1289
|
* Uses this Amatch::PairDistance instance to match PairDistance#pattern against
|
1079
1290
|
* <code>strings</code>. It returns the pair distance measure, that is a
|
1080
1291
|
* returned value of 1.0 is an exact match, partial matches are lower
|
@@ -1090,7 +1301,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
|
1090
1301
|
* Array of Floats respectively.
|
1091
1302
|
*/
|
1092
1303
|
static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
|
1093
|
-
{
|
1304
|
+
{
|
1094
1305
|
VALUE result, strings, regexp = Qnil;
|
1095
1306
|
int use_regexp;
|
1096
1307
|
GET_STRUCT(PairDistance)
|
@@ -1148,7 +1359,7 @@ static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
|
|
1148
1359
|
}
|
1149
1360
|
}
|
1150
1361
|
|
1151
|
-
/*
|
1362
|
+
/*
|
1152
1363
|
* Document-class: Amatch::Hamming
|
1153
1364
|
*
|
1154
1365
|
* This class computes the Hamming distance between two strings.
|
@@ -1178,7 +1389,7 @@ DEF_CONSTRUCTOR(Hamming, General)
|
|
1178
1389
|
|
1179
1390
|
/*
|
1180
1391
|
* call-seq: match(strings) -> results
|
1181
|
-
*
|
1392
|
+
*
|
1182
1393
|
* Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
|
1183
1394
|
* <code>strings</code>, that is compute the hamming distance between
|
1184
1395
|
* <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
|
@@ -1186,7 +1397,7 @@ DEF_CONSTRUCTOR(Hamming, General)
|
|
1186
1397
|
* is either a Fixnum or an Array of Fixnums respectively.
|
1187
1398
|
*/
|
1188
1399
|
static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
1189
|
-
{
|
1400
|
+
{
|
1190
1401
|
GET_STRUCT(General)
|
1191
1402
|
return General_iterate_strings(amatch, strings, Hamming_match);
|
1192
1403
|
}
|
@@ -1202,7 +1413,7 @@ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
|
1202
1413
|
* respectively.
|
1203
1414
|
*/
|
1204
1415
|
static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
|
1205
|
-
{
|
1416
|
+
{
|
1206
1417
|
GET_STRUCT(General)
|
1207
1418
|
return General_iterate_strings(amatch, strings, Hamming_similar);
|
1208
1419
|
}
|
@@ -1224,7 +1435,7 @@ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
|
|
1224
1435
|
}
|
1225
1436
|
|
1226
1437
|
|
1227
|
-
/*
|
1438
|
+
/*
|
1228
1439
|
* Document-class: Amatch::LongestSubsequence
|
1229
1440
|
*
|
1230
1441
|
* This class computes the length of the longest subsequence common to two
|
@@ -1254,7 +1465,7 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
|
|
1254
1465
|
|
1255
1466
|
/*
|
1256
1467
|
* call-seq: match(strings) -> results
|
1257
|
-
*
|
1468
|
+
*
|
1258
1469
|
* Uses this Amatch::LongestSubsequence instance to match
|
1259
1470
|
* LongestSubsequence#pattern against <code>strings</code>, that is compute the
|
1260
1471
|
* length of the longest common subsequence. <code>strings</code> has to be
|
@@ -1262,14 +1473,14 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
|
|
1262
1473
|
* is either a Fixnum or an Array of Fixnums respectively.
|
1263
1474
|
*/
|
1264
1475
|
static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
1265
|
-
{
|
1476
|
+
{
|
1266
1477
|
GET_STRUCT(General)
|
1267
1478
|
return General_iterate_strings(amatch, strings, LongestSubsequence_match);
|
1268
1479
|
}
|
1269
1480
|
|
1270
1481
|
/*
|
1271
1482
|
* call-seq: similar(strings) -> results
|
1272
|
-
*
|
1483
|
+
*
|
1273
1484
|
* Uses this Amatch::LongestSubsequence instance to match
|
1274
1485
|
* Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
|
1275
1486
|
* a longest subsequence distance metric number between 0.0 for very unsimilar
|
@@ -1278,7 +1489,7 @@ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
|
1278
1489
|
* a Fixnum or an Array of Fixnums
|
1279
1490
|
*/
|
1280
1491
|
static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
1281
|
-
{
|
1492
|
+
{
|
1282
1493
|
GET_STRUCT(General)
|
1283
1494
|
return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
|
1284
1495
|
}
|
@@ -1294,12 +1505,12 @@ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
|
1294
1505
|
* is either a Float or an Array of Floats respectively.
|
1295
1506
|
*/
|
1296
1507
|
static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
1297
|
-
{
|
1508
|
+
{
|
1298
1509
|
VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
|
1299
1510
|
return rb_LongestSubsequence_similar(amatch, strings);
|
1300
1511
|
}
|
1301
1512
|
|
1302
|
-
/*
|
1513
|
+
/*
|
1303
1514
|
* Document-class: Amatch::LongestSubstring
|
1304
1515
|
*
|
1305
1516
|
* The longest common substring is the longest substring, that is part of
|
@@ -1310,7 +1521,7 @@ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
|
1310
1521
|
* The longest common substring between 'string' and 'string' is 'string'
|
1311
1522
|
* again, thus the longest common substring length is 6. The longest common
|
1312
1523
|
* substring between 'string' and 'storing' is 'ring', thus the longest common
|
1313
|
-
* substring length is 4.
|
1524
|
+
* substring length is 4.
|
1314
1525
|
*/
|
1315
1526
|
|
1316
1527
|
DEF_RB_FREE(LongestSubstring, General)
|
@@ -1331,7 +1542,7 @@ DEF_CONSTRUCTOR(LongestSubstring, General)
|
|
1331
1542
|
|
1332
1543
|
/*
|
1333
1544
|
* call-seq: match(strings) -> results
|
1334
|
-
*
|
1545
|
+
*
|
1335
1546
|
* Uses this Amatch::LongestSubstring instance to match
|
1336
1547
|
* LongestSubstring#pattern against <code>strings</code>, that is compute the
|
1337
1548
|
* length of the longest common substring. <code>strings</code> has to be
|
@@ -1346,7 +1557,7 @@ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
|
|
1346
1557
|
|
1347
1558
|
/*
|
1348
1559
|
* call-seq: similar(strings) -> results
|
1349
|
-
*
|
1560
|
+
*
|
1350
1561
|
* Uses this Amatch::LongestSubstring instance to match
|
1351
1562
|
* Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
|
1352
1563
|
* longest substring distance metric number between 0.0 for very unsimilar
|
@@ -1372,11 +1583,11 @@ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
|
|
1372
1583
|
* is either a Float or an Array of Floats respectively.
|
1373
1584
|
*/
|
1374
1585
|
static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
|
1375
|
-
{
|
1586
|
+
{
|
1376
1587
|
VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
|
1377
1588
|
return rb_LongestSubstring_similar(amatch, strings);
|
1378
1589
|
}
|
1379
|
-
|
1590
|
+
|
1380
1591
|
/*
|
1381
1592
|
* Document-class: Amatch::Jaro
|
1382
1593
|
*
|
@@ -1573,6 +1784,17 @@ void Init_amatch_ext()
|
|
1573
1784
|
rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
|
1574
1785
|
rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
|
1575
1786
|
|
1787
|
+
/* DamerauLevenshtein */
|
1788
|
+
rb_cDamerauLevenshtein = rb_define_class_under(rb_mAmatch, "DamerauLevenshtein", rb_cObject);
|
1789
|
+
rb_define_alloc_func(rb_cDamerauLevenshtein, rb_DamerauLevenshtein_s_allocate);
|
1790
|
+
rb_define_method(rb_cDamerauLevenshtein, "initialize", rb_DamerauLevenshtein_initialize, 1);
|
1791
|
+
rb_define_method(rb_cDamerauLevenshtein, "pattern", rb_General_pattern, 0);
|
1792
|
+
rb_define_method(rb_cDamerauLevenshtein, "pattern=", rb_General_pattern_set, 1);
|
1793
|
+
rb_define_method(rb_cDamerauLevenshtein, "match", rb_DamerauLevenshtein_match, 1);
|
1794
|
+
rb_define_method(rb_cDamerauLevenshtein, "search", rb_DamerauLevenshtein_search, 1);
|
1795
|
+
rb_define_method(rb_cDamerauLevenshtein, "similar", rb_DamerauLevenshtein_similar, 1);
|
1796
|
+
rb_define_method(rb_mAmatchStringMethods, "damerau_levenshtein_similar", rb_str_damerau_levenshtein_similar, 1);
|
1797
|
+
|
1576
1798
|
/* Sellers */
|
1577
1799
|
rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
|
1578
1800
|
rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
|