amatch 0.3.0 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/CHANGES +7 -0
- data/COPYING +203 -340
- data/README.md +124 -0
- data/Rakefile +9 -14
- data/amatch.gemspec +0 -0
- data/bin/{agrep.rb → agrep} +23 -9
- data/bin/dupfind +153 -0
- data/ext/amatch_ext.c +313 -91
- data/ext/pair.c +3 -1
- data/images/amatch_ext.png +0 -0
- data/lib/amatch/version.rb +1 -1
- data/tests/test_damerau_levenshtein.rb +93 -0
- metadata +33 -37
- data/.gitignore +0 -6
- data/.travis.yml +0 -10
- data/README.rdoc +0 -128
- data/VERSION +0 -1
data/ext/amatch_ext.c
CHANGED
@@ -3,24 +3,8 @@
|
|
3
3
|
#include <ctype.h>
|
4
4
|
#include "common.h"
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
*
|
9
|
-
* call-seq: pattern -> pattern string
|
10
|
-
*
|
11
|
-
* Returns the current pattern string of this instance.
|
12
|
-
*/
|
13
|
-
|
14
|
-
/*
|
15
|
-
* Document-method: pattern=
|
16
|
-
*
|
17
|
-
* call-seq: pattern=(pattern)
|
18
|
-
*
|
19
|
-
* Sets the current pattern string of this instance to <code>pattern</code>.
|
20
|
-
*/
|
21
|
-
|
22
|
-
|
23
|
-
static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
6
|
+
static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein,
|
7
|
+
rb_cDamerauLevenshtein, rb_cSellers, rb_cHamming,
|
24
8
|
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
|
25
9
|
rb_cJaro, rb_cJaroWinkler;
|
26
10
|
|
@@ -230,9 +214,11 @@ DEF_ITERATE_STRINGS(JaroWinkler)
|
|
230
214
|
*/
|
231
215
|
|
232
216
|
#define COMPUTE_LEVENSHTEIN_DISTANCE \
|
233
|
-
|
217
|
+
c = 0; \
|
218
|
+
p = 0; \
|
219
|
+
for (i = 1; i <= a_len; i++) { \
|
234
220
|
c = i % 2; /* current row */ \
|
235
|
-
p = (i
|
221
|
+
p = (i - 1) % 2; /* previous row */ \
|
236
222
|
v[c][0] = i; /* first column */ \
|
237
223
|
for (j = 1; j <= b_len; j++) { \
|
238
224
|
/* Bellman's principle of optimality: */ \
|
@@ -245,8 +231,6 @@ DEF_ITERATE_STRINGS(JaroWinkler)
|
|
245
231
|
} \
|
246
232
|
v[c][j] = weight; \
|
247
233
|
} \
|
248
|
-
p = c; \
|
249
|
-
c = (c + 1) % 2; \
|
250
234
|
}
|
251
235
|
|
252
236
|
static VALUE Levenshtein_match(General *amatch, VALUE string)
|
@@ -269,7 +253,7 @@ static VALUE Levenshtein_match(General *amatch, VALUE string)
|
|
269
253
|
|
270
254
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
271
255
|
|
272
|
-
result = INT2FIX(v[
|
256
|
+
result = INT2FIX(v[c][b_len]);
|
273
257
|
|
274
258
|
xfree(v[0]);
|
275
259
|
xfree(v[1]);
|
@@ -287,6 +271,7 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
|
287
271
|
|
288
272
|
Check_Type(string, T_STRING);
|
289
273
|
DONT_OPTIMIZE
|
274
|
+
|
290
275
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
291
276
|
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
292
277
|
v[0] = ALLOC_N(int, b_len + 1);
|
@@ -299,12 +284,14 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
|
299
284
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
300
285
|
|
301
286
|
if (b_len > a_len) {
|
302
|
-
result = rb_float_new(1.0 - ((double) v[
|
287
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
|
303
288
|
} else {
|
304
|
-
result = rb_float_new(1.0 - ((double) v[
|
289
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
|
305
290
|
}
|
291
|
+
|
306
292
|
xfree(v[0]);
|
307
293
|
xfree(v[1]);
|
294
|
+
|
308
295
|
return result;
|
309
296
|
}
|
310
297
|
|
@@ -327,26 +314,159 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
|
|
327
314
|
COMPUTE_LEVENSHTEIN_DISTANCE
|
328
315
|
|
329
316
|
for (i = 0, min = a_len; i <= b_len; i++) {
|
330
|
-
if (v[
|
317
|
+
if (v[c][i] < min) min = v[c][i];
|
331
318
|
}
|
332
319
|
|
333
320
|
result = INT2FIX(min);
|
334
321
|
|
335
322
|
xfree(v[0]);
|
336
323
|
xfree(v[1]);
|
337
|
-
|
324
|
+
|
325
|
+
return result;
|
326
|
+
}
|
327
|
+
|
328
|
+
/*
|
329
|
+
* DamerauLevenshtein edit distances are computed here:
|
330
|
+
*/
|
331
|
+
|
332
|
+
#define COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE \
|
333
|
+
c = 0; \
|
334
|
+
p = 0; \
|
335
|
+
pp = 0; \
|
336
|
+
for (i = 1; i <= a_len; i++) { \
|
337
|
+
c = i % 3; /* current row */ \
|
338
|
+
p = (i - 1) % 3; /* previous row */ \
|
339
|
+
pp = (i - 2) % 3; /* previous previous row */ \
|
340
|
+
v[c][0] = i; /* first column */ \
|
341
|
+
for (j = 1; j <= b_len; j++) { \
|
342
|
+
/* Bellman's principle of optimality: */ \
|
343
|
+
weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
344
|
+
if (weight > v[p][j] + 1) { \
|
345
|
+
weight = v[p][j] + 1; \
|
346
|
+
} \
|
347
|
+
if (weight > v[c][j - 1] + 1) { \
|
348
|
+
weight = v[c][j - 1] + 1; \
|
349
|
+
} \
|
350
|
+
if (i > 2 && j > 2 && a_ptr[i - 1] == b_ptr[j - 2] && a_ptr[i - 2] == b_ptr[j - 1]) {\
|
351
|
+
if (weight > v[pp][j - 2]) { \
|
352
|
+
weight = v[pp][j - 2] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
|
353
|
+
} \
|
354
|
+
} \
|
355
|
+
v[c][j] = weight; \
|
356
|
+
} \
|
357
|
+
}
|
358
|
+
|
359
|
+
static VALUE DamerauLevenshtein_match(General *amatch, VALUE string)
|
360
|
+
{
|
361
|
+
VALUE result;
|
362
|
+
char *a_ptr, *b_ptr;
|
363
|
+
int a_len, b_len;
|
364
|
+
int *v[3], weight;
|
365
|
+
int i, j, c, p, pp;
|
366
|
+
|
367
|
+
Check_Type(string, T_STRING);
|
368
|
+
DONT_OPTIMIZE
|
369
|
+
|
370
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
371
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
372
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
373
|
+
for (i = 0; i <= b_len; i++) {
|
374
|
+
v[0][i] = i;
|
375
|
+
v[1][i] = i;
|
376
|
+
v[2][i] = i;
|
377
|
+
}
|
378
|
+
|
379
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
380
|
+
|
381
|
+
result = INT2FIX(v[c][b_len]);
|
382
|
+
|
383
|
+
xfree(v[0]);
|
384
|
+
xfree(v[1]);
|
385
|
+
xfree(v[2]);
|
386
|
+
|
338
387
|
return result;
|
339
388
|
}
|
340
389
|
|
390
|
+
static VALUE DamerauLevenshtein_similar(General *amatch, VALUE string)
|
391
|
+
{
|
392
|
+
VALUE result;
|
393
|
+
char *a_ptr, *b_ptr;
|
394
|
+
int a_len, b_len;
|
395
|
+
int *v[3], weight;
|
396
|
+
int i, j, c, p, pp;
|
397
|
+
|
398
|
+
Check_Type(string, T_STRING);
|
399
|
+
DONT_OPTIMIZE
|
400
|
+
|
401
|
+
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
402
|
+
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
403
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
404
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
405
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
406
|
+
for (i = 0; i <= b_len; i++) {
|
407
|
+
v[0][i] = i;
|
408
|
+
v[1][i] = i;
|
409
|
+
v[2][i] = i;
|
410
|
+
}
|
411
|
+
|
412
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
413
|
+
|
414
|
+
if (b_len > a_len) {
|
415
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
|
416
|
+
} else {
|
417
|
+
result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
|
418
|
+
}
|
419
|
+
|
420
|
+
xfree(v[0]);
|
421
|
+
xfree(v[1]);
|
422
|
+
xfree(v[2]);
|
423
|
+
|
424
|
+
return result;
|
425
|
+
}
|
426
|
+
|
427
|
+
static VALUE DamerauLevenshtein_search(General *amatch, VALUE string)
|
428
|
+
{
|
429
|
+
VALUE result;
|
430
|
+
char *a_ptr, *b_ptr;
|
431
|
+
int a_len, b_len;
|
432
|
+
int *v[3], weight, min;
|
433
|
+
int i, j, c, p, pp;
|
434
|
+
|
435
|
+
Check_Type(string, T_STRING);
|
436
|
+
DONT_OPTIMIZE
|
437
|
+
|
438
|
+
v[0] = ALLOC_N(int, b_len + 1);
|
439
|
+
v[1] = ALLOC_N(int, b_len + 1);
|
440
|
+
v[2] = ALLOC_N(int, b_len + 1);
|
441
|
+
MEMZERO(v[0], int, b_len + 1);
|
442
|
+
MEMZERO(v[1], int, b_len + 1);
|
443
|
+
MEMZERO(v[2], int, b_len + 1);
|
444
|
+
|
445
|
+
COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
|
446
|
+
|
447
|
+
for (i = 0, min = a_len; i <= b_len; i++) {
|
448
|
+
if (v[c][i] < min) min = v[c][i];
|
449
|
+
}
|
450
|
+
|
451
|
+
result = INT2FIX(min);
|
452
|
+
|
453
|
+
xfree(v[0]);
|
454
|
+
xfree(v[1]);
|
455
|
+
xfree(v[2]);
|
456
|
+
|
457
|
+
return result;
|
458
|
+
}
|
341
459
|
|
342
460
|
/*
|
343
461
|
* Sellers edit distances are computed here:
|
344
462
|
*/
|
345
463
|
|
346
464
|
#define COMPUTE_SELLERS_DISTANCE \
|
347
|
-
|
465
|
+
c = 0; \
|
466
|
+
p = 0; \
|
467
|
+
for (i = 1; i <= a_len; i++) { \
|
348
468
|
c = i % 2; /* current row */ \
|
349
|
-
p = (i
|
469
|
+
p = (i - 1) % 2; /* previous row */ \
|
350
470
|
v[c][0] = i * amatch->deletion; /* first column */ \
|
351
471
|
for (j = 1; j <= b_len; j++) { \
|
352
472
|
/* Bellman's principle of optimality: */ \
|
@@ -361,7 +481,6 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
|
|
361
481
|
v[c][j] = weight; \
|
362
482
|
} \
|
363
483
|
p = c; \
|
364
|
-
c = (c + 1) % 2; \
|
365
484
|
}
|
366
485
|
|
367
486
|
static VALUE Sellers_match(Sellers *amatch, VALUE string)
|
@@ -411,9 +530,10 @@ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
|
|
411
530
|
max_weight = amatch->deletion;
|
412
531
|
}
|
413
532
|
}
|
414
|
-
|
533
|
+
|
415
534
|
Check_Type(string, T_STRING);
|
416
535
|
DONT_OPTIMIZE
|
536
|
+
|
417
537
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
418
538
|
if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
|
419
539
|
v[0] = ALLOC_N(double, b_len + 1);
|
@@ -459,7 +579,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
459
579
|
result = rb_float_new(min);
|
460
580
|
xfree(v[0]);
|
461
581
|
xfree(v[1]);
|
462
|
-
|
582
|
+
|
463
583
|
return result;
|
464
584
|
}
|
465
585
|
|
@@ -470,34 +590,32 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
470
590
|
static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
|
471
591
|
{
|
472
592
|
double result;
|
473
|
-
VALUE tokens;
|
474
|
-
PairArray *pair_array;
|
475
|
-
|
593
|
+
VALUE string_tokens, tokens;
|
594
|
+
PairArray *pattern_pair_array, *pair_array;
|
595
|
+
|
476
596
|
Check_Type(string, T_STRING);
|
477
597
|
if (!NIL_P(regexp) || use_regexp) {
|
478
598
|
tokens = rb_funcall(
|
479
599
|
rb_str_new(amatch->pattern, amatch->pattern_len),
|
480
600
|
id_split, 1, regexp
|
481
601
|
);
|
482
|
-
|
483
|
-
amatch->pattern_pair_array = PairArray_new(tokens);
|
484
|
-
} else {
|
485
|
-
pair_array_reactivate(amatch->pattern_pair_array);
|
486
|
-
}
|
487
|
-
tokens = rb_funcall(string, id_split, 1, regexp);
|
488
|
-
pair_array = PairArray_new(tokens);
|
602
|
+
string_tokens = rb_funcall(string, id_split, 1, regexp);
|
489
603
|
} else {
|
490
604
|
VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
|
491
605
|
tokens = rb_ary_new4(1, &tmp);
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
606
|
+
string_tokens = rb_ary_new4(1, &string);
|
607
|
+
}
|
608
|
+
|
609
|
+
if (!amatch->pattern_pair_array) {
|
610
|
+
pattern_pair_array = PairArray_new(tokens);
|
611
|
+
amatch->pattern_pair_array = pattern_pair_array;
|
612
|
+
} else {
|
613
|
+
pattern_pair_array = amatch->pattern_pair_array;
|
614
|
+
pair_array_reactivate(amatch->pattern_pair_array);
|
499
615
|
}
|
500
|
-
|
616
|
+
pair_array = PairArray_new(string_tokens);
|
617
|
+
|
618
|
+
result = pair_array_match(pattern_pair_array, pair_array);
|
501
619
|
pair_array_destroy(pair_array);
|
502
620
|
return rb_float_new(result);
|
503
621
|
}
|
@@ -520,7 +638,7 @@ static VALUE Hamming_match(General *amatch, VALUE string)
|
|
520
638
|
char *a_ptr, *b_ptr;
|
521
639
|
int a_len, b_len;
|
522
640
|
int i, result;
|
523
|
-
|
641
|
+
|
524
642
|
Check_Type(string, T_STRING);
|
525
643
|
OPTIMIZE_TIME
|
526
644
|
COMPUTE_HAMMING_DISTANCE
|
@@ -532,7 +650,7 @@ static VALUE Hamming_similar(General *amatch, VALUE string)
|
|
532
650
|
char *a_ptr, *b_ptr;
|
533
651
|
int a_len, b_len;
|
534
652
|
int i, result;
|
535
|
-
|
653
|
+
|
536
654
|
Check_Type(string, T_STRING);
|
537
655
|
OPTIMIZE_TIME
|
538
656
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
@@ -572,7 +690,7 @@ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
|
|
572
690
|
char *a_ptr, *b_ptr;
|
573
691
|
int a_len, b_len;
|
574
692
|
int result, c, p, i, j, *l[2];
|
575
|
-
|
693
|
+
|
576
694
|
Check_Type(string, T_STRING);
|
577
695
|
OPTIMIZE_TIME
|
578
696
|
|
@@ -586,7 +704,7 @@ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
|
|
586
704
|
char *a_ptr, *b_ptr;
|
587
705
|
int a_len, b_len;
|
588
706
|
int result, c, p, i, j, *l[2];
|
589
|
-
|
707
|
+
|
590
708
|
Check_Type(string, T_STRING);
|
591
709
|
OPTIMIZE_TIME
|
592
710
|
|
@@ -626,7 +744,7 @@ static VALUE LongestSubstring_match(General *amatch, VALUE string)
|
|
626
744
|
char *a_ptr, *b_ptr;
|
627
745
|
int a_len, b_len;
|
628
746
|
int result, c, p, i, j, *l[2];
|
629
|
-
|
747
|
+
|
630
748
|
Check_Type(string, T_STRING);
|
631
749
|
OPTIMIZE_TIME
|
632
750
|
if (a_len == 0 || b_len == 0) return INT2FIX(0);
|
@@ -639,7 +757,7 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
|
|
639
757
|
char *a_ptr, *b_ptr;
|
640
758
|
int a_len, b_len;
|
641
759
|
int result, c, p, i, j, *l[2];
|
642
|
-
|
760
|
+
|
643
761
|
Check_Type(string, T_STRING);
|
644
762
|
OPTIMIZE_TIME
|
645
763
|
if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
|
@@ -771,7 +889,7 @@ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
|
|
771
889
|
* Ruby API
|
772
890
|
*/
|
773
891
|
|
774
|
-
/*
|
892
|
+
/*
|
775
893
|
* Document-class: Amatch::Levenshtein
|
776
894
|
*
|
777
895
|
* The Levenshtein edit distance is defined as the minimal costs involved to
|
@@ -804,7 +922,7 @@ DEF_CONSTRUCTOR(Levenshtein, General)
|
|
804
922
|
|
805
923
|
/*
|
806
924
|
* call-seq: match(strings) -> results
|
807
|
-
*
|
925
|
+
*
|
808
926
|
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
809
927
|
* against <code>strings</code>. It returns the number of operations, the Levenshtein
|
810
928
|
* distance. <code>strings</code> has to be either a String or an Array of
|
@@ -812,14 +930,14 @@ DEF_CONSTRUCTOR(Levenshtein, General)
|
|
812
930
|
* Floats respectively.
|
813
931
|
*/
|
814
932
|
static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
815
|
-
{
|
933
|
+
{
|
816
934
|
GET_STRUCT(General)
|
817
935
|
return General_iterate_strings(amatch, strings, Levenshtein_match);
|
818
936
|
}
|
819
937
|
|
820
938
|
/*
|
821
939
|
* call-seq: similar(strings) -> results
|
822
|
-
*
|
940
|
+
*
|
823
941
|
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
824
942
|
* against <code>strings</code>, and compute a Levenshtein distance metric
|
825
943
|
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
@@ -828,14 +946,14 @@ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
|
828
946
|
* respectively.
|
829
947
|
*/
|
830
948
|
static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
|
831
|
-
{
|
949
|
+
{
|
832
950
|
GET_STRUCT(General)
|
833
951
|
return General_iterate_strings(amatch, strings, Levenshtein_similar);
|
834
952
|
}
|
835
953
|
|
836
954
|
/*
|
837
955
|
* call-seq: levenshtein_similar(strings) -> results
|
838
|
-
*
|
956
|
+
*
|
839
957
|
* If called on a String, this string is used as a Amatch::Levenshtein#pattern
|
840
958
|
* to match against <code>strings</code>. It returns a Levenshtein distance
|
841
959
|
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
@@ -851,7 +969,7 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
|
851
969
|
|
852
970
|
/*
|
853
971
|
* call-seq: search(strings) -> results
|
854
|
-
*
|
972
|
+
*
|
855
973
|
* searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
|
856
974
|
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
857
975
|
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
@@ -859,12 +977,105 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
|
859
977
|
* <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
|
860
978
|
*/
|
861
979
|
static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
|
862
|
-
{
|
980
|
+
{
|
863
981
|
GET_STRUCT(General)
|
864
982
|
return General_iterate_strings(amatch, strings, Levenshtein_search);
|
865
983
|
}
|
866
984
|
|
867
|
-
/*
|
985
|
+
/*
|
986
|
+
* Document-class: Amatch::DamerauLevenshtein
|
987
|
+
* XXX
|
988
|
+
* The DamerauLevenshtein edit distance is defined as the minimal costs
|
989
|
+
* involved to transform one string into another by using four elementary
|
990
|
+
* operations: deletion, insertion, substitution of a character and transposition of two adjacent characters. To
|
991
|
+
* transform "water" into "wine", for instance, you have to substitute "a" ->
|
992
|
+
* "i": "witer", "t" -> "n": "winer" and delete "r": "wine". The edit distance
|
993
|
+
* between "water" and "wine" is 3, because you have to apply three
|
994
|
+
* operations. The edit distance between "wine" and "wine" is 0 of course: no
|
995
|
+
* operation is necessary for the transformation -- they're already the same
|
996
|
+
* string. It's easy to see that more similar strings have smaller edit
|
997
|
+
* distances than strings that differ a lot.
|
998
|
+
*/
|
999
|
+
|
1000
|
+
DEF_RB_FREE(DamerauLevenshtein, General)
|
1001
|
+
|
1002
|
+
/*
|
1003
|
+
* call-seq: new(pattern)
|
1004
|
+
* XXX
|
1005
|
+
* Creates a new Amatch::DamerauLevenshtein instance from <code>pattern</code>.
|
1006
|
+
*/
|
1007
|
+
static VALUE rb_DamerauLevenshtein_initialize(VALUE self, VALUE pattern)
|
1008
|
+
{
|
1009
|
+
GET_STRUCT(General)
|
1010
|
+
General_pattern_set(amatch, pattern);
|
1011
|
+
return self;
|
1012
|
+
}
|
1013
|
+
|
1014
|
+
DEF_CONSTRUCTOR(DamerauLevenshtein, General)
|
1015
|
+
|
1016
|
+
/*
|
1017
|
+
* call-seq: match(strings) -> results
|
1018
|
+
* XXX
|
1019
|
+
* Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
|
1020
|
+
* against <code>strings</code>. It returns the number of operations, the DamerauLevenshtein
|
1021
|
+
* distance. <code>strings</code> has to be either a String or an Array of
|
1022
|
+
* Strings. The returned <code>results</code> is either a Fixnum or an Array of
|
1023
|
+
* Fixnums respectively.
|
1024
|
+
*/
|
1025
|
+
static VALUE rb_DamerauLevenshtein_match(VALUE self, VALUE strings)
|
1026
|
+
{
|
1027
|
+
GET_STRUCT(General)
|
1028
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_match);
|
1029
|
+
}
|
1030
|
+
|
1031
|
+
/*
|
1032
|
+
* call-seq: similar(strings) -> results
|
1033
|
+
* XXX
|
1034
|
+
* Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
|
1035
|
+
* against <code>strings</code>, and compute a DamerauLevenshtein distance metric
|
1036
|
+
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1037
|
+
* <code>strings</code> has to be either a String or an Array of Strings. The
|
1038
|
+
* returned <code>results</code> is either a Float or an Array of Floats
|
1039
|
+
* respectively.
|
1040
|
+
*/
|
1041
|
+
static VALUE rb_DamerauLevenshtein_similar(VALUE self, VALUE strings)
|
1042
|
+
{
|
1043
|
+
GET_STRUCT(General)
|
1044
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_similar);
|
1045
|
+
}
|
1046
|
+
|
1047
|
+
/*
|
1048
|
+
* call-seq: damerau_levenshtein_similar(strings) -> results
|
1049
|
+
* XXX
|
1050
|
+
* If called on a String, this string is used as an Amatch::DamerauLevenshtein#pattern
|
1051
|
+
* to match against <code>strings</code>. It returns a DamerauLevenshtein distance
|
1052
|
+
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
1053
|
+
* match. <code>strings</code> has to be either a String or an Array of
|
1054
|
+
* Strings. The returned <code>results</code> is either a Float or an Array of
|
1055
|
+
* Floats respectively.
|
1056
|
+
*/
|
1057
|
+
static VALUE rb_str_damerau_levenshtein_similar(VALUE self, VALUE strings)
|
1058
|
+
{
|
1059
|
+
VALUE amatch = rb_DamerauLevenshtein_new(rb_cDamerauLevenshtein, self);
|
1060
|
+
return rb_DamerauLevenshtein_similar(amatch, strings);
|
1061
|
+
}
|
1062
|
+
|
1063
|
+
/*
|
1064
|
+
* call-seq: search(strings) -> results
|
1065
|
+
* XXX
|
1066
|
+
* searches Amatch::DamerauLevenshtein#pattern in <code>strings</code> and returns the
|
1067
|
+
* edit distance (the sum of character operations) as a Fixnum value, by greedily
|
1068
|
+
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
1069
|
+
* to be either a String or an Array of Strings. The returned
|
1070
|
+
* <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
|
1071
|
+
*/
|
1072
|
+
static VALUE rb_DamerauLevenshtein_search(VALUE self, VALUE strings)
|
1073
|
+
{
|
1074
|
+
GET_STRUCT(General)
|
1075
|
+
return General_iterate_strings(amatch, strings, DamerauLevenshtein_search);
|
1076
|
+
}
|
1077
|
+
|
1078
|
+
/*
|
868
1079
|
* Document-class: Amatch::Sellers
|
869
1080
|
*
|
870
1081
|
* The Sellers edit distance is very similar to the Levenshtein edit distance.
|
@@ -983,14 +1194,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
|
|
983
1194
|
* Document-method: pattern=
|
984
1195
|
*
|
985
1196
|
* call-seq: pattern=(pattern)
|
986
|
-
*
|
1197
|
+
*
|
987
1198
|
* Sets the current pattern string of this Amatch::Sellers instance to
|
988
1199
|
* <code>pattern</code>.
|
989
1200
|
*/
|
990
1201
|
|
991
1202
|
/*
|
992
1203
|
* call-seq: match(strings) -> results
|
993
|
-
*
|
1204
|
+
*
|
994
1205
|
* Uses this Amatch::Sellers instance to match Sellers#pattern against
|
995
1206
|
* <code>strings</code>, while taking into account the given weights. It
|
996
1207
|
* returns the number of weighted character operations, the Sellers distance.
|
@@ -999,14 +1210,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
|
|
999
1210
|
* respectively.
|
1000
1211
|
*/
|
1001
1212
|
static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
1002
|
-
{
|
1213
|
+
{
|
1003
1214
|
GET_STRUCT(Sellers)
|
1004
1215
|
return Sellers_iterate_strings(amatch, strings, Sellers_match);
|
1005
1216
|
}
|
1006
1217
|
|
1007
1218
|
/*
|
1008
1219
|
* call-seq: similar(strings) -> results
|
1009
|
-
*
|
1220
|
+
*
|
1010
1221
|
* Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
|
1011
1222
|
* against <code>strings</code> (taking into account the given weights), and
|
1012
1223
|
* compute a Sellers distance metric number between 0.0 for very unsimilar
|
@@ -1016,7 +1227,7 @@ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
|
1016
1227
|
* respectively.
|
1017
1228
|
*/
|
1018
1229
|
static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
1019
|
-
{
|
1230
|
+
{
|
1020
1231
|
GET_STRUCT(Sellers)
|
1021
1232
|
return Sellers_iterate_strings(amatch, strings, Sellers_similar);
|
1022
1233
|
}
|
@@ -1031,12 +1242,12 @@ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
|
1031
1242
|
* <code>results</code> is either a Float or an Array of Floats respectively.
|
1032
1243
|
*/
|
1033
1244
|
static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
1034
|
-
{
|
1245
|
+
{
|
1035
1246
|
GET_STRUCT(Sellers)
|
1036
1247
|
return Sellers_iterate_strings(amatch, strings, Sellers_search);
|
1037
1248
|
}
|
1038
1249
|
|
1039
|
-
/*
|
1250
|
+
/*
|
1040
1251
|
* Document-class: Amatch::PairDistance
|
1041
1252
|
*
|
1042
1253
|
* The pair distance between two strings is based on the number of adjacent
|
@@ -1047,7 +1258,7 @@ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
|
1047
1258
|
* are more dissimilar. The advantage of considering adjacent characters, is to
|
1048
1259
|
* take account not only of the characters, but also of the character ordering
|
1049
1260
|
* in the original strings.
|
1050
|
-
*
|
1261
|
+
*
|
1051
1262
|
* This metric is very capable to find similarities in natural languages.
|
1052
1263
|
* It is explained in more detail in Simon White's article "How to Strike a
|
1053
1264
|
* Match", located at this url:
|
@@ -1074,7 +1285,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
|
1074
1285
|
|
1075
1286
|
/*
|
1076
1287
|
* call-seq: match(strings, regexp = /\s+/) -> results
|
1077
|
-
*
|
1288
|
+
*
|
1078
1289
|
* Uses this Amatch::PairDistance instance to match PairDistance#pattern against
|
1079
1290
|
* <code>strings</code>. It returns the pair distance measure, that is a
|
1080
1291
|
* returned value of 1.0 is an exact match, partial matches are lower
|
@@ -1090,7 +1301,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
|
1090
1301
|
* Array of Floats respectively.
|
1091
1302
|
*/
|
1092
1303
|
static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
|
1093
|
-
{
|
1304
|
+
{
|
1094
1305
|
VALUE result, strings, regexp = Qnil;
|
1095
1306
|
int use_regexp;
|
1096
1307
|
GET_STRUCT(PairDistance)
|
@@ -1148,7 +1359,7 @@ static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
|
|
1148
1359
|
}
|
1149
1360
|
}
|
1150
1361
|
|
1151
|
-
/*
|
1362
|
+
/*
|
1152
1363
|
* Document-class: Amatch::Hamming
|
1153
1364
|
*
|
1154
1365
|
* This class computes the Hamming distance between two strings.
|
@@ -1178,7 +1389,7 @@ DEF_CONSTRUCTOR(Hamming, General)
|
|
1178
1389
|
|
1179
1390
|
/*
|
1180
1391
|
* call-seq: match(strings) -> results
|
1181
|
-
*
|
1392
|
+
*
|
1182
1393
|
* Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
|
1183
1394
|
* <code>strings</code>, that is compute the hamming distance between
|
1184
1395
|
* <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
|
@@ -1186,7 +1397,7 @@ DEF_CONSTRUCTOR(Hamming, General)
|
|
1186
1397
|
* is either a Fixnum or an Array of Fixnums respectively.
|
1187
1398
|
*/
|
1188
1399
|
static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
1189
|
-
{
|
1400
|
+
{
|
1190
1401
|
GET_STRUCT(General)
|
1191
1402
|
return General_iterate_strings(amatch, strings, Hamming_match);
|
1192
1403
|
}
|
@@ -1202,7 +1413,7 @@ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
|
1202
1413
|
* respectively.
|
1203
1414
|
*/
|
1204
1415
|
static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
|
1205
|
-
{
|
1416
|
+
{
|
1206
1417
|
GET_STRUCT(General)
|
1207
1418
|
return General_iterate_strings(amatch, strings, Hamming_similar);
|
1208
1419
|
}
|
@@ -1224,7 +1435,7 @@ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
|
|
1224
1435
|
}
|
1225
1436
|
|
1226
1437
|
|
1227
|
-
/*
|
1438
|
+
/*
|
1228
1439
|
* Document-class: Amatch::LongestSubsequence
|
1229
1440
|
*
|
1230
1441
|
* This class computes the length of the longest subsequence common to two
|
@@ -1254,7 +1465,7 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
|
|
1254
1465
|
|
1255
1466
|
/*
|
1256
1467
|
* call-seq: match(strings) -> results
|
1257
|
-
*
|
1468
|
+
*
|
1258
1469
|
* Uses this Amatch::LongestSubsequence instance to match
|
1259
1470
|
* LongestSubsequence#pattern against <code>strings</code>, that is compute the
|
1260
1471
|
* length of the longest common subsequence. <code>strings</code> has to be
|
@@ -1262,14 +1473,14 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
|
|
1262
1473
|
* is either a Fixnum or an Array of Fixnums respectively.
|
1263
1474
|
*/
|
1264
1475
|
static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
1265
|
-
{
|
1476
|
+
{
|
1266
1477
|
GET_STRUCT(General)
|
1267
1478
|
return General_iterate_strings(amatch, strings, LongestSubsequence_match);
|
1268
1479
|
}
|
1269
1480
|
|
1270
1481
|
/*
|
1271
1482
|
* call-seq: similar(strings) -> results
|
1272
|
-
*
|
1483
|
+
*
|
1273
1484
|
* Uses this Amatch::LongestSubsequence instance to match
|
1274
1485
|
* Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
|
1275
1486
|
* a longest subsequence distance metric number between 0.0 for very unsimilar
|
@@ -1278,7 +1489,7 @@ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
|
1278
1489
|
* a Fixnum or an Array of Fixnums
|
1279
1490
|
*/
|
1280
1491
|
static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
1281
|
-
{
|
1492
|
+
{
|
1282
1493
|
GET_STRUCT(General)
|
1283
1494
|
return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
|
1284
1495
|
}
|
@@ -1294,12 +1505,12 @@ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
|
1294
1505
|
* is either a Float or an Array of Floats respectively.
|
1295
1506
|
*/
|
1296
1507
|
static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
1297
|
-
{
|
1508
|
+
{
|
1298
1509
|
VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
|
1299
1510
|
return rb_LongestSubsequence_similar(amatch, strings);
|
1300
1511
|
}
|
1301
1512
|
|
1302
|
-
/*
|
1513
|
+
/*
|
1303
1514
|
* Document-class: Amatch::LongestSubstring
|
1304
1515
|
*
|
1305
1516
|
* The longest common substring is the longest substring, that is part of
|
@@ -1310,7 +1521,7 @@ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
|
1310
1521
|
* The longest common substring between 'string' and 'string' is 'string'
|
1311
1522
|
* again, thus the longest common substring length is 6. The longest common
|
1312
1523
|
* substring between 'string' and 'storing' is 'ring', thus the longest common
|
1313
|
-
* substring length is 4.
|
1524
|
+
* substring length is 4.
|
1314
1525
|
*/
|
1315
1526
|
|
1316
1527
|
DEF_RB_FREE(LongestSubstring, General)
|
@@ -1331,7 +1542,7 @@ DEF_CONSTRUCTOR(LongestSubstring, General)
|
|
1331
1542
|
|
1332
1543
|
/*
|
1333
1544
|
* call-seq: match(strings) -> results
|
1334
|
-
*
|
1545
|
+
*
|
1335
1546
|
* Uses this Amatch::LongestSubstring instance to match
|
1336
1547
|
* LongestSubstring#pattern against <code>strings</code>, that is compute the
|
1337
1548
|
* length of the longest common substring. <code>strings</code> has to be
|
@@ -1346,7 +1557,7 @@ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
|
|
1346
1557
|
|
1347
1558
|
/*
|
1348
1559
|
* call-seq: similar(strings) -> results
|
1349
|
-
*
|
1560
|
+
*
|
1350
1561
|
* Uses this Amatch::LongestSubstring instance to match
|
1351
1562
|
* Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
|
1352
1563
|
* longest substring distance metric number between 0.0 for very unsimilar
|
@@ -1372,11 +1583,11 @@ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
|
|
1372
1583
|
* is either a Float or an Array of Floats respectively.
|
1373
1584
|
*/
|
1374
1585
|
static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
|
1375
|
-
{
|
1586
|
+
{
|
1376
1587
|
VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
|
1377
1588
|
return rb_LongestSubstring_similar(amatch, strings);
|
1378
1589
|
}
|
1379
|
-
|
1590
|
+
|
1380
1591
|
/*
|
1381
1592
|
* Document-class: Amatch::Jaro
|
1382
1593
|
*
|
@@ -1573,6 +1784,17 @@ void Init_amatch_ext()
|
|
1573
1784
|
rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
|
1574
1785
|
rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
|
1575
1786
|
|
1787
|
+
/* DamerauLevenshtein */
|
1788
|
+
rb_cDamerauLevenshtein = rb_define_class_under(rb_mAmatch, "DamerauLevenshtein", rb_cObject);
|
1789
|
+
rb_define_alloc_func(rb_cDamerauLevenshtein, rb_DamerauLevenshtein_s_allocate);
|
1790
|
+
rb_define_method(rb_cDamerauLevenshtein, "initialize", rb_DamerauLevenshtein_initialize, 1);
|
1791
|
+
rb_define_method(rb_cDamerauLevenshtein, "pattern", rb_General_pattern, 0);
|
1792
|
+
rb_define_method(rb_cDamerauLevenshtein, "pattern=", rb_General_pattern_set, 1);
|
1793
|
+
rb_define_method(rb_cDamerauLevenshtein, "match", rb_DamerauLevenshtein_match, 1);
|
1794
|
+
rb_define_method(rb_cDamerauLevenshtein, "search", rb_DamerauLevenshtein_search, 1);
|
1795
|
+
rb_define_method(rb_cDamerauLevenshtein, "similar", rb_DamerauLevenshtein_similar, 1);
|
1796
|
+
rb_define_method(rb_mAmatchStringMethods, "damerau_levenshtein_similar", rb_str_damerau_levenshtein_similar, 1);
|
1797
|
+
|
1576
1798
|
/* Sellers */
|
1577
1799
|
rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
|
1578
1800
|
rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
|