amatch 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/amatch_ext.c CHANGED
@@ -3,24 +3,8 @@
3
3
  #include <ctype.h>
4
4
  #include "common.h"
5
5
 
6
- /*
7
- * Document-method: pattern
8
- *
9
- * call-seq: pattern -> pattern string
10
- *
11
- * Returns the current pattern string of this instance.
12
- */
13
-
14
- /*
15
- * Document-method: pattern=
16
- *
17
- * call-seq: pattern=(pattern)
18
- *
19
- * Sets the current pattern string of this instance to <code>pattern</code>.
20
- */
21
-
22
-
23
- static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
6
+ static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein,
7
+ rb_cDamerauLevenshtein, rb_cSellers, rb_cHamming,
24
8
  rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
25
9
  rb_cJaro, rb_cJaroWinkler;
26
10
 
@@ -230,9 +214,11 @@ DEF_ITERATE_STRINGS(JaroWinkler)
230
214
  */
231
215
 
232
216
  #define COMPUTE_LEVENSHTEIN_DISTANCE \
233
- for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
217
+ c = 0; \
218
+ p = 0; \
219
+ for (i = 1; i <= a_len; i++) { \
234
220
  c = i % 2; /* current row */ \
235
- p = (i + 1) % 2; /* previous row */ \
221
+ p = (i - 1) % 2; /* previous row */ \
236
222
  v[c][0] = i; /* first column */ \
237
223
  for (j = 1; j <= b_len; j++) { \
238
224
  /* Bellman's principle of optimality: */ \
@@ -245,8 +231,6 @@ DEF_ITERATE_STRINGS(JaroWinkler)
245
231
  } \
246
232
  v[c][j] = weight; \
247
233
  } \
248
- p = c; \
249
- c = (c + 1) % 2; \
250
234
  }
251
235
 
252
236
  static VALUE Levenshtein_match(General *amatch, VALUE string)
@@ -269,7 +253,7 @@ static VALUE Levenshtein_match(General *amatch, VALUE string)
269
253
 
270
254
  COMPUTE_LEVENSHTEIN_DISTANCE
271
255
 
272
- result = INT2FIX(v[p][b_len]);
256
+ result = INT2FIX(v[c][b_len]);
273
257
 
274
258
  xfree(v[0]);
275
259
  xfree(v[1]);
@@ -287,6 +271,7 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
287
271
 
288
272
  Check_Type(string, T_STRING);
289
273
  DONT_OPTIMIZE
274
+
290
275
  if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
291
276
  if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
292
277
  v[0] = ALLOC_N(int, b_len + 1);
@@ -299,12 +284,14 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
299
284
  COMPUTE_LEVENSHTEIN_DISTANCE
300
285
 
301
286
  if (b_len > a_len) {
302
- result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
287
+ result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
303
288
  } else {
304
- result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
289
+ result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
305
290
  }
291
+
306
292
  xfree(v[0]);
307
293
  xfree(v[1]);
294
+
308
295
  return result;
309
296
  }
310
297
 
@@ -327,26 +314,159 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
327
314
  COMPUTE_LEVENSHTEIN_DISTANCE
328
315
 
329
316
  for (i = 0, min = a_len; i <= b_len; i++) {
330
- if (v[p][i] < min) min = v[p][i];
317
+ if (v[c][i] < min) min = v[c][i];
331
318
  }
332
319
 
333
320
  result = INT2FIX(min);
334
321
 
335
322
  xfree(v[0]);
336
323
  xfree(v[1]);
337
-
324
+
325
+ return result;
326
+ }
327
+
328
+ /*
329
+ * DamerauLevenshtein edit distances are computed here:
330
+ */
331
+
332
+ #define COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE \
333
+ c = 0; \
334
+ p = 0; \
335
+ pp = 0; \
336
+ for (i = 1; i <= a_len; i++) { \
337
+ c = i % 3; /* current row */ \
338
+ p = (i - 1) % 3; /* previous row */ \
339
+ pp = (i - 2) % 3; /* previous previous row */ \
340
+ v[c][0] = i; /* first column */ \
341
+ for (j = 1; j <= b_len; j++) { \
342
+ /* Bellman's principle of optimality: */ \
343
+ weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
344
+ if (weight > v[p][j] + 1) { \
345
+ weight = v[p][j] + 1; \
346
+ } \
347
+ if (weight > v[c][j - 1] + 1) { \
348
+ weight = v[c][j - 1] + 1; \
349
+ } \
350
+ if (i > 2 && j > 2 && a_ptr[i - 1] == b_ptr[j - 2] && a_ptr[i - 2] == b_ptr[j - 1]) {\
351
+ if (weight > v[pp][j - 2]) { \
352
+ weight = v[pp][j - 2] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
353
+ } \
354
+ } \
355
+ v[c][j] = weight; \
356
+ } \
357
+ }
358
+
359
+ static VALUE DamerauLevenshtein_match(General *amatch, VALUE string)
360
+ {
361
+ VALUE result;
362
+ char *a_ptr, *b_ptr;
363
+ int a_len, b_len;
364
+ int *v[3], weight;
365
+ int i, j, c, p, pp;
366
+
367
+ Check_Type(string, T_STRING);
368
+ DONT_OPTIMIZE
369
+
370
+ v[0] = ALLOC_N(int, b_len + 1);
371
+ v[1] = ALLOC_N(int, b_len + 1);
372
+ v[2] = ALLOC_N(int, b_len + 1);
373
+ for (i = 0; i <= b_len; i++) {
374
+ v[0][i] = i;
375
+ v[1][i] = i;
376
+ v[2][i] = i;
377
+ }
378
+
379
+ COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
380
+
381
+ result = INT2FIX(v[c][b_len]);
382
+
383
+ xfree(v[0]);
384
+ xfree(v[1]);
385
+ xfree(v[2]);
386
+
338
387
  return result;
339
388
  }
340
389
 
390
+ static VALUE DamerauLevenshtein_similar(General *amatch, VALUE string)
391
+ {
392
+ VALUE result;
393
+ char *a_ptr, *b_ptr;
394
+ int a_len, b_len;
395
+ int *v[3], weight;
396
+ int i, j, c, p, pp;
397
+
398
+ Check_Type(string, T_STRING);
399
+ DONT_OPTIMIZE
400
+
401
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
402
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
403
+ v[0] = ALLOC_N(int, b_len + 1);
404
+ v[1] = ALLOC_N(int, b_len + 1);
405
+ v[2] = ALLOC_N(int, b_len + 1);
406
+ for (i = 0; i <= b_len; i++) {
407
+ v[0][i] = i;
408
+ v[1][i] = i;
409
+ v[2][i] = i;
410
+ }
411
+
412
+ COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
413
+
414
+ if (b_len > a_len) {
415
+ result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
416
+ } else {
417
+ result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
418
+ }
419
+
420
+ xfree(v[0]);
421
+ xfree(v[1]);
422
+ xfree(v[2]);
423
+
424
+ return result;
425
+ }
426
+
427
+ static VALUE DamerauLevenshtein_search(General *amatch, VALUE string)
428
+ {
429
+ VALUE result;
430
+ char *a_ptr, *b_ptr;
431
+ int a_len, b_len;
432
+ int *v[3], weight, min;
433
+ int i, j, c, p, pp;
434
+
435
+ Check_Type(string, T_STRING);
436
+ DONT_OPTIMIZE
437
+
438
+ v[0] = ALLOC_N(int, b_len + 1);
439
+ v[1] = ALLOC_N(int, b_len + 1);
440
+ v[2] = ALLOC_N(int, b_len + 1);
441
+ MEMZERO(v[0], int, b_len + 1);
442
+ MEMZERO(v[1], int, b_len + 1);
443
+ MEMZERO(v[2], int, b_len + 1);
444
+
445
+ COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
446
+
447
+ for (i = 0, min = a_len; i <= b_len; i++) {
448
+ if (v[c][i] < min) min = v[c][i];
449
+ }
450
+
451
+ result = INT2FIX(min);
452
+
453
+ xfree(v[0]);
454
+ xfree(v[1]);
455
+ xfree(v[2]);
456
+
457
+ return result;
458
+ }
341
459
 
342
460
  /*
343
461
  * Sellers edit distances are computed here:
344
462
  */
345
463
 
346
464
  #define COMPUTE_SELLERS_DISTANCE \
347
- for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
465
+ c = 0; \
466
+ p = 0; \
467
+ for (i = 1; i <= a_len; i++) { \
348
468
  c = i % 2; /* current row */ \
349
- p = (i + 1) % 2; /* previous row */ \
469
+ p = (i - 1) % 2; /* previous row */ \
350
470
  v[c][0] = i * amatch->deletion; /* first column */ \
351
471
  for (j = 1; j <= b_len; j++) { \
352
472
  /* Bellman's principle of optimality: */ \
@@ -361,7 +481,6 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
361
481
  v[c][j] = weight; \
362
482
  } \
363
483
  p = c; \
364
- c = (c + 1) % 2; \
365
484
  }
366
485
 
367
486
  static VALUE Sellers_match(Sellers *amatch, VALUE string)
@@ -411,9 +530,10 @@ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
411
530
  max_weight = amatch->deletion;
412
531
  }
413
532
  }
414
-
533
+
415
534
  Check_Type(string, T_STRING);
416
535
  DONT_OPTIMIZE
536
+
417
537
  if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
418
538
  if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
419
539
  v[0] = ALLOC_N(double, b_len + 1);
@@ -459,7 +579,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
459
579
  result = rb_float_new(min);
460
580
  xfree(v[0]);
461
581
  xfree(v[1]);
462
-
582
+
463
583
  return result;
464
584
  }
465
585
 
@@ -470,34 +590,32 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
470
590
  static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
471
591
  {
472
592
  double result;
473
- VALUE tokens;
474
- PairArray *pair_array;
475
-
593
+ VALUE string_tokens, tokens;
594
+ PairArray *pattern_pair_array, *pair_array;
595
+
476
596
  Check_Type(string, T_STRING);
477
597
  if (!NIL_P(regexp) || use_regexp) {
478
598
  tokens = rb_funcall(
479
599
  rb_str_new(amatch->pattern, amatch->pattern_len),
480
600
  id_split, 1, regexp
481
601
  );
482
- if (!amatch->pattern_pair_array) {
483
- amatch->pattern_pair_array = PairArray_new(tokens);
484
- } else {
485
- pair_array_reactivate(amatch->pattern_pair_array);
486
- }
487
- tokens = rb_funcall(string, id_split, 1, regexp);
488
- pair_array = PairArray_new(tokens);
602
+ string_tokens = rb_funcall(string, id_split, 1, regexp);
489
603
  } else {
490
604
  VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
491
605
  tokens = rb_ary_new4(1, &tmp);
492
- if (!amatch->pattern_pair_array) {
493
- amatch->pattern_pair_array = PairArray_new(tokens);
494
- } else {
495
- pair_array_reactivate(amatch->pattern_pair_array);
496
- }
497
- tokens = rb_ary_new4(1, &string);
498
- pair_array = PairArray_new(tokens);
606
+ string_tokens = rb_ary_new4(1, &string);
607
+ }
608
+
609
+ if (!amatch->pattern_pair_array) {
610
+ pattern_pair_array = PairArray_new(tokens);
611
+ amatch->pattern_pair_array = pattern_pair_array;
612
+ } else {
613
+ pattern_pair_array = amatch->pattern_pair_array;
614
+ pair_array_reactivate(amatch->pattern_pair_array);
499
615
  }
500
- result = pair_array_match(amatch->pattern_pair_array, pair_array);
616
+ pair_array = PairArray_new(string_tokens);
617
+
618
+ result = pair_array_match(pattern_pair_array, pair_array);
501
619
  pair_array_destroy(pair_array);
502
620
  return rb_float_new(result);
503
621
  }
@@ -520,7 +638,7 @@ static VALUE Hamming_match(General *amatch, VALUE string)
520
638
  char *a_ptr, *b_ptr;
521
639
  int a_len, b_len;
522
640
  int i, result;
523
-
641
+
524
642
  Check_Type(string, T_STRING);
525
643
  OPTIMIZE_TIME
526
644
  COMPUTE_HAMMING_DISTANCE
@@ -532,7 +650,7 @@ static VALUE Hamming_similar(General *amatch, VALUE string)
532
650
  char *a_ptr, *b_ptr;
533
651
  int a_len, b_len;
534
652
  int i, result;
535
-
653
+
536
654
  Check_Type(string, T_STRING);
537
655
  OPTIMIZE_TIME
538
656
  if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
@@ -572,7 +690,7 @@ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
572
690
  char *a_ptr, *b_ptr;
573
691
  int a_len, b_len;
574
692
  int result, c, p, i, j, *l[2];
575
-
693
+
576
694
  Check_Type(string, T_STRING);
577
695
  OPTIMIZE_TIME
578
696
 
@@ -586,7 +704,7 @@ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
586
704
  char *a_ptr, *b_ptr;
587
705
  int a_len, b_len;
588
706
  int result, c, p, i, j, *l[2];
589
-
707
+
590
708
  Check_Type(string, T_STRING);
591
709
  OPTIMIZE_TIME
592
710
 
@@ -626,7 +744,7 @@ static VALUE LongestSubstring_match(General *amatch, VALUE string)
626
744
  char *a_ptr, *b_ptr;
627
745
  int a_len, b_len;
628
746
  int result, c, p, i, j, *l[2];
629
-
747
+
630
748
  Check_Type(string, T_STRING);
631
749
  OPTIMIZE_TIME
632
750
  if (a_len == 0 || b_len == 0) return INT2FIX(0);
@@ -639,7 +757,7 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
639
757
  char *a_ptr, *b_ptr;
640
758
  int a_len, b_len;
641
759
  int result, c, p, i, j, *l[2];
642
-
760
+
643
761
  Check_Type(string, T_STRING);
644
762
  OPTIMIZE_TIME
645
763
  if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
@@ -771,7 +889,7 @@ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
771
889
  * Ruby API
772
890
  */
773
891
 
774
- /*
892
+ /*
775
893
  * Document-class: Amatch::Levenshtein
776
894
  *
777
895
  * The Levenshtein edit distance is defined as the minimal costs involved to
@@ -804,7 +922,7 @@ DEF_CONSTRUCTOR(Levenshtein, General)
804
922
 
805
923
  /*
806
924
  * call-seq: match(strings) -> results
807
- *
925
+ *
808
926
  * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
809
927
  * against <code>strings</code>. It returns the number of operations, the Levenshtein
810
928
  * distance. <code>strings</code> has to be either a String or an Array of
@@ -812,14 +930,14 @@ DEF_CONSTRUCTOR(Levenshtein, General)
812
930
  * Floats respectively.
813
931
  */
814
932
  static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
815
- {
933
+ {
816
934
  GET_STRUCT(General)
817
935
  return General_iterate_strings(amatch, strings, Levenshtein_match);
818
936
  }
819
937
 
820
938
  /*
821
939
  * call-seq: similar(strings) -> results
822
- *
940
+ *
823
941
  * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
824
942
  * against <code>strings</code>, and compute a Levenshtein distance metric
825
943
  * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
@@ -828,14 +946,14 @@ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
828
946
  * respectively.
829
947
  */
830
948
  static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
831
- {
949
+ {
832
950
  GET_STRUCT(General)
833
951
  return General_iterate_strings(amatch, strings, Levenshtein_similar);
834
952
  }
835
953
 
836
954
  /*
837
955
  * call-seq: levenshtein_similar(strings) -> results
838
- *
956
+ *
839
957
  * If called on a String, this string is used as a Amatch::Levenshtein#pattern
840
958
  * to match against <code>strings</code>. It returns a Levenshtein distance
841
959
  * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
@@ -851,7 +969,7 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
851
969
 
852
970
  /*
853
971
  * call-seq: search(strings) -> results
854
- *
972
+ *
855
973
  * searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
856
974
  * edit distance (the sum of character operations) as a Fixnum value, by greedy
857
975
  * trimming prefixes or postfixes of the match. <code>strings</code> has
@@ -859,12 +977,105 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
859
977
  * <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
860
978
  */
861
979
  static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
862
- {
980
+ {
863
981
  GET_STRUCT(General)
864
982
  return General_iterate_strings(amatch, strings, Levenshtein_search);
865
983
  }
866
984
 
867
- /*
985
+ /*
986
+ * Document-class: Amatch::DamerauLevenshtein
987
+ *
988
+ * The DamerauLevenshtein edit distance is defined as the minimal costs
989
+ * involved to transform one string into another by using four elementary
990
+ * operations: deletion, insertion and substitution of a character, and transposition of two adjacent characters. To
991
+ * transform "water" into "wine", for instance, you have to substitute "a" ->
992
+ * "i": "witer", "t" -> "n": "winer" and delete "r": "wine". The edit distance
993
+ * between "water" and "wine" is 3, because you have to apply three
994
+ * operations. The edit distance between "wine" and "wine" is 0 of course: no
995
+ * operation is necessary for the transformation -- they're already the same
996
+ * string. It's easy to see that more similar strings have smaller edit
997
+ * distances than strings that differ a lot.
998
+ */
999
+
1000
+ DEF_RB_FREE(DamerauLevenshtein, General)
1001
+
1002
+ /*
1003
+ * call-seq: new(pattern)
1004
+ *
1005
+ * Creates a new Amatch::DamerauLevenshtein instance from <code>pattern</code>.
1006
+ */
1007
+ static VALUE rb_DamerauLevenshtein_initialize(VALUE self, VALUE pattern)
1008
+ {
1009
+ GET_STRUCT(General)
1010
+ General_pattern_set(amatch, pattern);
1011
+ return self;
1012
+ }
1013
+
1014
+ DEF_CONSTRUCTOR(DamerauLevenshtein, General)
1015
+
1016
+ /*
1017
+ * call-seq: match(strings) -> results
1018
+ *
1019
+ * Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
1020
+ * against <code>strings</code>. It returns the number of operations, the DamerauLevenshtein
1021
+ * distance. <code>strings</code> has to be either a String or an Array of
1022
+ * Strings. The returned <code>results</code> is either a Fixnum or an Array of
1023
+ * Fixnums respectively.
1024
+ */
1025
+ static VALUE rb_DamerauLevenshtein_match(VALUE self, VALUE strings)
1026
+ {
1027
+ GET_STRUCT(General)
1028
+ return General_iterate_strings(amatch, strings, DamerauLevenshtein_match);
1029
+ }
1030
+
1031
+ /*
1032
+ * call-seq: similar(strings) -> results
1033
+ *
1033
+ * Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
1034
+ * against <code>strings</code>, and compute a DamerauLevenshtein distance metric
1035
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1036
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1037
+ * returned <code>results</code> is either a Float or an Array of Floats
1039
+ * respectively.
1040
+ */
1041
+ static VALUE rb_DamerauLevenshtein_similar(VALUE self, VALUE strings)
1042
+ {
1043
+ GET_STRUCT(General)
1044
+ return General_iterate_strings(amatch, strings, DamerauLevenshtein_similar);
1045
+ }
1046
+
1047
+ /*
1048
+ * call-seq: damerau_levenshtein_similar(strings) -> results
1049
+ *
1050
+ * If called on a String, this string is used as a Amatch::DamerauLevenshtein#pattern
1051
+ * to match against <code>strings</code>. It returns a DamerauLevenshtein distance
1052
+ * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
1053
+ * match. <code>strings</code> has to be either a String or an Array of
1054
+ * Strings. The returned <code>results</code> is either a Float or an Array of
1055
+ * Floats respectively.
1056
+ */
1057
+ static VALUE rb_str_damerau_levenshtein_similar(VALUE self, VALUE strings)
1058
+ {
1059
+ VALUE amatch = rb_DamerauLevenshtein_new(rb_cDamerauLevenshtein, self);
1060
+ return rb_DamerauLevenshtein_similar(amatch, strings);
1061
+ }
1062
+
1063
+ /*
1064
+ * call-seq: search(strings) -> results
1065
+ *
1066
+ * Searches Amatch::DamerauLevenshtein#pattern in <code>strings</code> and returns the
1067
+ * edit distance (the sum of character operations) as a Fixnum value, by greedily
1068
+ * trimming prefixes or postfixes of the match. <code>strings</code> has
1069
+ * to be either a String or an Array of Strings. The returned
1070
+ * <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
1071
+ */
1072
+ static VALUE rb_DamerauLevenshtein_search(VALUE self, VALUE strings)
1073
+ {
1074
+ GET_STRUCT(General)
1075
+ return General_iterate_strings(amatch, strings, DamerauLevenshtein_search);
1076
+ }
1077
+
1078
+ /*
868
1079
  * Document-class: Amatch::Sellers
869
1080
  *
870
1081
  * The Sellers edit distance is very similar to the Levenshtein edit distance.
@@ -983,14 +1194,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
983
1194
  * Document-method: pattern=
984
1195
  *
985
1196
  * call-seq: pattern=(pattern)
986
- *
1197
+ *
987
1198
  * Sets the current pattern string of this Amatch::Sellers instance to
988
1199
  * <code>pattern</code>.
989
1200
  */
990
1201
 
991
1202
  /*
992
1203
  * call-seq: match(strings) -> results
993
- *
1204
+ *
994
1205
  * Uses this Amatch::Sellers instance to match Sellers#pattern against
995
1206
  * <code>strings</code>, while taking into account the given weights. It
996
1207
  * returns the number of weighted character operations, the Sellers distance.
@@ -999,14 +1210,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
999
1210
  * respectively.
1000
1211
  */
1001
1212
  static VALUE rb_Sellers_match(VALUE self, VALUE strings)
1002
- {
1213
+ {
1003
1214
  GET_STRUCT(Sellers)
1004
1215
  return Sellers_iterate_strings(amatch, strings, Sellers_match);
1005
1216
  }
1006
1217
 
1007
1218
  /*
1008
1219
  * call-seq: similar(strings) -> results
1009
- *
1220
+ *
1010
1221
  * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
1011
1222
  * against <code>strings</code> (taking into account the given weights), and
1012
1223
  * compute a Sellers distance metric number between 0.0 for very unsimilar
@@ -1016,7 +1227,7 @@ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
1016
1227
  * respectively.
1017
1228
  */
1018
1229
  static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
1019
- {
1230
+ {
1020
1231
  GET_STRUCT(Sellers)
1021
1232
  return Sellers_iterate_strings(amatch, strings, Sellers_similar);
1022
1233
  }
@@ -1031,12 +1242,12 @@ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
1031
1242
  * <code>results</code> is either a Float or an Array of Floats respectively.
1032
1243
  */
1033
1244
  static VALUE rb_Sellers_search(VALUE self, VALUE strings)
1034
- {
1245
+ {
1035
1246
  GET_STRUCT(Sellers)
1036
1247
  return Sellers_iterate_strings(amatch, strings, Sellers_search);
1037
1248
  }
1038
1249
 
1039
- /*
1250
+ /*
1040
1251
  * Document-class: Amatch::PairDistance
1041
1252
  *
1042
1253
  * The pair distance between two strings is based on the number of adjacent
@@ -1047,7 +1258,7 @@ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
1047
1258
  * are more dissimilar. The advantage of considering adjacent characters, is to
1048
1259
  * take account not only of the characters, but also of the character ordering
1049
1260
  * in the original strings.
1050
- *
1261
+ *
1051
1262
  * This metric is very capable to find similarities in natural languages.
1052
1263
  * It is explained in more detail in Simon White's article "How to Strike a
1053
1264
  * Match", located at this url:
@@ -1074,7 +1285,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
1074
1285
 
1075
1286
  /*
1076
1287
  * call-seq: match(strings, regexp = /\s+/) -> results
1077
- *
1288
+ *
1078
1289
  * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
1079
1290
  * <code>strings</code>. It returns the pair distance measure, that is a
1080
1291
  * returned value of 1.0 is an exact match, partial matches are lower
@@ -1090,7 +1301,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
1090
1301
  * Array of Floats respectively.
1091
1302
  */
1092
1303
  static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
1093
- {
1304
+ {
1094
1305
  VALUE result, strings, regexp = Qnil;
1095
1306
  int use_regexp;
1096
1307
  GET_STRUCT(PairDistance)
@@ -1148,7 +1359,7 @@ static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
1148
1359
  }
1149
1360
  }
1150
1361
 
1151
- /*
1362
+ /*
1152
1363
  * Document-class: Amatch::Hamming
1153
1364
  *
1154
1365
  * This class computes the Hamming distance between two strings.
@@ -1178,7 +1389,7 @@ DEF_CONSTRUCTOR(Hamming, General)
1178
1389
 
1179
1390
  /*
1180
1391
  * call-seq: match(strings) -> results
1181
- *
1392
+ *
1182
1393
  * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1183
1394
  * <code>strings</code>, that is compute the hamming distance between
1184
1395
  * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
@@ -1186,7 +1397,7 @@ DEF_CONSTRUCTOR(Hamming, General)
1186
1397
  * is either a Fixnum or an Array of Fixnums respectively.
1187
1398
  */
1188
1399
  static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1189
- {
1400
+ {
1190
1401
  GET_STRUCT(General)
1191
1402
  return General_iterate_strings(amatch, strings, Hamming_match);
1192
1403
  }
@@ -1202,7 +1413,7 @@ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1202
1413
  * respectively.
1203
1414
  */
1204
1415
  static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1205
- {
1416
+ {
1206
1417
  GET_STRUCT(General)
1207
1418
  return General_iterate_strings(amatch, strings, Hamming_similar);
1208
1419
  }
@@ -1224,7 +1435,7 @@ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
1224
1435
  }
1225
1436
 
1226
1437
 
1227
- /*
1438
+ /*
1228
1439
  * Document-class: Amatch::LongestSubsequence
1229
1440
  *
1230
1441
  * This class computes the length of the longest subsequence common to two
@@ -1254,7 +1465,7 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
1254
1465
 
1255
1466
  /*
1256
1467
  * call-seq: match(strings) -> results
1257
- *
1468
+ *
1258
1469
  * Uses this Amatch::LongestSubsequence instance to match
1259
1470
  * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1260
1471
  * length of the longest common subsequence. <code>strings</code> has to be
@@ -1262,14 +1473,14 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
1262
1473
  * is either a Fixnum or an Array of Fixnums respectively.
1263
1474
  */
1264
1475
  static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1265
- {
1476
+ {
1266
1477
  GET_STRUCT(General)
1267
1478
  return General_iterate_strings(amatch, strings, LongestSubsequence_match);
1268
1479
  }
1269
1480
 
1270
1481
  /*
1271
1482
  * call-seq: similar(strings) -> results
1272
- *
1483
+ *
1273
1484
  * Uses this Amatch::LongestSubsequence instance to match
1274
1485
  * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1275
1486
  * a longest substring distance metric number between 0.0 for very unsimilar
@@ -1278,7 +1489,7 @@ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1278
1489
  * a Fixnum or an Array of Fixnums
1279
1490
  */
1280
1491
  static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1281
- {
1492
+ {
1282
1493
  GET_STRUCT(General)
1283
1494
  return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
1284
1495
  }
@@ -1294,12 +1505,12 @@ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1294
1505
  * is either a Float or an Array of Floats respectively.
1295
1506
  */
1296
1507
  static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1297
- {
1508
+ {
1298
1509
  VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
1299
1510
  return rb_LongestSubsequence_similar(amatch, strings);
1300
1511
  }
1301
1512
 
1302
- /*
1513
+ /*
1303
1514
  * Document-class: Amatch::LongestSubstring
1304
1515
  *
1305
1516
  * The longest common substring is the longest substring, that is part of
@@ -1310,7 +1521,7 @@ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1310
1521
  * The longest common substring between 'string' and 'string' is 'string'
1311
1522
  * again, thus the longest common substring length is 6. The longest common
1312
1523
  * substring between 'string' and 'storing' is 'ring', thus the longest common
1313
- * substring length is 4.
1524
+ * substring length is 4.
1314
1525
  */
1315
1526
 
1316
1527
  DEF_RB_FREE(LongestSubstring, General)
@@ -1331,7 +1542,7 @@ DEF_CONSTRUCTOR(LongestSubstring, General)
1331
1542
 
1332
1543
  /*
1333
1544
  * call-seq: match(strings) -> results
1334
- *
1545
+ *
1335
1546
  * Uses this Amatch::LongestSubstring instance to match
1336
1547
  * LongestSubstring#pattern against <code>strings</code>, that is compute the
1337
1548
  * length of the longest common substring. <code>strings</code> has to be
@@ -1346,7 +1557,7 @@ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1346
1557
 
1347
1558
  /*
1348
1559
  * call-seq: similar(strings) -> results
1349
- *
1560
+ *
1350
1561
  * Uses this Amatch::LongestSubstring instance to match
1351
1562
  * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1352
1563
  * longest substring distance metric number between 0.0 for very unsimilar
@@ -1372,11 +1583,11 @@ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
1372
1583
  * is either a Float or an Array of Floats respectively.
1373
1584
  */
1374
1585
  static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1375
- {
1586
+ {
1376
1587
  VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
1377
1588
  return rb_LongestSubstring_similar(amatch, strings);
1378
1589
  }
1379
-
1590
+
1380
1591
  /*
1381
1592
  * Document-class: Amatch::Jaro
1382
1593
  *
@@ -1573,6 +1784,17 @@ void Init_amatch_ext()
1573
1784
  rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1574
1785
  rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1575
1786
 
1787
+ /* DamerauLevenshtein */
1788
+ rb_cDamerauLevenshtein = rb_define_class_under(rb_mAmatch, "DamerauLevenshtein", rb_cObject);
1789
+ rb_define_alloc_func(rb_cDamerauLevenshtein, rb_DamerauLevenshtein_s_allocate);
1790
+ rb_define_method(rb_cDamerauLevenshtein, "initialize", rb_DamerauLevenshtein_initialize, 1);
1791
+ rb_define_method(rb_cDamerauLevenshtein, "pattern", rb_General_pattern, 0);
1792
+ rb_define_method(rb_cDamerauLevenshtein, "pattern=", rb_General_pattern_set, 1);
1793
+ rb_define_method(rb_cDamerauLevenshtein, "match", rb_DamerauLevenshtein_match, 1);
1794
+ rb_define_method(rb_cDamerauLevenshtein, "search", rb_DamerauLevenshtein_search, 1);
1795
+ rb_define_method(rb_cDamerauLevenshtein, "similar", rb_DamerauLevenshtein_similar, 1);
1796
+ rb_define_method(rb_mAmatchStringMethods, "damerau_levenshtein_similar", rb_str_damerau_levenshtein_similar, 1);
1797
+
1576
1798
  /* Sellers */
1577
1799
  rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1578
1800
  rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);