amatch 0.3.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/ext/amatch_ext.c CHANGED
@@ -3,24 +3,8 @@
3
3
  #include <ctype.h>
4
4
  #include "common.h"
5
5
 
6
- /*
7
- * Document-method: pattern
8
- *
9
- * call-seq: pattern -> pattern string
10
- *
11
- * Returns the current pattern string of this instance.
12
- */
13
-
14
- /*
15
- * Document-method: pattern=
16
- *
17
- * call-seq: pattern=(pattern)
18
- *
19
- * Sets the current pattern string of this instance to <code>pattern</code>.
20
- */
21
-
22
-
23
- static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
6
+ static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein,
7
+ rb_cDamerauLevenshtein, rb_cSellers, rb_cHamming,
24
8
  rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
25
9
  rb_cJaro, rb_cJaroWinkler;
26
10
 
@@ -230,9 +214,11 @@ DEF_ITERATE_STRINGS(JaroWinkler)
230
214
  */
231
215
 
232
216
  #define COMPUTE_LEVENSHTEIN_DISTANCE \
233
- for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
217
+ c = 0; \
218
+ p = 0; \
219
+ for (i = 1; i <= a_len; i++) { \
234
220
  c = i % 2; /* current row */ \
235
- p = (i + 1) % 2; /* previous row */ \
221
+ p = (i - 1) % 2; /* previous row */ \
236
222
  v[c][0] = i; /* first column */ \
237
223
  for (j = 1; j <= b_len; j++) { \
238
224
  /* Bellman's principle of optimality: */ \
@@ -245,8 +231,6 @@ DEF_ITERATE_STRINGS(JaroWinkler)
245
231
  } \
246
232
  v[c][j] = weight; \
247
233
  } \
248
- p = c; \
249
- c = (c + 1) % 2; \
250
234
  }
251
235
 
252
236
  static VALUE Levenshtein_match(General *amatch, VALUE string)
@@ -269,7 +253,7 @@ static VALUE Levenshtein_match(General *amatch, VALUE string)
269
253
 
270
254
  COMPUTE_LEVENSHTEIN_DISTANCE
271
255
 
272
- result = INT2FIX(v[p][b_len]);
256
+ result = INT2FIX(v[c][b_len]);
273
257
 
274
258
  xfree(v[0]);
275
259
  xfree(v[1]);
@@ -287,6 +271,7 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
287
271
 
288
272
  Check_Type(string, T_STRING);
289
273
  DONT_OPTIMIZE
274
+
290
275
  if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
291
276
  if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
292
277
  v[0] = ALLOC_N(int, b_len + 1);
@@ -299,12 +284,14 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
299
284
  COMPUTE_LEVENSHTEIN_DISTANCE
300
285
 
301
286
  if (b_len > a_len) {
302
- result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
287
+ result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
303
288
  } else {
304
- result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
289
+ result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
305
290
  }
291
+
306
292
  xfree(v[0]);
307
293
  xfree(v[1]);
294
+
308
295
  return result;
309
296
  }
310
297
 
@@ -327,26 +314,159 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
327
314
  COMPUTE_LEVENSHTEIN_DISTANCE
328
315
 
329
316
  for (i = 0, min = a_len; i <= b_len; i++) {
330
- if (v[p][i] < min) min = v[p][i];
317
+ if (v[c][i] < min) min = v[c][i];
331
318
  }
332
319
 
333
320
  result = INT2FIX(min);
334
321
 
335
322
  xfree(v[0]);
336
323
  xfree(v[1]);
337
-
324
+
325
+ return result;
326
+ }
327
+
328
+ /*
329
+ * DamerauLevenshtein edit distances are computed here:
330
+ */
331
+
332
+ #define COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE \
333
+ c = 0; \
334
+ p = 0; \
335
+ pp = 0; \
336
+ for (i = 1; i <= a_len; i++) { \
337
+ c = i % 3; /* current row */ \
338
+ p = (i - 1) % 3; /* previous row */ \
339
+ pp = (i - 2) % 3; /* previous previous row */ \
340
+ v[c][0] = i; /* first column */ \
341
+ for (j = 1; j <= b_len; j++) { \
342
+ /* Bellman's principle of optimality: */ \
343
+ weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
344
+ if (weight > v[p][j] + 1) { \
345
+ weight = v[p][j] + 1; \
346
+ } \
347
+ if (weight > v[c][j - 1] + 1) { \
348
+ weight = v[c][j - 1] + 1; \
349
+ } \
350
+ if (i > 2 && j > 2 && a_ptr[i - 1] == b_ptr[j - 2] && a_ptr[i - 2] == b_ptr[j - 1]) {\
351
+ if (weight > v[pp][j - 2]) { \
352
+ weight = v[pp][j - 2] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
353
+ } \
354
+ } \
355
+ v[c][j] = weight; \
356
+ } \
357
+ }
358
+
359
+ static VALUE DamerauLevenshtein_match(General *amatch, VALUE string)
360
+ {
361
+ VALUE result;
362
+ char *a_ptr, *b_ptr;
363
+ int a_len, b_len;
364
+ int *v[3], weight;
365
+ int i, j, c, p, pp;
366
+
367
+ Check_Type(string, T_STRING);
368
+ DONT_OPTIMIZE
369
+
370
+ v[0] = ALLOC_N(int, b_len + 1);
371
+ v[1] = ALLOC_N(int, b_len + 1);
372
+ v[2] = ALLOC_N(int, b_len + 1);
373
+ for (i = 0; i <= b_len; i++) {
374
+ v[0][i] = i;
375
+ v[1][i] = i;
376
+ v[2][i] = i;
377
+ }
378
+
379
+ COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
380
+
381
+ result = INT2FIX(v[c][b_len]);
382
+
383
+ xfree(v[0]);
384
+ xfree(v[1]);
385
+ xfree(v[2]);
386
+
338
387
  return result;
339
388
  }
340
389
 
390
+ static VALUE DamerauLevenshtein_similar(General *amatch, VALUE string)
391
+ {
392
+ VALUE result;
393
+ char *a_ptr, *b_ptr;
394
+ int a_len, b_len;
395
+ int *v[3], weight;
396
+ int i, j, c, p, pp;
397
+
398
+ Check_Type(string, T_STRING);
399
+ DONT_OPTIMIZE
400
+
401
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
402
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
403
+ v[0] = ALLOC_N(int, b_len + 1);
404
+ v[1] = ALLOC_N(int, b_len + 1);
405
+ v[2] = ALLOC_N(int, b_len + 1);
406
+ for (i = 0; i <= b_len; i++) {
407
+ v[0][i] = i;
408
+ v[1][i] = i;
409
+ v[2][i] = i;
410
+ }
411
+
412
+ COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
413
+
414
+ if (b_len > a_len) {
415
+ result = rb_float_new(1.0 - ((double) v[c][b_len]) / b_len);
416
+ } else {
417
+ result = rb_float_new(1.0 - ((double) v[c][b_len]) / a_len);
418
+ }
419
+
420
+ xfree(v[0]);
421
+ xfree(v[1]);
422
+ xfree(v[2]);
423
+
424
+ return result;
425
+ }
426
+
427
+ static VALUE DamerauLevenshtein_search(General *amatch, VALUE string)
428
+ {
429
+ VALUE result;
430
+ char *a_ptr, *b_ptr;
431
+ int a_len, b_len;
432
+ int *v[3], weight, min;
433
+ int i, j, c, p, pp;
434
+
435
+ Check_Type(string, T_STRING);
436
+ DONT_OPTIMIZE
437
+
438
+ v[0] = ALLOC_N(int, b_len + 1);
439
+ v[1] = ALLOC_N(int, b_len + 1);
440
+ v[2] = ALLOC_N(int, b_len + 1);
441
+ MEMZERO(v[0], int, b_len + 1);
442
+ MEMZERO(v[1], int, b_len + 1);
443
+ MEMZERO(v[2], int, b_len + 1);
444
+
445
+ COMPUTE_DAMERAU_LEVENSHTEIN_DISTANCE
446
+
447
+ for (i = 0, min = a_len; i <= b_len; i++) {
448
+ if (v[c][i] < min) min = v[c][i];
449
+ }
450
+
451
+ result = INT2FIX(min);
452
+
453
+ xfree(v[0]);
454
+ xfree(v[1]);
455
+ xfree(v[2]);
456
+
457
+ return result;
458
+ }
341
459
 
342
460
  /*
343
461
  * Sellers edit distances are computed here:
344
462
  */
345
463
 
346
464
  #define COMPUTE_SELLERS_DISTANCE \
347
- for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
465
+ c = 0; \
466
+ p = 0; \
467
+ for (i = 1; i <= a_len; i++) { \
348
468
  c = i % 2; /* current row */ \
349
- p = (i + 1) % 2; /* previous row */ \
469
+ p = (i - 1) % 2; /* previous row */ \
350
470
  v[c][0] = i * amatch->deletion; /* first column */ \
351
471
  for (j = 1; j <= b_len; j++) { \
352
472
  /* Bellman's principle of optimality: */ \
@@ -361,7 +481,6 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
361
481
  v[c][j] = weight; \
362
482
  } \
363
483
  p = c; \
364
- c = (c + 1) % 2; \
365
484
  }
366
485
 
367
486
  static VALUE Sellers_match(Sellers *amatch, VALUE string)
@@ -411,9 +530,10 @@ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
411
530
  max_weight = amatch->deletion;
412
531
  }
413
532
  }
414
-
533
+
415
534
  Check_Type(string, T_STRING);
416
535
  DONT_OPTIMIZE
536
+
417
537
  if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
418
538
  if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
419
539
  v[0] = ALLOC_N(double, b_len + 1);
@@ -459,7 +579,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
459
579
  result = rb_float_new(min);
460
580
  xfree(v[0]);
461
581
  xfree(v[1]);
462
-
582
+
463
583
  return result;
464
584
  }
465
585
 
@@ -470,34 +590,32 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
470
590
  static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
471
591
  {
472
592
  double result;
473
- VALUE tokens;
474
- PairArray *pair_array;
475
-
593
+ VALUE string_tokens, tokens;
594
+ PairArray *pattern_pair_array, *pair_array;
595
+
476
596
  Check_Type(string, T_STRING);
477
597
  if (!NIL_P(regexp) || use_regexp) {
478
598
  tokens = rb_funcall(
479
599
  rb_str_new(amatch->pattern, amatch->pattern_len),
480
600
  id_split, 1, regexp
481
601
  );
482
- if (!amatch->pattern_pair_array) {
483
- amatch->pattern_pair_array = PairArray_new(tokens);
484
- } else {
485
- pair_array_reactivate(amatch->pattern_pair_array);
486
- }
487
- tokens = rb_funcall(string, id_split, 1, regexp);
488
- pair_array = PairArray_new(tokens);
602
+ string_tokens = rb_funcall(string, id_split, 1, regexp);
489
603
  } else {
490
604
  VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
491
605
  tokens = rb_ary_new4(1, &tmp);
492
- if (!amatch->pattern_pair_array) {
493
- amatch->pattern_pair_array = PairArray_new(tokens);
494
- } else {
495
- pair_array_reactivate(amatch->pattern_pair_array);
496
- }
497
- tokens = rb_ary_new4(1, &string);
498
- pair_array = PairArray_new(tokens);
606
+ string_tokens = rb_ary_new4(1, &string);
607
+ }
608
+
609
+ if (!amatch->pattern_pair_array) {
610
+ pattern_pair_array = PairArray_new(tokens);
611
+ amatch->pattern_pair_array = pattern_pair_array;
612
+ } else {
613
+ pattern_pair_array = amatch->pattern_pair_array;
614
+ pair_array_reactivate(amatch->pattern_pair_array);
499
615
  }
500
- result = pair_array_match(amatch->pattern_pair_array, pair_array);
616
+ pair_array = PairArray_new(string_tokens);
617
+
618
+ result = pair_array_match(pattern_pair_array, pair_array);
501
619
  pair_array_destroy(pair_array);
502
620
  return rb_float_new(result);
503
621
  }
@@ -520,7 +638,7 @@ static VALUE Hamming_match(General *amatch, VALUE string)
520
638
  char *a_ptr, *b_ptr;
521
639
  int a_len, b_len;
522
640
  int i, result;
523
-
641
+
524
642
  Check_Type(string, T_STRING);
525
643
  OPTIMIZE_TIME
526
644
  COMPUTE_HAMMING_DISTANCE
@@ -532,7 +650,7 @@ static VALUE Hamming_similar(General *amatch, VALUE string)
532
650
  char *a_ptr, *b_ptr;
533
651
  int a_len, b_len;
534
652
  int i, result;
535
-
653
+
536
654
  Check_Type(string, T_STRING);
537
655
  OPTIMIZE_TIME
538
656
  if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
@@ -572,7 +690,7 @@ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
572
690
  char *a_ptr, *b_ptr;
573
691
  int a_len, b_len;
574
692
  int result, c, p, i, j, *l[2];
575
-
693
+
576
694
  Check_Type(string, T_STRING);
577
695
  OPTIMIZE_TIME
578
696
 
@@ -586,7 +704,7 @@ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
586
704
  char *a_ptr, *b_ptr;
587
705
  int a_len, b_len;
588
706
  int result, c, p, i, j, *l[2];
589
-
707
+
590
708
  Check_Type(string, T_STRING);
591
709
  OPTIMIZE_TIME
592
710
 
@@ -626,7 +744,7 @@ static VALUE LongestSubstring_match(General *amatch, VALUE string)
626
744
  char *a_ptr, *b_ptr;
627
745
  int a_len, b_len;
628
746
  int result, c, p, i, j, *l[2];
629
-
747
+
630
748
  Check_Type(string, T_STRING);
631
749
  OPTIMIZE_TIME
632
750
  if (a_len == 0 || b_len == 0) return INT2FIX(0);
@@ -639,7 +757,7 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
639
757
  char *a_ptr, *b_ptr;
640
758
  int a_len, b_len;
641
759
  int result, c, p, i, j, *l[2];
642
-
760
+
643
761
  Check_Type(string, T_STRING);
644
762
  OPTIMIZE_TIME
645
763
  if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
@@ -771,7 +889,7 @@ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
771
889
  * Ruby API
772
890
  */
773
891
 
774
- /*
892
+ /*
775
893
  * Document-class: Amatch::Levenshtein
776
894
  *
777
895
  * The Levenshtein edit distance is defined as the minimal costs involved to
@@ -804,7 +922,7 @@ DEF_CONSTRUCTOR(Levenshtein, General)
804
922
 
805
923
  /*
806
924
  * call-seq: match(strings) -> results
807
- *
925
+ *
808
926
  * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
809
927
  * against <code>strings</code>. It returns the number operations, the Sellers
810
928
  * distance. <code>strings</code> has to be either a String or an Array of
@@ -812,14 +930,14 @@ DEF_CONSTRUCTOR(Levenshtein, General)
812
930
  * Floats respectively.
813
931
  */
814
932
  static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
815
- {
933
+ {
816
934
  GET_STRUCT(General)
817
935
  return General_iterate_strings(amatch, strings, Levenshtein_match);
818
936
  }
819
937
 
820
938
  /*
821
939
  * call-seq: similar(strings) -> results
822
- *
940
+ *
823
941
  * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
824
942
  * against <code>strings</code>, and compute a Levenshtein distance metric
825
943
  * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
@@ -828,14 +946,14 @@ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
828
946
  * respectively.
829
947
  */
830
948
  static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
831
- {
949
+ {
832
950
  GET_STRUCT(General)
833
951
  return General_iterate_strings(amatch, strings, Levenshtein_similar);
834
952
  }
835
953
 
836
954
  /*
837
955
  * call-seq: levenshtein_similar(strings) -> results
838
- *
956
+ *
839
957
  * If called on a String, this string is used as a Amatch::Levenshtein#pattern
840
958
  * to match against <code>strings</code>. It returns a Levenshtein distance
841
959
  * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
@@ -851,7 +969,7 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
851
969
 
852
970
  /*
853
971
  * call-seq: search(strings) -> results
854
- *
972
+ *
855
973
  * searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
856
974
  * edit distance (the sum of character operations) as a Fixnum value, by greedy
857
975
  * trimming prefixes or postfixes of the match. <code>strings</code> has
@@ -859,12 +977,105 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
859
977
  * <code>results</code> is either a Float or an Array of Floats respectively.
860
978
  */
861
979
  static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
862
- {
980
+ {
863
981
  GET_STRUCT(General)
864
982
  return General_iterate_strings(amatch, strings, Levenshtein_search);
865
983
  }
866
984
 
867
- /*
985
+ /*
986
+ * Document-class: Amatch::DamerauLevenshtein
987
+ * XXX
988
+ * The DamerauLevenshtein edit distance is defined as the minimal costs
989
+ * involved to transform one string into another by using four elementary
990
+ * operations: deletion, insertion and substitution of a character, and transposition of two adjacent characters. To
991
+ * transform "water" into "wine", for instance, you have to substitute "a" ->
992
+ * "i": "witer", "t" -> "n": "winer" and delete "r": "wine". The edit distance
993
+ * between "water" and "wine" is 3, because you have to apply three
994
+ * operations. The edit distance between "wine" and "wine" is 0 of course: no
995
+ * operation is necessary for the transformation -- they're already the same
996
+ * string. It's easy to see that more similar strings have smaller edit
997
+ * distances than strings that differ a lot.
998
+ */
999
+
1000
+ DEF_RB_FREE(DamerauLevenshtein, General)
1001
+
1002
+ /*
1003
+ * call-seq: new(pattern)
1004
+ * XXX
1005
+ * Creates a new Amatch::DamerauLevenshtein instance from <code>pattern</code>.
1006
+ */
1007
+ static VALUE rb_DamerauLevenshtein_initialize(VALUE self, VALUE pattern)
1008
+ {
1009
+ GET_STRUCT(General)
1010
+ General_pattern_set(amatch, pattern);
1011
+ return self;
1012
+ }
1013
+
1014
+ DEF_CONSTRUCTOR(DamerauLevenshtein, General)
1015
+
1016
+ /*
1017
+ * call-seq: match(strings) -> results
1018
+ * XXX
1019
+ * Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
1020
+ * against <code>strings</code>. It returns the number of operations, the
1021
+ * Damerau-Levenshtein distance. <code>strings</code> has to be either a String or an Array of
1022
+ * Strings. The returned <code>results</code> is either a Fixnum or an Array of
1023
+ * Fixnums respectively.
1024
+ */
1025
+ static VALUE rb_DamerauLevenshtein_match(VALUE self, VALUE strings)
1026
+ {
1027
+ GET_STRUCT(General)
1028
+ return General_iterate_strings(amatch, strings, DamerauLevenshtein_match);
1029
+ }
1030
+
1031
+ /*
1032
+ * call-seq: similar(strings) -> results
1033
+ * XXX
1034
+ * Uses this Amatch::DamerauLevenshtein instance to match Amatch::DamerauLevenshtein#pattern
1035
+ * against <code>strings</code>, and compute a DamerauLevenshtein distance metric
1036
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1037
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1038
+ * returned <code>results</code> is either a Float or an Array of Floats
1039
+ * respectively.
1040
+ */
1041
+ static VALUE rb_DamerauLevenshtein_similar(VALUE self, VALUE strings)
1042
+ {
1043
+ GET_STRUCT(General)
1044
+ return General_iterate_strings(amatch, strings, DamerauLevenshtein_similar);
1045
+ }
1046
+
1047
+ /*
1048
+ * call-seq: damerau_levenshtein_similar(strings) -> results
1049
+ * XXX
1050
+ * If called on a String, this string is used as a Amatch::DamerauLevenshtein#pattern
1051
+ * to match against <code>strings</code>. It returns a DamerauLevenshtein distance
1052
+ * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
1053
+ * match. <code>strings</code> has to be either a String or an Array of
1054
+ * Strings. The returned <code>results</code> is either a Float or an Array of
1055
+ * Floats respectively.
1056
+ */
1057
+ static VALUE rb_str_damerau_levenshtein_similar(VALUE self, VALUE strings)
1058
+ {
1059
+ VALUE amatch = rb_DamerauLevenshtein_new(rb_cDamerauLevenshtein, self);
1060
+ return rb_DamerauLevenshtein_similar(amatch, strings);
1061
+ }
1062
+
1063
+ /*
1064
+ * call-seq: search(strings) -> results
1065
+ * XXX
1066
+ * searches Amatch::DamerauLevenshtein#pattern in <code>strings</code> and returns the
1067
+ * edit distance (the sum of character operations) as a Fixnum value, by greedy
1068
+ * trimming prefixes or postfixes of the match. <code>strings</code> has
1069
+ * to be either a String or an Array of Strings. The returned
1070
+ * <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
1071
+ */
1072
+ static VALUE rb_DamerauLevenshtein_search(VALUE self, VALUE strings)
1073
+ {
1074
+ GET_STRUCT(General)
1075
+ return General_iterate_strings(amatch, strings, DamerauLevenshtein_search);
1076
+ }
1077
+
1078
+ /*
868
1079
  * Document-class: Amatch::Sellers
869
1080
  *
870
1081
  * The Sellers edit distance is very similar to the Levenshtein edit distance.
@@ -983,14 +1194,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
983
1194
  * Document-method: pattern=
984
1195
  *
985
1196
  * call-seq: pattern=(pattern)
986
- *
1197
+ *
987
1198
  * Sets the current pattern string of this Amatch::Sellers instance to
988
1199
  * <code>pattern</code>.
989
1200
  */
990
1201
 
991
1202
  /*
992
1203
  * call-seq: match(strings) -> results
993
- *
1204
+ *
994
1205
  * Uses this Amatch::Sellers instance to match Sellers#pattern against
995
1206
  * <code>strings</code>, while taking into account the given weights. It
996
1207
  * returns the number of weighted character operations, the Sellers distance.
@@ -999,14 +1210,14 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
999
1210
  * respectively.
1000
1211
  */
1001
1212
  static VALUE rb_Sellers_match(VALUE self, VALUE strings)
1002
- {
1213
+ {
1003
1214
  GET_STRUCT(Sellers)
1004
1215
  return Sellers_iterate_strings(amatch, strings, Sellers_match);
1005
1216
  }
1006
1217
 
1007
1218
  /*
1008
1219
  * call-seq: similar(strings) -> results
1009
- *
1220
+ *
1010
1221
  * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
1011
1222
  * against <code>strings</code> (taking into account the given weights), and
1012
1223
  * compute a Sellers distance metric number between 0.0 for very unsimilar
@@ -1016,7 +1227,7 @@ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
1016
1227
  * respectively.
1017
1228
  */
1018
1229
  static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
1019
- {
1230
+ {
1020
1231
  GET_STRUCT(Sellers)
1021
1232
  return Sellers_iterate_strings(amatch, strings, Sellers_similar);
1022
1233
  }
@@ -1031,12 +1242,12 @@ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
1031
1242
  * <code>results</code> is either a Float or an Array of Floats respectively.
1032
1243
  */
1033
1244
  static VALUE rb_Sellers_search(VALUE self, VALUE strings)
1034
- {
1245
+ {
1035
1246
  GET_STRUCT(Sellers)
1036
1247
  return Sellers_iterate_strings(amatch, strings, Sellers_search);
1037
1248
  }
1038
1249
 
1039
- /*
1250
+ /*
1040
1251
  * Document-class: Amatch::PairDistance
1041
1252
  *
1042
1253
  * The pair distance between two strings is based on the number of adjacent
@@ -1047,7 +1258,7 @@ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
1047
1258
  * are more dissimilar. The advantage of considering adjacent characters, is to
1048
1259
  * take account not only of the characters, but also of the character ordering
1049
1260
  * in the original strings.
1050
- *
1261
+ *
1051
1262
  * This metric is very capable to find similarities in natural languages.
1052
1263
  * It is explained in more detail in Simon White's article "How to Strike a
1053
1264
  * Match", located at this url:
@@ -1074,7 +1285,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
1074
1285
 
1075
1286
  /*
1076
1287
  * call-seq: match(strings, regexp = /\s+/) -> results
1077
- *
1288
+ *
1078
1289
  * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
1079
1290
  * <code>strings</code>. It returns the pair distance measure, that is a
1080
1291
  * returned value of 1.0 is an exact match, partial matches are lower
@@ -1090,7 +1301,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
1090
1301
  * Array of Floats respectively.
1091
1302
  */
1092
1303
  static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
1093
- {
1304
+ {
1094
1305
  VALUE result, strings, regexp = Qnil;
1095
1306
  int use_regexp;
1096
1307
  GET_STRUCT(PairDistance)
@@ -1148,7 +1359,7 @@ static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
1148
1359
  }
1149
1360
  }
1150
1361
 
1151
- /*
1362
+ /*
1152
1363
  * Document-class: Amatch::Hamming
1153
1364
  *
1154
1365
  * This class computes the Hamming distance between two strings.
@@ -1178,7 +1389,7 @@ DEF_CONSTRUCTOR(Hamming, General)
1178
1389
 
1179
1390
  /*
1180
1391
  * call-seq: match(strings) -> results
1181
- *
1392
+ *
1182
1393
  * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1183
1394
  * <code>strings</code>, that is compute the hamming distance between
1184
1395
  * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
@@ -1186,7 +1397,7 @@ DEF_CONSTRUCTOR(Hamming, General)
1186
1397
  * is either a Fixnum or an Array of Fixnums respectively.
1187
1398
  */
1188
1399
  static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1189
- {
1400
+ {
1190
1401
  GET_STRUCT(General)
1191
1402
  return General_iterate_strings(amatch, strings, Hamming_match);
1192
1403
  }
@@ -1202,7 +1413,7 @@ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1202
1413
  * respectively.
1203
1414
  */
1204
1415
  static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1205
- {
1416
+ {
1206
1417
  GET_STRUCT(General)
1207
1418
  return General_iterate_strings(amatch, strings, Hamming_similar);
1208
1419
  }
@@ -1224,7 +1435,7 @@ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
1224
1435
  }
1225
1436
 
1226
1437
 
1227
- /*
1438
+ /*
1228
1439
  * Document-class: Amatch::LongestSubsequence
1229
1440
  *
1230
1441
  * This class computes the length of the longest subsequence common to two
@@ -1254,7 +1465,7 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
1254
1465
 
1255
1466
  /*
1256
1467
  * call-seq: match(strings) -> results
1257
- *
1468
+ *
1258
1469
  * Uses this Amatch::LongestSubsequence instance to match
1259
1470
  * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1260
1471
  * length of the longest common subsequence. <code>strings</code> has to be
@@ -1262,14 +1473,14 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
1262
1473
  * is either a Fixnum or an Array of Fixnums respectively.
1263
1474
  */
1264
1475
  static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1265
- {
1476
+ {
1266
1477
  GET_STRUCT(General)
1267
1478
  return General_iterate_strings(amatch, strings, LongestSubsequence_match);
1268
1479
  }
1269
1480
 
1270
1481
  /*
1271
1482
  * call-seq: similar(strings) -> results
1272
- *
1483
+ *
1273
1484
  * Uses this Amatch::LongestSubsequence instance to match
1274
1485
  * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1275
1486
  * a longest substring distance metric number between 0.0 for very unsimilar
@@ -1278,7 +1489,7 @@ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1278
1489
  * a Fixnum or an Array of Fixnums
1279
1490
  */
1280
1491
  static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1281
- {
1492
+ {
1282
1493
  GET_STRUCT(General)
1283
1494
  return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
1284
1495
  }
@@ -1294,12 +1505,12 @@ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1294
1505
  * is either a Float or an Array of Floats respectively.
1295
1506
  */
1296
1507
  static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1297
- {
1508
+ {
1298
1509
  VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
1299
1510
  return rb_LongestSubsequence_similar(amatch, strings);
1300
1511
  }
1301
1512
 
1302
- /*
1513
+ /*
1303
1514
  * Document-class: Amatch::LongestSubstring
1304
1515
  *
1305
1516
  * The longest common substring is the longest substring, that is part of
@@ -1310,7 +1521,7 @@ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1310
1521
  * The longest common substring between 'string' and 'string' is 'string'
1311
1522
  * again, thus the longest common substring length is 6. The longest common
1312
1523
  * substring between 'string' and 'storing' is 'ring', thus the longest common
1313
- * substring length is 4.
1524
+ * substring length is 4.
1314
1525
  */
1315
1526
 
1316
1527
  DEF_RB_FREE(LongestSubstring, General)
@@ -1331,7 +1542,7 @@ DEF_CONSTRUCTOR(LongestSubstring, General)
1331
1542
 
1332
1543
  /*
1333
1544
  * call-seq: match(strings) -> results
1334
- *
1545
+ *
1335
1546
  * Uses this Amatch::LongestSubstring instance to match
1336
1547
  * LongestSubstring#pattern against <code>strings</code>, that is compute the
1337
1548
  * length of the longest common substring. <code>strings</code> has to be
@@ -1346,7 +1557,7 @@ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1346
1557
 
1347
1558
  /*
1348
1559
  * call-seq: similar(strings) -> results
1349
- *
1560
+ *
1350
1561
  * Uses this Amatch::LongestSubstring instance to match
1351
1562
  * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1352
1563
  * longest substring distance metric number between 0.0 for very unsimilar
@@ -1372,11 +1583,11 @@ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
1372
1583
  * is either a Float or an Array of Floats respectively.
1373
1584
  */
1374
1585
  static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1375
- {
1586
+ {
1376
1587
  VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
1377
1588
  return rb_LongestSubstring_similar(amatch, strings);
1378
1589
  }
1379
-
1590
+
1380
1591
  /*
1381
1592
  * Document-class: Amatch::Jaro
1382
1593
  *
@@ -1573,6 +1784,17 @@ void Init_amatch_ext()
1573
1784
  rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1574
1785
  rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1575
1786
 
1787
+ /* DamerauLevenshtein */
1788
+ rb_cDamerauLevenshtein = rb_define_class_under(rb_mAmatch, "DamerauLevenshtein", rb_cObject);
1789
+ rb_define_alloc_func(rb_cDamerauLevenshtein, rb_DamerauLevenshtein_s_allocate);
1790
+ rb_define_method(rb_cDamerauLevenshtein, "initialize", rb_DamerauLevenshtein_initialize, 1);
1791
+ rb_define_method(rb_cDamerauLevenshtein, "pattern", rb_General_pattern, 0);
1792
+ rb_define_method(rb_cDamerauLevenshtein, "pattern=", rb_General_pattern_set, 1);
1793
+ rb_define_method(rb_cDamerauLevenshtein, "match", rb_DamerauLevenshtein_match, 1);
1794
+ rb_define_method(rb_cDamerauLevenshtein, "search", rb_DamerauLevenshtein_search, 1);
1795
+ rb_define_method(rb_cDamerauLevenshtein, "similar", rb_DamerauLevenshtein_similar, 1);
1796
+ rb_define_method(rb_mAmatchStringMethods, "damerau_levenshtein_similar", rb_str_damerau_levenshtein_similar, 1);
1797
+
1576
1798
  /* Sellers */
1577
1799
  rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1578
1800
  rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);