amatch-rbx 0.2.12

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,35 @@
1
+ # vim: set filetype=ruby et sw=2 ts=2:
2
+
3
+ require 'gem_hadar'
4
+
5
# Gem specification and packaging tasks for amatch, declared through the
# gem_hadar DSL.
GemHadar do
  name        'amatch'
  author      'Florian Frank'
  email       'flori@ping.de'
  homepage    "http://github.com/flori/#{name}"
  summary     'Approximate String Matching library'
  description <<EOT
Amatch is a library for approximate string matching and searching in strings.
Several algorithms can be used to do this, and it's also possible to compute a
similarity metric number between 0.0 and 1.0 for two given strings.
EOT
  executables << 'agrep.rb'
  bindir      'bin'
  test_dir    'tests'
  ignore      '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx'
  title       "#{name.camelize} - Approximate Matching"
  readme      'README.rdoc'
  require_paths %w[lib ext]
  dependency  'tins', '~>0.3'
  development_dependency 'test-unit', '~>2.3'
  development_dependency 'utils'
  development_dependency 'rake', '~>10', '<11.0'

  # Copies the first ext/amatch.* file that is neither a C source nor an
  # object file (presumably the built shared library) plus version.rb into
  # the site library directory.
  install_library do
    libdir = CONFIG["sitelibdir"]
    src = Dir['ext/amatch.*'].reject { |x| x =~ /\.[co]$/ }.first
    # Without this check a missing build surfaces as an opaque TypeError
    # from File.basename(nil).
    src or raise "no built amatch extension found in ext/ - compile it before installing"
    install(src, File.join(libdir, File.basename(src)), :verbose => true)
    mkdir_p dst = File.join(libdir, 'amatch')
    install('lib/amatch/version.rb', File.join(dst, 'version.rb'), :verbose => true)
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.12
Binary file
@@ -0,0 +1,79 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'amatch'
4
+ require 'getoptlong'
5
+
6
# Prints +msg+ followed by a usage synopsis, one line per entry of +options+
# (each entry being [long_name, short_name, argument_flag]) and a bug report
# address, then terminates the process with exit status 0.
def usage(msg, options)
  program = File.basename($0)
  puts msg, "Usage: #{program} [OPTIONS] PATTERN [FILE ...]", ""
  options.each do |long, short, argument_flag|
    # Only options that require a value are marked with ARGUMENT.
    suffix = argument_flag == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : ''
    puts " #{short}, #{long} #{suffix}"
  end
  puts "\nReport bugs to <flori@ping.de>."
  exit 0
end
15
+
16
class Amatch::Levenshtein
  # Like #search, but divides each edit distance by the size of the pattern
  # so that the result is relative to the pattern length.
  #
  # +strings+ may be a String or an Array of Strings, exactly as for #search;
  # the result is a Float or an Array of Floats respectively.
  def search_relative(strings)
    length = pattern.size
    distances = search(strings)
    if distances.respond_to?(:map)
      # #search answered with one distance per string.
      distances.map { |distance| distance.to_f / length }
    else
      distances.to_f / length
    end
  end
end
21
+
22
# Command line state: the largest distance a line may have to be printed,
# the Amatch::Levenshtein method used to score a line, and the verbosity flag.
$distance = 1
$mode = :search
$verbose = false
options = [
  [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
  [ '--verbose',  '-v', GetoptLong::NO_ARGUMENT ],
  [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
]
begin
  parser = GetoptLong.new
  parser.set_options(*options)
  parser.each_option do |name, arg|
    case name.sub(/^--/, '')
    when 'distance'
      $distance = arg.to_f
    when 'relative'
      $mode = :search_relative
    when 'verbose'
      $verbose = true
    when 'help'
      usage('You\'ve asked for it!', options)
    end
  end
rescue
  # Any failure while parsing the options ends the program with status 1.
  exit 1
end
pattern = ARGV.shift or usage('Pattern needed!', options)

matcher = Amatch::Levenshtein.new(pattern)
size = 0
start = Time.now
if ARGV.empty?
  # No files given: filter standard input.
  STDIN.each_line do |line|
    size += line.size
    puts line if matcher.__send__($mode, line) <= $distance
  end
else
  ARGV.each do |filename|
    begin
      # The stat call lives inside the begin block so that a missing or
      # unreadable path is reported and skipped instead of aborting the run.
      next unless File.stat(filename).file?
      size += File.size(filename)
      # File.foreach closes the file when the block finishes or raises.
      File.foreach(filename) do |line|
        puts "#{filename}:#{line}" if matcher.__send__($mode, line) <= $distance
      end
    rescue => e
      STDERR.puts "Failure at #{filename}: #{e} => Skipping!"
    end
  end
end
time = Time.now - start
$verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
  time, size / time / 1024
exit 0
@@ -0,0 +1,1661 @@
1
+ #include "ruby.h"
2
+ #include "pair.h"
3
+ #include <ctype.h>
4
+ #include "common.h"
5
+
6
+ /*
7
+ * Document-method: pattern
8
+ *
9
+ * call-seq: pattern -> pattern string
10
+ *
11
+ * Returns the current pattern string of this instance.
12
+ */
13
+
14
+ /*
15
+ * Document-method: pattern=
16
+ *
17
+ * call-seq: pattern=(pattern)
18
+ *
19
+ * Sets the current pattern string of this instance to <code>pattern</code>.
20
+ */
21
+
22
+
23
+ static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
24
+ rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
25
+ rb_cJaro, rb_cJaroWinkler;
26
+
27
+ static ID id_split, id_to_f;
28
+
29
+ #define GET_STRUCT(klass) \
30
+ klass *amatch; \
31
+ Data_Get_Struct(self, klass, amatch);
32
+
33
/*
 * Defines <type>_allocate(), which returns a newly allocated instance of the
 * given struct type with all of its bytes set to zero.
 */
#define DEF_ALLOCATOR(type)                                             \
static type *type##_allocate(void)                                      \
{                                                                       \
    type *obj = ALLOC(type);                                            \
    MEMZERO(obj, type, 1);                                              \
    return obj;                                                         \
}
40
+
41
+ #define DEF_CONSTRUCTOR(klass, type) \
42
+ static VALUE rb_##klass##_s_allocate(VALUE klass2) \
43
+ { \
44
+ type *amatch = type##_allocate(); \
45
+ return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch); \
46
+ } \
47
+ VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
48
+ { \
49
+ VALUE obj = rb_##klass##_s_allocate(klass2); \
50
+ rb_##klass##_initialize(obj, pattern); \
51
+ return obj; \
52
+ }
53
+
54
+ #define DEF_RB_FREE(klass, type) \
55
+ static void rb_##klass##_free(type *amatch) \
56
+ { \
57
+ MEMZERO(amatch->pattern, char, amatch->pattern_len); \
58
+ xfree(amatch->pattern); \
59
+ MEMZERO(amatch, type, 1); \
60
+ xfree(amatch); \
61
+ }
62
+
63
/*
 * Defines the pattern accessors for the given struct type:
 *
 *   <type>_pattern_set(amatch, pattern)     C level setter
 *   rb_<type>_pattern(self)                 reader, returns a new String
 *   rb_<type>_pattern_set(self, pattern)    writer, returns nil
 *
 * The struct owns a private copy of the pattern bytes (no terminating NUL)
 * and stores the byte length next to it. The setter raises a TypeError via
 * Check_Type if pattern is not a String.
 */
#define DEF_PATTERN_ACCESSOR(type)                                      \
static void type##_pattern_set(type *amatch, VALUE pattern)             \
{                                                                       \
    char *copy;                                                         \
    long len;                                                           \
    Check_Type(pattern, T_STRING);                                      \
    len = RSTRING_LEN(pattern);                                         \
    /* Build the new buffer before releasing the old one: the */        \
    /* allocation can raise, and the struct must never be left */       \
    /* pointing at memory that has already been freed.         */       \
    copy = ALLOC_N(char, len);                                          \
    MEMCPY(copy, RSTRING_PTR(pattern), char, len);                      \
    xfree(amatch->pattern);                                             \
    amatch->pattern = copy;                                             \
    amatch->pattern_len = (int) len;                                    \
}                                                                       \
static VALUE rb_##type##_pattern(VALUE self)                            \
{                                                                       \
    GET_STRUCT(type)                                                    \
    return rb_str_new(amatch->pattern, amatch->pattern_len);            \
}                                                                       \
static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern)         \
{                                                                       \
    GET_STRUCT(type)                                                    \
    type##_pattern_set(amatch, pattern);                                \
    return Qnil;                                                        \
}
84
+
85
/*
 * Defines <type>_iterate_strings(amatch, strings, match_function).
 *
 * If strings is a String, match_function is applied to it and its result is
 * returned directly. Otherwise strings has to be an Array whose elements are
 * all Strings; match_function is applied to every element and the results
 * are returned as a new Array in the same order. A TypeError is raised for
 * any other argument or element type.
 */
#define DEF_ITERATE_STRINGS(type)                                       \
static VALUE type##_iterate_strings(type *amatch, VALUE strings,        \
    VALUE (*match_function) (type *amatch, VALUE strings))              \
{                                                                       \
    if (TYPE(strings) == T_STRING) {                                    \
        return match_function(amatch, strings);                         \
    } else {                                                            \
        long i; /* same type as RARRAY_LEN, no narrowing compare */     \
        VALUE result;                                                   \
        Check_Type(strings, T_ARRAY);                                   \
        result = rb_ary_new2(RARRAY_LEN(strings));                      \
        for (i = 0; i < RARRAY_LEN(strings); i++) {                     \
            VALUE string = rb_ary_entry(strings, i);                    \
            if (TYPE(string) != T_STRING) {                             \
                rb_raise(rb_eTypeError,                                 \
                    "array has to contain only strings (%s given)",     \
                    NIL_P(string) ?                                     \
                        "NilClass" :                                    \
                        rb_class2name(CLASS_OF(string)));               \
            }                                                           \
            rb_ary_push(result, match_function(amatch, string));        \
        }                                                               \
        return result;                                                  \
    }                                                                   \
}
110
+
111
+ #define DEF_RB_READER(type, function, name, converter) \
112
+ VALUE function(VALUE self) \
113
+ { \
114
+ GET_STRUCT(type) \
115
+ return converter(amatch->name); \
116
+ }
117
+
118
+ #define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
119
+ VALUE function(VALUE self, VALUE value) \
120
+ { \
121
+ vtype value_ ## vtype; \
122
+ GET_STRUCT(type) \
123
+ caster(value); \
124
+ value_ ## vtype = converter(value); \
125
+ if (!(value_ ## vtype check)) \
126
+ rb_raise(rb_eTypeError, "check of value " #check " failed"); \
127
+ amatch->name = value_ ## vtype; \
128
+ return Qnil; \
129
+ }
130
+
131
+
132
/*
 * CAST2FLOAT(obj) replaces obj with the result of obj.to_f when obj is not
 * already a Float but responds to to_f; otherwise Check_Type enforces that
 * obj is a Float (raising a TypeError if it is not). The macro expands to an
 * unbraced if/else statement and expects the caller to supply the trailing
 * semicolon.
 *
 * rb_funcall is called with argc 0 and no further arguments: the argument
 * count has to match the number of values actually passed.
 */
#define CAST2FLOAT(obj)                                                 \
    if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f))            \
        obj = rb_funcall(obj, id_to_f, 0);                              \
    else                                                                \
        Check_Type(obj, T_FLOAT)
/* FLOAT2C(obj) yields the C double stored in the Ruby Float obj. */
#define FLOAT2C(obj) (RFLOAT_VALUE(obj))
138
+
139
+ #define CAST2BOOL(obj) \
140
+ if (obj == Qfalse || obj == Qnil) \
141
+ obj = Qfalse; \
142
+ else \
143
+ obj = Qtrue;
144
+ #define BOOL2C(obj) (obj == Qtrue)
145
+ #define C2BOOL(obj) (obj ? Qtrue : Qfalse)
146
+
147
+ #define OPTIMIZE_TIME \
148
+ if (amatch->pattern_len < RSTRING_LEN(string)) { \
149
+ a_ptr = amatch->pattern; \
150
+ a_len = (int) amatch->pattern_len; \
151
+ b_ptr = RSTRING_PTR(string); \
152
+ b_len = (int) RSTRING_LEN(string); \
153
+ } else { \
154
+ a_ptr = RSTRING_PTR(string); \
155
+ a_len = (int) RSTRING_LEN(string); \
156
+ b_ptr = amatch->pattern; \
157
+ b_len = (int) amatch->pattern_len; \
158
+ }
159
+
160
+ #define DONT_OPTIMIZE \
161
+ a_ptr = amatch->pattern; \
162
+ a_len = (int) amatch->pattern_len; \
163
+ b_ptr = RSTRING_PTR(string); \
164
+ b_len = (int) RSTRING_LEN(string); \
165
+
166
+ /*
167
+ * C structures of the Amatch classes
168
+ */
169
+
170
+ typedef struct GeneralStruct {
171
+ char *pattern;
172
+ int pattern_len;
173
+ } General;
174
+
175
+ DEF_ALLOCATOR(General)
176
+ DEF_PATTERN_ACCESSOR(General)
177
+ DEF_ITERATE_STRINGS(General)
178
+
179
+ typedef struct SellersStruct {
180
+ char *pattern;
181
+ int pattern_len;
182
+ double substitution;
183
+ double deletion;
184
+ double insertion;
185
+ } Sellers;
186
+
187
+ DEF_ALLOCATOR(Sellers)
188
+ DEF_PATTERN_ACCESSOR(Sellers)
189
+ DEF_ITERATE_STRINGS(Sellers)
190
+
191
+ static void Sellers_reset_weights(Sellers *self)
192
+ {
193
+ self->substitution = 1.0;
194
+ self->deletion = 1.0;
195
+ self->insertion = 1.0;
196
+ }
197
+
198
+ typedef struct PairDistanceStruct {
199
+ char *pattern;
200
+ int pattern_len;
201
+ PairArray *pattern_pair_array;
202
+ } PairDistance;
203
+
204
+ DEF_ALLOCATOR(PairDistance)
205
+ DEF_PATTERN_ACCESSOR(PairDistance)
206
+
207
+ typedef struct JaroStruct {
208
+ char *pattern;
209
+ int pattern_len;
210
+ int ignore_case;
211
+ } Jaro;
212
+
213
+ DEF_ALLOCATOR(Jaro)
214
+ DEF_PATTERN_ACCESSOR(Jaro)
215
+ DEF_ITERATE_STRINGS(Jaro)
216
+
217
+ typedef struct JaroWinklerStruct {
218
+ char *pattern;
219
+ int pattern_len;
220
+ int ignore_case;
221
+ double scaling_factor;
222
+ } JaroWinkler;
223
+
224
+ DEF_ALLOCATOR(JaroWinkler)
225
+ DEF_PATTERN_ACCESSOR(JaroWinkler)
226
+ DEF_ITERATE_STRINGS(JaroWinkler)
227
+
228
+ /*
229
+ * Levenshtein edit distances are computed here:
230
+ */
231
+
232
+ #define COMPUTE_LEVENSHTEIN_DISTANCE \
233
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
234
+ c = i % 2; /* current row */ \
235
+ p = (i + 1) % 2; /* previous row */ \
236
+ v[c][0] = i; /* first column */ \
237
+ for (j = 1; j <= b_len; j++) { \
238
+ /* Bellman's principle of optimality: */ \
239
+ weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
240
+ if (weight > v[p][j] + 1) { \
241
+ weight = v[p][j] + 1; \
242
+ } \
243
+ if (weight > v[c][j - 1] + 1) { \
244
+ weight = v[c][j - 1] + 1; \
245
+ } \
246
+ v[c][j] = weight; \
247
+ } \
248
+ p = c; \
249
+ c = (c + 1) % 2; \
250
+ }
251
+
252
+ static VALUE Levenshtein_match(General *amatch, VALUE string)
253
+ {
254
+ VALUE result;
255
+ char *a_ptr, *b_ptr;
256
+ int a_len, b_len;
257
+ int *v[2], weight;
258
+ int i, j, c, p;
259
+
260
+ Check_Type(string, T_STRING);
261
+ DONT_OPTIMIZE
262
+
263
+ v[0] = ALLOC_N(int, b_len + 1);
264
+ v[1] = ALLOC_N(int, b_len + 1);
265
+ for (i = 0; i <= b_len; i++) {
266
+ v[0][i] = i;
267
+ v[1][i] = i;
268
+ }
269
+
270
+ COMPUTE_LEVENSHTEIN_DISTANCE
271
+
272
+ result = INT2FIX(v[p][b_len]);
273
+
274
+ xfree(v[0]);
275
+ xfree(v[1]);
276
+
277
+ return result;
278
+ }
279
+
280
+ static VALUE Levenshtein_similar(General *amatch, VALUE string)
281
+ {
282
+ VALUE result;
283
+ char *a_ptr, *b_ptr;
284
+ int a_len, b_len;
285
+ int *v[2], weight;
286
+ int i, j, c, p;
287
+
288
+ Check_Type(string, T_STRING);
289
+ DONT_OPTIMIZE
290
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
291
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
292
+ v[0] = ALLOC_N(int, b_len + 1);
293
+ v[1] = ALLOC_N(int, b_len + 1);
294
+ for (i = 0; i <= b_len; i++) {
295
+ v[0][i] = i;
296
+ v[1][i] = i;
297
+ }
298
+
299
+ COMPUTE_LEVENSHTEIN_DISTANCE
300
+
301
+ if (b_len > a_len) {
302
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
303
+ } else {
304
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
305
+ }
306
+ xfree(v[0]);
307
+ xfree(v[1]);
308
+ return result;
309
+ }
310
+
311
+ static VALUE Levenshtein_search(General *amatch, VALUE string)
312
+ {
313
+ VALUE result;
314
+ char *a_ptr, *b_ptr;
315
+ int a_len, b_len;
316
+ int *v[2], weight, min;
317
+ int i, j, c, p;
318
+
319
+ Check_Type(string, T_STRING);
320
+ DONT_OPTIMIZE
321
+
322
+ v[0] = ALLOC_N(int, b_len + 1);
323
+ v[1] = ALLOC_N(int, b_len + 1);
324
+ MEMZERO(v[0], int, b_len + 1);
325
+ MEMZERO(v[1], int, b_len + 1);
326
+
327
+ COMPUTE_LEVENSHTEIN_DISTANCE
328
+
329
+ for (i = 0, min = a_len; i <= b_len; i++) {
330
+ if (v[p][i] < min) min = v[p][i];
331
+ }
332
+
333
+ result = INT2FIX(min);
334
+
335
+ xfree(v[0]);
336
+ xfree(v[1]);
337
+
338
+ return result;
339
+ }
340
+
341
+
342
+ /*
343
+ * Sellers edit distances are computed here:
344
+ */
345
+
346
+ #define COMPUTE_SELLERS_DISTANCE \
347
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
348
+ c = i % 2; /* current row */ \
349
+ p = (i + 1) % 2; /* previous row */ \
350
+ v[c][0] = i * amatch->deletion; /* first column */ \
351
+ for (j = 1; j <= b_len; j++) { \
352
+ /* Bellman's principle of optimality: */ \
353
+ weight = v[p][j - 1] + \
354
+ (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution); \
355
+ if (weight > v[p][j] + amatch->insertion) { \
356
+ weight = v[p][j] + amatch->insertion; \
357
+ } \
358
+ if (weight > v[c][j - 1] + amatch->deletion) { \
359
+ weight = v[c][j - 1] + amatch->deletion; \
360
+ } \
361
+ v[c][j] = weight; \
362
+ } \
363
+ p = c; \
364
+ c = (c + 1) % 2; \
365
+ }
366
+
367
+ static VALUE Sellers_match(Sellers *amatch, VALUE string)
368
+ {
369
+ VALUE result;
370
+ char *a_ptr, *b_ptr;
371
+ int a_len, b_len;
372
+ double *v[2], weight;
373
+ int i, j, c, p;
374
+
375
+ Check_Type(string, T_STRING);
376
+ DONT_OPTIMIZE
377
+
378
+ v[0] = ALLOC_N(double, b_len + 1);
379
+ v[1] = ALLOC_N(double, b_len + 1);
380
+ for (i = 0; i <= b_len; i++) {
381
+ v[0][i] = i * amatch->deletion;
382
+ v[1][i] = i * amatch->deletion;
383
+ }
384
+
385
+ COMPUTE_SELLERS_DISTANCE
386
+
387
+ result = rb_float_new(v[p][b_len]);
388
+ xfree(v[0]);
389
+ xfree(v[1]);
390
+ return result;
391
+ }
392
+
393
+ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
394
+ {
395
+ VALUE result;
396
+ char *a_ptr, *b_ptr;
397
+ int a_len, b_len;
398
+ double *v[2], weight, max_weight;
399
+ int i, j, c, p;
400
+
401
+ if (amatch->insertion >= amatch->deletion) {
402
+ if (amatch->substitution >= amatch->insertion) {
403
+ max_weight = amatch->substitution;
404
+ } else {
405
+ max_weight = amatch->insertion;
406
+ }
407
+ } else {
408
+ if (amatch->substitution >= amatch->deletion) {
409
+ max_weight = amatch->substitution;
410
+ } else {
411
+ max_weight = amatch->deletion;
412
+ }
413
+ }
414
+
415
+ Check_Type(string, T_STRING);
416
+ DONT_OPTIMIZE
417
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
418
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
419
+ v[0] = ALLOC_N(double, b_len + 1);
420
+ v[1] = ALLOC_N(double, b_len + 1);
421
+ for (i = 0; i <= b_len; i++) {
422
+ v[0][i] = i * amatch->deletion;
423
+ v[1][i] = i * amatch->deletion;
424
+ }
425
+
426
+ COMPUTE_SELLERS_DISTANCE
427
+
428
+ if (b_len > a_len) {
429
+ result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
430
+ } else {
431
+ result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
432
+ }
433
+ xfree(v[0]);
434
+ xfree(v[1]);
435
+ return result;
436
+ }
437
+
438
+ static VALUE Sellers_search(Sellers *amatch, VALUE string)
439
+ {
440
+ VALUE result;
441
+ char *a_ptr, *b_ptr;
442
+ int a_len, b_len;
443
+ double *v[2], weight, min;
444
+ int i, j, c, p;
445
+
446
+ Check_Type(string, T_STRING);
447
+ DONT_OPTIMIZE
448
+
449
+ v[0] = ALLOC_N(double, b_len + 1);
450
+ v[1] = ALLOC_N(double, b_len + 1);
451
+ MEMZERO(v[0], double, b_len + 1);
452
+ MEMZERO(v[1], double, b_len + 1);
453
+
454
+ COMPUTE_SELLERS_DISTANCE
455
+
456
+ for (i = 0, min = a_len; i <= b_len; i++) {
457
+ if (v[p][i] < min) min = v[p][i];
458
+ }
459
+ result = rb_float_new(min);
460
+ xfree(v[0]);
461
+ xfree(v[1]);
462
+
463
+ return result;
464
+ }
465
+
466
+ /*
467
+ * Pair distances are computed here:
468
+ */
469
+
470
+ static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
471
+ {
472
+ double result;
473
+ VALUE tokens;
474
+ PairArray *pair_array;
475
+
476
+ Check_Type(string, T_STRING);
477
+ if (!NIL_P(regexp) || use_regexp) {
478
+ tokens = rb_funcall(
479
+ rb_str_new(amatch->pattern, amatch->pattern_len),
480
+ id_split, 1, regexp
481
+ );
482
+ if (!amatch->pattern_pair_array) {
483
+ amatch->pattern_pair_array = PairArray_new(tokens);
484
+ } else {
485
+ pair_array_reactivate(amatch->pattern_pair_array);
486
+ }
487
+ tokens = rb_funcall(string, id_split, 1, regexp);
488
+ pair_array = PairArray_new(tokens);
489
+ } else {
490
+ VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
491
+ tokens = rb_ary_new4(1, &tmp);
492
+ if (!amatch->pattern_pair_array) {
493
+ amatch->pattern_pair_array = PairArray_new(tokens);
494
+ } else {
495
+ pair_array_reactivate(amatch->pattern_pair_array);
496
+ }
497
+ tokens = rb_ary_new4(1, &string);
498
+ pair_array = PairArray_new(tokens);
499
+ }
500
+ result = pair_array_match(amatch->pattern_pair_array, pair_array);
501
+ pair_array_destroy(pair_array);
502
+ return rb_float_new(result);
503
+ }
504
+
505
+ /*
506
+ * Hamming distances are computed here:
507
+ */
508
+
509
+ #define COMPUTE_HAMMING_DISTANCE \
510
+ for (i = 0, result = b_len - a_len; i < a_len; i++) { \
511
+ if (i >= b_len) { \
512
+ result += a_len - b_len; \
513
+ break; \
514
+ } \
515
+ if (b_ptr[i] != a_ptr[i]) result++; \
516
+ }
517
+
518
+ static VALUE Hamming_match(General *amatch, VALUE string)
519
+ {
520
+ char *a_ptr, *b_ptr;
521
+ int a_len, b_len;
522
+ int i, result;
523
+
524
+ Check_Type(string, T_STRING);
525
+ OPTIMIZE_TIME
526
+ COMPUTE_HAMMING_DISTANCE
527
+ return INT2FIX(result);
528
+ }
529
+
530
+ static VALUE Hamming_similar(General *amatch, VALUE string)
531
+ {
532
+ char *a_ptr, *b_ptr;
533
+ int a_len, b_len;
534
+ int i, result;
535
+
536
+ Check_Type(string, T_STRING);
537
+ OPTIMIZE_TIME
538
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
539
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
540
+ COMPUTE_HAMMING_DISTANCE
541
+ return rb_float_new(1.0 - ((double) result) / b_len);
542
+ }
543
+
544
+ /*
545
+ * Longest Common Subsequence computation
546
+ */
547
+
548
+ #define COMPUTE_LONGEST_SUBSEQUENCE \
549
+ l[0] = ALLOC_N(int, b_len + 1); \
550
+ l[1] = ALLOC_N(int, b_len + 1); \
551
+ for (i = a_len, c = 0, p = 1; i >= 0; i--) { \
552
+ for (j = b_len; j >= 0; j--) { \
553
+ if (i == a_len || j == b_len) { \
554
+ l[c][j] = 0; \
555
+ } else if (a_ptr[i] == b_ptr[j]) { \
556
+ l[c][j] = 1 + l[p][j + 1]; \
557
+ } else { \
558
+ int x = l[p][j], y = l[c][j + 1]; \
559
+ if (x > y) l[c][j] = x; else l[c][j] = y; \
560
+ } \
561
+ } \
562
+ p = c; \
563
+ c = (c + 1) % 2; \
564
+ } \
565
+ result = l[p][0]; \
566
+ xfree(l[0]); \
567
+ xfree(l[1]);
568
+
569
+
570
+ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
571
+ {
572
+ char *a_ptr, *b_ptr;
573
+ int a_len, b_len;
574
+ int result, c, p, i, j, *l[2];
575
+
576
+ Check_Type(string, T_STRING);
577
+ OPTIMIZE_TIME
578
+
579
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
580
+ COMPUTE_LONGEST_SUBSEQUENCE
581
+ return INT2FIX(result);
582
+ }
583
+
584
+ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
585
+ {
586
+ char *a_ptr, *b_ptr;
587
+ int a_len, b_len;
588
+ int result, c, p, i, j, *l[2];
589
+
590
+ Check_Type(string, T_STRING);
591
+ OPTIMIZE_TIME
592
+
593
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
594
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
595
+ COMPUTE_LONGEST_SUBSEQUENCE
596
+ return rb_float_new(((double) result) / b_len);
597
+ }
598
+
599
+ /*
600
+ * Longest Common Substring computation
601
+ */
602
+
603
+ #define COMPUTE_LONGEST_SUBSTRING \
604
+ l[0] = ALLOC_N(int, b_len); \
605
+ MEMZERO(l[0], int, b_len); \
606
+ l[1] = ALLOC_N(int, b_len); \
607
+ MEMZERO(l[1], int, b_len); \
608
+ result = 0; \
609
+ for (i = 0, c = 0, p = 1; i < a_len; i++) { \
610
+ for (j = 0; j < b_len; j++) { \
611
+ if (a_ptr[i] == b_ptr[j]) { \
612
+ l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1]; \
613
+ if (l[c][j] > result) result = l[c][j]; \
614
+ } else { \
615
+ l[c][j] = 0; \
616
+ } \
617
+ } \
618
+ p = c; \
619
+ c = (c + 1) % 2; \
620
+ } \
621
+ xfree(l[0]); \
622
+ xfree(l[1]);
623
+
624
+ static VALUE LongestSubstring_match(General *amatch, VALUE string)
625
+ {
626
+ char *a_ptr, *b_ptr;
627
+ int a_len, b_len;
628
+ int result, c, p, i, j, *l[2];
629
+
630
+ Check_Type(string, T_STRING);
631
+ OPTIMIZE_TIME
632
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
633
+ COMPUTE_LONGEST_SUBSTRING
634
+ return INT2FIX(result);
635
+ }
636
+
637
+ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
638
+ {
639
+ char *a_ptr, *b_ptr;
640
+ int a_len, b_len;
641
+ int result, c, p, i, j, *l[2];
642
+
643
+ Check_Type(string, T_STRING);
644
+ OPTIMIZE_TIME
645
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
646
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
647
+ COMPUTE_LONGEST_SUBSTRING
648
+ return rb_float_new(((double) result) / b_len);
649
+ }
650
+
651
+ /*
652
+ * Jaro computation
653
+ */
654
+
655
/*
 * Computes the Jaro similarity of a_ptr[0, a_len) and b_ptr[0, b_len) into
 * the double result.
 *
 * The expanding function has to declare a_ptr, b_ptr, a_len, b_len, int
 * *l[2], max_dist, m, t, i, j, k, low, high and result. Both lengths have to
 * be greater than zero, because the result divides by them; the callers in
 * this file return early for empty inputs.
 *
 * l[0] and l[1] flag the bytes of a and b that have been matched, m counts
 * the matches and t the transpositions.
 */
#define COMPUTE_JARO                                                        \
    l[0] = ALLOC_N(int, a_len);                                             \
    MEMZERO(l[0], int, a_len);                                              \
    l[1] = ALLOC_N(int, b_len);                                             \
    MEMZERO(l[1], int, b_len);                                              \
    max_dist = ((a_len > b_len ? a_len : b_len) / 2) - 1;                   \
    /* For two one-byte strings the formula yields -1, which would */       \
    /* produce an empty search window and score equal bytes as 0.0. */      \
    if (max_dist < 0) max_dist = 0;                                         \
    m = 0;                                                                  \
    for (i = 0; i < a_len; i++) {                                           \
        low = (i > max_dist ? i - max_dist : 0);                            \
        high = (i + max_dist < b_len ? i + max_dist : b_len - 1);           \
        for (j = low; j <= high; j++) {                                     \
            if (!l[1][j] && a_ptr[i] == b_ptr[j]) {                         \
                l[0][i] = 1;                                                \
                l[1][j] = 1;                                                \
                m++;                                                        \
                break;                                                      \
            }                                                               \
        }                                                                   \
    }                                                                       \
    if (m == 0) {                                                           \
        result = 0.0;                                                       \
    } else {                                                                \
        k = t = 0;                                                          \
        for (i = 0; i < a_len; i++) {                                       \
            if (l[0][i]) {                                                  \
                /* advance to the next matched byte of b */                 \
                for (j = k; j < b_len; j++) {                               \
                    if (l[1][j]) {                                          \
                        k = j + 1;                                          \
                        break;                                              \
                    }                                                       \
                }                                                           \
                if (a_ptr[i] != b_ptr[j]) {                                 \
                    t++;                                                    \
                }                                                           \
            }                                                               \
        }                                                                   \
        t = t / 2;                                                          \
        result = (((double)m)/a_len + ((double)m)/b_len +                   \
            ((double)(m-t))/m)/3.0;                                         \
    }                                                                       \
    xfree(l[0]);                                                            \
    xfree(l[1]);
696
+
697
+
698
/*
 * Replaces a_ptr and b_ptr with heap copies of their contents in which every
 * byte has been passed through toupper(), so that a following comparison
 * ignores case. Note that, despite the macro's name, the copies are folded
 * to UPPER case.
 *
 * The macro declares the locals ying and yang, so it has to be expanded at
 * the start of a block, and it uses the caller's i, a_len and b_len. The
 * caller is responsible for releasing both copies with xfree().
 *
 * The bytes are converted to unsigned char before they reach toupper():
 * passing a negative plain char to the <ctype.h> functions is undefined
 * behaviour.
 */
#define LOWERCASE_STRINGS                                               \
    char *ying, *yang;                                                  \
    ying = ALLOC_N(char, a_len);                                        \
    MEMCPY(ying, a_ptr, char, a_len);                                   \
    a_ptr = ying;                                                       \
    yang = ALLOC_N(char, b_len);                                        \
    MEMCPY(yang, b_ptr, char, b_len);                                   \
    b_ptr = yang;                                                       \
    for (i = 0; i < a_len; i++) {                                       \
        a_ptr[i] = (char) toupper((unsigned char) a_ptr[i]);            \
    }                                                                   \
    for (i = 0; i < b_len; i++) {                                       \
        b_ptr[i] = (char) toupper((unsigned char) b_ptr[i]);            \
    }
712
+
713
+ static VALUE Jaro_match(Jaro *amatch, VALUE string)
714
+ {
715
+ char *a_ptr, *b_ptr;
716
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high;
717
+ int *l[2];
718
+ double result;
719
+
720
+ Check_Type(string, T_STRING);
721
+ OPTIMIZE_TIME
722
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
723
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
724
+ if (amatch->ignore_case) {
725
+ LOWERCASE_STRINGS
726
+ }
727
+ COMPUTE_JARO
728
+ if (amatch->ignore_case) {
729
+ xfree(a_ptr);
730
+ xfree(b_ptr);
731
+ }
732
+ return rb_float_new(result);
733
+ }
734
+
735
+ /*
736
+ * Jaro-Winkler computation
737
+ */
738
+
739
+ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
740
+ {
741
+ char *a_ptr, *b_ptr;
742
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high, n;
743
+ int *l[2];
744
+ double result;
745
+
746
+ Check_Type(string, T_STRING);
747
+ OPTIMIZE_TIME
748
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
749
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
750
+ if (amatch->ignore_case) {
751
+ LOWERCASE_STRINGS
752
+ }
753
+ COMPUTE_JARO
754
+ n = 0;
755
+ for (i = 0; i < (a_len >= 4 ? 4 : a_len); i++) {
756
+ if (a_ptr[i] == b_ptr[i]) {
757
+ n++;
758
+ } else {
759
+ break;
760
+ }
761
+ }
762
+ result = result + n*amatch->scaling_factor*(1-result);
763
+ if (amatch->ignore_case) {
764
+ xfree(a_ptr);
765
+ xfree(b_ptr);
766
+ }
767
+ return rb_float_new(result);
768
+ }
769
+
770
+ /*
771
+ * Ruby API
772
+ */
773
+
774
+ /*
775
+ * Document-class: Amatch::Levenshtein
776
+ *
777
+ * The Levenshtein edit distance is defined as the minimal costs involved to
778
+ * transform one string into another by using three elementary operations:
779
+ * deletion, insertion and substitution of a character. To transform "water"
780
+ * into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
781
+ * -> "n": "winer" and delete "r": "wine". The edit distance between "water"
782
+ * and "wine" is 3, because you have to apply three operations. The edit
783
+ * distance between "wine" and "wine" is 0 of course: no operation is
784
+ * necessary for the transformation -- they're already the same string. It's
785
+ * easy to see that more similar strings have smaller edit distances than
786
+ * strings that differ a lot.
787
+ */
788
+
789
+ DEF_RB_FREE(Levenshtein, General)
790
+
791
+ /*
792
+ * call-seq: new(pattern)
793
+ *
794
+ * Creates a new Amatch::Levenshtein instance from <code>pattern</code>.
795
+ */
796
+ static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern)
797
+ {
798
+ GET_STRUCT(General)
799
+ General_pattern_set(amatch, pattern);
800
+ return self;
801
+ }
802
+
803
+ DEF_CONSTRUCTOR(Levenshtein, General)
804
+
805
+ /*
806
+ * call-seq: match(strings) -> results
807
+ *
808
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
809
+ * against <code>strings</code>. It returns the number of operations, the
810
+ * Levenshtein distance. <code>strings</code> has to be either a String or an
811
+ * Array of Strings. The returned <code>results</code> is either a Fixnum or
812
+ * an Array of Fixnums respectively.
813
+ */
814
+ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
815
+ {
816
+ GET_STRUCT(General)
817
+ return General_iterate_strings(amatch, strings, Levenshtein_match);
818
+ }
819
+
820
+ /*
821
+ * call-seq: similar(strings) -> results
822
+ *
823
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
824
+ * against <code>strings</code>, and compute a Levenshtein distance metric
825
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
826
+ * <code>strings</code> has to be either a String or an Array of Strings. The
827
+ * returned <code>results</code> is either a Float or an Array of Floats
828
+ * respectively.
829
+ */
830
+ static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
831
+ {
832
+ GET_STRUCT(General)
833
+ return General_iterate_strings(amatch, strings, Levenshtein_similar);
834
+ }
835
+
836
+ /*
837
+ * call-seq: levenshtein_similar(strings) -> results
838
+ *
839
+ * If called on a String, this string is used as a Amatch::Levenshtein#pattern
840
+ * to match against <code>strings</code>. It returns a Levenshtein distance
841
+ * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
842
+ * match. <code>strings</code> has to be either a String or an Array of
843
+ * Strings. The returned <code>results</code> is either a Float or an Array of
844
+ * Floats respectively.
845
+ */
846
+ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
847
+ {
848
+ VALUE amatch = rb_Levenshtein_new(rb_cLevenshtein, self);
849
+ return rb_Levenshtein_similar(amatch, strings);
850
+ }
851
+
852
+ /*
853
+ * call-seq: search(strings) -> results
854
+ *
855
+ * searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
856
+ * edit distance (the sum of character operations) as a Fixnum value, by greedy
857
+ * trimming prefixes or postfixes of the match. <code>strings</code> has
858
+ * to be either a String or an Array of Strings. The returned
859
+ * <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
860
+ */
861
+ static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
862
+ {
863
+ GET_STRUCT(General)
864
+ return General_iterate_strings(amatch, strings, Levenshtein_search);
865
+ }
866
+
867
+ /*
868
+ * Document-class: Amatch::Sellers
869
+ *
870
+ * The Sellers edit distance is very similar to the Levenshtein edit distance.
871
+ * The difference is, that you can also specify different weights for every
872
+ * operation to prefer special operations over others. This extension of the
873
+ * Levenshtein edit distance is also known as the Needleman-Wunsch
874
+ * distance.
875
+ */
876
+
877
+ DEF_RB_FREE(Sellers, Sellers)
878
+
879
+ /*
880
+ * Document-method: substitution
881
+ *
882
+ * call-seq: substitution -> weight
883
+ *
884
+ * Returns the weight of the substitution operation, that is used to compute
885
+ * the Sellers distance.
886
+ */
887
+ DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
888
+ rb_float_new)
889
+
890
+ /*
891
+ * Document-method: deletion
892
+ *
893
+ * call-seq: deletion -> weight
894
+ *
895
+ * Returns the weight of the deletion operation, that is used to compute
896
+ * the Sellers distance.
897
+ */
898
+ DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
899
+ rb_float_new)
900
+
901
+ /*
902
+ * Document-method: insertion
903
+ *
904
+ * call-seq: insertion -> weight
905
+ *
906
+ * Returns the weight of the insertion operation, that is used to compute
907
+ * the Sellers distance.
908
+ */
909
+ DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
910
+ rb_float_new)
911
+
912
+ /*
913
+ * Document-method: substitution=
914
+ *
915
+ * call-seq: substitution=(weight)
916
+ *
917
+ * Sets the weight of the substitution operation, that is used to compute
918
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
919
+ * should be a Float value >= 0.0.
920
+ */
921
+ DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
922
+ double, CAST2FLOAT, FLOAT2C, >= 0)
923
+
924
+ /*
925
+ * Document-method: deletion=
926
+ *
927
+ * call-seq: deletion=(weight)
928
+ *
929
+ * Sets the weight of the deletion operation, that is used to compute
930
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
931
+ * should be a Float value >= 0.0.
932
+ */
933
+ DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
934
+ double, CAST2FLOAT, FLOAT2C, >= 0)
935
+
936
+ /*
937
+ * Document-method: insertion=
938
+ *
939
+ * call-seq: insertion=(weight)
940
+ *
941
+ * Sets the weight of the insertion operation, that is used to compute
942
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
943
+ * should be a Float value >= 0.0.
944
+ */
945
+ DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
946
+ double, CAST2FLOAT, FLOAT2C, >= 0)
947
+
948
+ /*
949
+ * Resets all weights (substitution, deletion, and insertion) to 1.0.
950
+ */
951
+ static VALUE rb_Sellers_reset_weights(VALUE self)
952
+ {
953
+ GET_STRUCT(Sellers)
954
+ Sellers_reset_weights(amatch);
955
+ return self;
956
+ }
957
+
958
+ /*
959
+ * call-seq: new(pattern)
960
+ *
961
+ * Creates a new Amatch::Sellers instance from <code>pattern</code>,
962
+ * with all weights initially set to 1.0.
963
+ */
964
+ static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern)
965
+ {
966
+ GET_STRUCT(Sellers)
967
+ Sellers_pattern_set(amatch, pattern);
968
+ Sellers_reset_weights(amatch);
969
+ return self;
970
+ }
971
+
972
+ DEF_CONSTRUCTOR(Sellers, Sellers)
973
+
974
+ /*
975
+ * Document-method: pattern
976
+ *
977
+ * call-seq: pattern -> pattern string
978
+ *
979
+ * Returns the current pattern string of this Amatch::Sellers instance.
980
+ */
981
+
982
+ /*
983
+ * Document-method: pattern=
984
+ *
985
+ * call-seq: pattern=(pattern)
986
+ *
987
+ * Sets the current pattern string of this Amatch::Sellers instance to
988
+ * <code>pattern</code>.
989
+ */
990
+
991
+ /*
992
+ * call-seq: match(strings) -> results
993
+ *
994
+ * Uses this Amatch::Sellers instance to match Sellers#pattern against
995
+ * <code>strings</code>, while taking into account the given weights. It
996
+ * returns the number of weighted character operations, the Sellers distance.
997
+ * <code>strings</code> has to be either a String or an Array of Strings. The
998
+ * returned <code>results</code> is either a Float or an Array of Floats
999
+ * respectively.
1000
+ */
1001
+ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
1002
+ {
1003
+ GET_STRUCT(Sellers)
1004
+ return Sellers_iterate_strings(amatch, strings, Sellers_match);
1005
+ }
1006
+
1007
+ /*
1008
+ * call-seq: similar(strings) -> results
1009
+ *
1010
+ * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
1011
+ * against <code>strings</code> (taking into account the given weights), and
1012
+ * compute a Sellers distance metric number between 0.0 for very unsimilar
1013
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1014
+ * String or an Array of Strings. The returned <code>results</code> is either
1015
+ * a Float or an Array of Floats
1016
+ * respectively.
1017
+ */
1018
+ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
1019
+ {
1020
+ GET_STRUCT(Sellers)
1021
+ return Sellers_iterate_strings(amatch, strings, Sellers_similar);
1022
+ }
1023
+
1024
+ /*
1025
+ * call-seq: search(strings) -> results
1026
+ *
1027
+ * searches Sellers#pattern in <code>strings</code> and returns the edit
1028
+ * distance (the sum of weighted character operations) as a Float value, by
1029
+ * greedy trimming prefixes or postfixes of the match. <code>strings</code> has
1030
+ * to be either a String or an Array of Strings. The returned
1031
+ * <code>results</code> is either a Float or an Array of Floats respectively.
1032
+ */
1033
+ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
1034
+ {
1035
+ GET_STRUCT(Sellers)
1036
+ return Sellers_iterate_strings(amatch, strings, Sellers_search);
1037
+ }
1038
+
1039
+ /*
1040
+ * Document-class: Amatch::PairDistance
1041
+ *
1042
+ * The pair distance between two strings is based on the number of adjacent
1043
+ * character pairs, that are contained in both strings. The similarity
1044
+ * metric of two strings s1 and s2 is
1045
+ * 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
1046
+ * If it is 1.0 the two strings are an exact match, if less than 1.0 they
1047
+ * are more dissimilar. The advantage of considering adjacent characters, is to
1048
+ * take account not only of the characters, but also of the character ordering
1049
+ * in the original strings.
1050
+ *
1051
+ * This metric is very capable to find similarities in natural languages.
1052
+ * It is explained in more detail in Simon White's article "How to Strike a
1053
+ * Match", located at this url:
1054
+ * http://www.catalysoft.com/articles/StrikeAMatch.html
1055
+ * It is also very similar (a special case) to the method described under
1056
+ * http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
1057
+ * for Approximate String Processing."
1058
+ */
1059
+ DEF_RB_FREE(PairDistance, PairDistance)
1060
+
1061
+ /*
1062
+ * call-seq: new(pattern)
1063
+ *
1064
+ * Creates a new Amatch::PairDistance instance from <code>pattern</code>.
1065
+ */
1066
+ static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
1067
+ {
1068
+ GET_STRUCT(PairDistance)
1069
+ PairDistance_pattern_set(amatch, pattern);
1070
+ return self;
1071
+ }
1072
+
1073
+ DEF_CONSTRUCTOR(PairDistance, PairDistance)
1074
+
1075
+ /*
1076
+ * call-seq: match(strings, regexp = /\s+/) -> results
1077
+ *
1078
+ * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
1079
+ * <code>strings</code>. It returns the pair distance measure, that is a
1080
+ * returned value of 1.0 is an exact match, partial matches are lower
1081
+ * values, while 0.0 means no match at all.
1082
+ *
1083
+ * <code>strings</code> has to be either a String or an
1084
+ * Array of Strings. The argument <code>regexp</code> is used to split the
1085
+ * pattern and strings into tokens first. It defaults to /\s+/. If the
1086
+ * splitting should be omitted, call the method with nil as <code>regexp</code>
1087
+ * explicitly.
1088
+ *
1089
+ * The returned <code>results</code> is either a Float or an
1090
+ * Array of Floats respectively.
1091
+ */
1092
+ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
1093
+ {
1094
+ VALUE result, strings, regexp = Qnil;
1095
+ int use_regexp;
1096
+ GET_STRUCT(PairDistance)
1097
+
1098
+ rb_scan_args(argc, argv, "11", &strings, &regexp);
1099
+ use_regexp = NIL_P(regexp) && argc != 2;
1100
+ if (TYPE(strings) == T_STRING) {
1101
+ result = PairDistance_match(amatch, strings, regexp, use_regexp);
1102
+ } else {
1103
+ int i;
1104
+ Check_Type(strings, T_ARRAY);
1105
+ result = rb_ary_new2(RARRAY_LEN(strings));
1106
+ for (i = 0; i < RARRAY_LEN(strings); i++) {
1107
+ VALUE string = rb_ary_entry(strings, i);
1108
+ if (TYPE(string) != T_STRING) {
1109
+ rb_raise(rb_eTypeError,
1110
+ "array has to contain only strings (%s given)",
1111
+ NIL_P(string) ?
1112
+ "NilClass" :
1113
+ rb_class2name(CLASS_OF(string)));
1114
+ }
1115
+ rb_ary_push(result,
1116
+ PairDistance_match(amatch, string, regexp, use_regexp));
1117
+ }
1118
+ }
1119
+ pair_array_destroy(amatch->pattern_pair_array);
1120
+ amatch->pattern_pair_array = NULL;
1121
+ return result;
1122
+ }
1123
+
1124
+ /*
1125
+ * call-seq: pair_distance_similar(strings, regexp = nil) -> results
1126
+ *
1127
+ * If called on a String, this string is used as a Amatch::PairDistance#pattern
1128
+ * to match against <code>strings</code> using /\s+/ as the tokenizing regular
1129
+ * expression. It returns a pair distance metric number between 0.0 for very
1130
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1131
+ * either a String or an Array of Strings.
1132
+ *
1133
+ * The returned <code>results</code> is either a Float or an Array of Floats
1134
+ * respectively.
1135
+ */
1136
+ static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
1137
+ {
1138
+ VALUE amatch, string, regexp = Qnil;
1139
+ rb_scan_args(argc, argv, "11", &string, &regexp);
1140
+ amatch = rb_PairDistance_new(rb_cPairDistance, self);
1141
+ if (NIL_P(regexp)) {
1142
+ return rb_PairDistance_match(1, &string, amatch);
1143
+ } else {
1144
+ VALUE *args = alloca(2);
1145
+ args[0] = string;
1146
+ args[1] = regexp;
1147
+ return rb_PairDistance_match(2, args, amatch);
1148
+ }
1149
+ }
1150
+
1151
+ /*
1152
+ * Document-class: Amatch::Hamming
1153
+ *
1154
+ * This class computes the Hamming distance between two strings.
1155
+ *
1156
+ * The Hamming distance between two strings is the number of characters, that
1157
+ * are different. Thus a hamming distance of 0 means an exact
1158
+ * match, a hamming distance of 1 means one character is different, and so on.
1159
+ * If one string is longer than the other string, the missing characters are
1160
+ * counted as different characters.
1161
+ */
1162
+
1163
+ DEF_RB_FREE(Hamming, General)
1164
+
1165
+ /*
1166
+ * call-seq: new(pattern)
1167
+ *
1168
+ * Creates a new Amatch::Hamming instance from <code>pattern</code>.
1169
+ */
1170
+ static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
1171
+ {
1172
+ GET_STRUCT(General)
1173
+ General_pattern_set(amatch, pattern);
1174
+ return self;
1175
+ }
1176
+
1177
+ DEF_CONSTRUCTOR(Hamming, General)
1178
+
1179
+ /*
1180
+ * call-seq: match(strings) -> results
1181
+ *
1182
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1183
+ * <code>strings</code>, that is compute the hamming distance between
1184
+ * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
1185
+ * be either a String or an Array of Strings. The returned <code>results</code>
1186
+ * is either a Fixnum or an Array of Fixnums respectively.
1187
+ */
1188
+ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1189
+ {
1190
+ GET_STRUCT(General)
1191
+ return General_iterate_strings(amatch, strings, Hamming_match);
1192
+ }
1193
+
1194
+ /*
1195
+ * call-seq: similar(strings) -> results
1196
+ *
1197
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1198
+ * <code>strings</code>, and compute a Hamming distance metric number between
1199
+ * 0.0 for very unsimilar strings and 1.0 for an exact match.
1200
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1201
+ * returned <code>results</code> is either a Float or an Array of Floats
1202
+ * respectively.
1203
+ */
1204
+ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1205
+ {
1206
+ GET_STRUCT(General)
1207
+ return General_iterate_strings(amatch, strings, Hamming_similar);
1208
+ }
1209
+
1210
+ /*
1211
+ * call-seq: hamming_similar(strings) -> results
1212
+ *
1213
+ * If called on a String, this string is used as a Amatch::Hamming#pattern to
1214
+ * match against <code>strings</code>. It returns a Hamming distance metric
1215
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1216
+ * <code>strings</code>
1217
+ * has to be either a String or an Array of Strings. The returned
1218
+ * <code>results</code> is either a Float or an Array of Floats respectively.
1219
+ */
1220
+ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
1221
+ {
1222
+ VALUE amatch = rb_Hamming_new(rb_cHamming, self);
1223
+ return rb_Hamming_similar(amatch, strings);
1224
+ }
1225
+
1226
+
1227
+ /*
1228
+ * Document-class: Amatch::LongestSubsequence
1229
+ *
1230
+ * This class computes the length of the longest subsequence common to two
1231
+ * strings. A subsequence doesn't have to be contiguous. The longer the common
1232
+ * subsequence is, the more similar the two strings will be.
1233
+ *
1234
+ * The longest common subsequence between "test" and "test" is of length 4,
1235
+ * because "test" itself is this subsequence. The longest common subsequence
1236
+ * between "test" and "east" is "e", "s", "t" and the length of the
1237
+ * sequence is 3.
1238
+ */
1239
+ DEF_RB_FREE(LongestSubsequence, General)
1240
+
1241
+ /*
1242
+ * call-seq: new(pattern)
1243
+ *
1244
+ * Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
1245
+ */
1246
+ static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
1247
+ {
1248
+ GET_STRUCT(General)
1249
+ General_pattern_set(amatch, pattern);
1250
+ return self;
1251
+ }
1252
+
1253
+ DEF_CONSTRUCTOR(LongestSubsequence, General)
1254
+
1255
+ /*
1256
+ * call-seq: match(strings) -> results
1257
+ *
1258
+ * Uses this Amatch::LongestSubsequence instance to match
1259
+ * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1260
+ * length of the longest common subsequence. <code>strings</code> has to be
1261
+ * either a String or an Array of Strings. The returned <code>results</code>
1262
+ * is either a Fixnum or an Array of Fixnums respectively.
1263
+ */
1264
+ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1265
+ {
1266
+ GET_STRUCT(General)
1267
+ return General_iterate_strings(amatch, strings, LongestSubsequence_match);
1268
+ }
1269
+
1270
+ /*
1271
+ * call-seq: similar(strings) -> results
1272
+ *
1273
+ * Uses this Amatch::LongestSubsequence instance to match
1274
+ * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1275
+ * a longest subsequence distance metric number between 0.0 for very unsimilar
1276
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1277
+ * String or an Array of Strings. The returned <code>results</code> is either
1278
+ * a Float or an Array of Floats respectively.
1279
+ */
1280
+ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1281
+ {
1282
+ GET_STRUCT(General)
1283
+ return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
1284
+ }
1285
+
1286
+ /*
1287
+ * call-seq: longest_subsequence_similar(strings) -> results
1288
+ *
1289
+ * If called on a String, this string is used as a
1290
+ * Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
1291
+ * returns a longest subsequence distance metric number between 0.0 for very
1292
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1293
+ * either a String or an Array of Strings. The returned <code>results</code>
1294
+ * is either a Float or an Array of Floats respectively.
1295
+ */
1296
+ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1297
+ {
1298
+ VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
1299
+ return rb_LongestSubsequence_similar(amatch, strings);
1300
+ }
1301
+
1302
+ /*
1303
+ * Document-class: Amatch::LongestSubstring
1304
+ *
1305
+ * The longest common substring is the longest substring, that is part of
1306
+ * two strings. A substring is contiguous, while a subsequence need not to
1307
+ * be. The longer the common substring is, the more similar the two strings
1308
+ * will be.
1309
+ *
1310
+ * The longest common substring between 'string' and 'string' is 'string'
1311
+ * again, thus the longest common substring length is 6. The longest common
1312
+ * substring between 'string' and 'storing' is 'ring', thus the longest common
1313
+ * substring length is 4.
1314
+ */
1315
+
1316
+ DEF_RB_FREE(LongestSubstring, General)
1317
+
1318
+ /*
1319
+ * call-seq: new(pattern)
1320
+ *
1321
+ * Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
1322
+ */
1323
+ static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
1324
+ {
1325
+ GET_STRUCT(General)
1326
+ General_pattern_set(amatch, pattern);
1327
+ return self;
1328
+ }
1329
+
1330
+ DEF_CONSTRUCTOR(LongestSubstring, General)
1331
+
1332
+ /*
1333
+ * call-seq: match(strings) -> results
1334
+ *
1335
+ * Uses this Amatch::LongestSubstring instance to match
1336
+ * LongestSubstring#pattern against <code>strings</code>, that is compute the
1337
+ * length of the longest common substring. <code>strings</code> has to be
1338
+ * either a String or an Array of Strings. The returned <code>results</code>
1339
+ * is either a Fixnum or an Array of Fixnums respectively.
1340
+ */
1341
+ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1342
+ {
1343
+ GET_STRUCT(General)
1344
+ return General_iterate_strings(amatch, strings, LongestSubstring_match);
1345
+ }
1346
+
1347
+ /*
1348
+ * call-seq: similar(strings) -> results
1349
+ *
1350
+ * Uses this Amatch::LongestSubstring instance to match
1351
+ * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1352
+ * longest substring distance metric number between 0.0 for very unsimilar
1353
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1354
+ * String or an Array of Strings. The returned <code>results</code> is either
1355
+ * a Float or an Array of Floats
1356
+ * respectively.
1357
+ */
1358
+ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
1359
+ {
1360
+ GET_STRUCT(General)
1361
+ return General_iterate_strings(amatch, strings, LongestSubstring_similar);
1362
+ }
1363
+
1364
+ /*
1365
+ * call-seq: longest_substring_similar(strings) -> results
1366
+ *
1367
+ * If called on a String, this string is used as a
1368
+ * Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
1369
+ * returns a longest substring distance metric number between 0.0 for very
1370
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1371
+ * either a String or an Array of Strings. The returned <code>results</code>
1372
+ * is either a Float or an Array of Floats respectively.
1373
+ */
1374
+ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1375
+ {
1376
+ VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
1377
+ return rb_LongestSubstring_similar(amatch, strings);
1378
+ }
1379
+
1380
+ /*
1381
+ * Document-class: Amatch::Jaro
1382
+ *
1383
+ * This class computes the Jaro metric for two strings.
1384
+ * The Jaro metric computes the similarity between 0 (no match)
1385
+ * and 1 (exact match) by looking for matching and transposed characters.
1386
+ */
1387
+ DEF_RB_FREE(Jaro, Jaro)
1388
+
1389
+ /*
1390
+ * Document-method: ignore_case
1391
+ *
1392
+ * call-seq: ignore_case -> true/false
1393
+ *
1394
+ * Returns whether case is ignored when computing matching characters.
1395
+ */
1396
+ DEF_RB_READER(Jaro, rb_Jaro_ignore_case, ignore_case, C2BOOL)
1397
+
1398
+ /*
1399
+ * Document-method: ignore_case=
1400
+ *
1401
+ * call-seq: ignore_case=(true/false)
1402
+ *
1403
+ * Sets whether case is ignored when computing matching characters.
1404
+ */
1405
+ DEF_RB_WRITER(Jaro, rb_Jaro_ignore_case_set, ignore_case,
1406
+ int, CAST2BOOL, BOOL2C, != Qundef)
1407
+
1408
+ /*
1409
+ * call-seq: new(pattern)
1410
+ *
1411
+ * Creates a new Amatch::Jaro instance from <code>pattern</code>.
1412
+ */
1413
+ static VALUE rb_Jaro_initialize(VALUE self, VALUE pattern)
1414
+ {
1415
+ GET_STRUCT(Jaro)
1416
+ Jaro_pattern_set(amatch, pattern);
1417
+ amatch->ignore_case = 1;
1418
+ return self;
1419
+ }
1420
+
1421
+ DEF_CONSTRUCTOR(Jaro, Jaro)
1422
+
1423
+ /*
1424
+ * call-seq: match(strings) -> results
1425
+ *
1426
+ * Uses this Amatch::Jaro instance to match
1427
+ * Jaro#pattern against <code>strings</code>, that is compute the
1428
+ * jaro metric with the strings. <code>strings</code> has to be
1429
+ * either a String or an Array of Strings. The returned <code>results</code>
1430
+ * is either a Float or an Array of Floats respectively.
1431
+ */
1432
+ static VALUE rb_Jaro_match(VALUE self, VALUE strings)
1433
+ {
1434
+ GET_STRUCT(Jaro)
1435
+ return Jaro_iterate_strings(amatch, strings, Jaro_match);
1436
+ }
1437
+
1438
+ /*
1439
+ * call-seq: jaro_similar(strings) -> results
1440
+ *
1441
+ * If called on a String, this string is used as a
1442
+ * Amatch::Jaro#pattern to match against <code>strings</code>. It
1443
+ * returns a Jaro metric number between 0.0 for very
1444
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1445
+ * either a String or an Array of Strings. The returned <code>results</code>
1446
+ * is either a Float or an Array of Floats respectively.
1447
+ */
1448
+ static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
1449
+ {
1450
+ VALUE amatch = rb_Jaro_new(rb_cJaro, self);
1451
+ return rb_Jaro_match(amatch, strings);
1452
+ }
1453
+
1454
+ /*
1455
+ * Document-class: Amatch::JaroWinkler
1456
+ *
1457
+ * This class computes the Jaro-Winkler metric for two strings.
1458
+ * The Jaro-Winkler metric computes the similarity between 0 (no match)
1459
+ * and 1 (exact match) by looking for matching and transposed characters.
1460
+ *
1461
+ * It is a variant of the Jaro metric, with additional weighting towards
1462
+ * common prefixes.
1463
+ */
1464
+ DEF_RB_FREE(JaroWinkler, JaroWinkler)
1465
+
1466
+ /*
1467
+ * Document-method: ignore_case
1468
+ *
1469
+ * call-seq: ignore_case -> true/false
1470
+ *
1471
+ * Returns whether case is ignored when computing matching characters.
1472
+ * Default is true.
1473
+ */
1474
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_ignore_case, ignore_case, C2BOOL)
1475
+
1476
+ /*
1477
+ * Document-method: scaling_factor
1478
+ *
1479
+ * call-seq: scaling_factor -> weight
1480
+ *
1481
+ * The scaling factor is how much weight to give common prefixes.
1482
+ * Default is 0.1.
1483
+ */
1484
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_scaling_factor, scaling_factor, rb_float_new)
1485
+
1486
+ /*
1487
+ * Document-method: ignore_case=
1488
+ *
1489
+ * call-seq: ignore_case=(true/false)
1490
+ *
1491
+ * Sets whether case is ignored when computing matching characters.
1492
+ */
1493
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_ignore_case_set, ignore_case,
1494
+ int, CAST2BOOL, BOOL2C, != Qundef)
1495
+
1496
+ /*
1497
+ * Document-method: scaling_factor=
1498
+ *
1499
+ * call-seq: scaling_factor=(weight)
1500
+ *
1501
+ * Sets the weight to give common prefixes.
1502
+ */
1503
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_scaling_factor_set, scaling_factor,
1504
+ double, CAST2FLOAT, FLOAT2C, >= 0)
1505
+
1506
+ /*
1507
+ * call-seq: new(pattern)
1508
+ *
1509
+ * Creates a new Amatch::JaroWinkler instance from <code>pattern</code>.
1510
+ */
1511
+ static VALUE rb_JaroWinkler_initialize(VALUE self, VALUE pattern)
1512
+ {
1513
+ GET_STRUCT(JaroWinkler)
1514
+ JaroWinkler_pattern_set(amatch, pattern);
1515
+ amatch->ignore_case = 1;
1516
+ amatch->scaling_factor = 0.1;
1517
+ return self;
1518
+ }
1519
+
1520
+ DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
1521
+
1522
+ /*
1523
+ * call-seq: match(strings) -> results
1524
+ *
1525
+ * Uses this Amatch::JaroWinkler instance to match
1525
+ * JaroWinkler#pattern against <code>strings</code>, that is compute the
1526
+ * Jaro-Winkler metric with the strings. <code>strings</code> has to be
1528
+ * either a String or an Array of Strings. The returned <code>results</code>
1529
+ * is either a Float or an Array of Floats respectively.
1530
+ */
1531
+ static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
1532
+ {
1533
+ GET_STRUCT(JaroWinkler)
1534
+ return JaroWinkler_iterate_strings(amatch, strings, JaroWinkler_match);
1535
+ }
1536
+
1537
+ /*
1538
+ * call-seq: jarowinkler_similar(strings) -> results
1539
+ *
1540
+ * If called on a String, this string is used as a
1541
+ * Amatch::JaroWinkler#pattern to match against <code>strings</code>. It
1542
+ * returns a Jaro-Winkler metric number between 0.0 for very
1543
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1544
+ * either a String or an Array of Strings. The returned <code>results</code>
1545
+ * are either a Float or an Array of Floats respectively.
1546
+ */
1547
+ static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
1548
+ {
1549
+ VALUE amatch = rb_JaroWinkler_new(rb_cJaro, self);
1550
+ return rb_JaroWinkler_match(amatch, strings);
1551
+ }
1552
+
1553
+ /*
1554
+ * This is the namespace module that includes all other classes, modules, and
1555
+ * constants.
1556
+ */
1557
+
1558
+ void Init_amatch_ext()
1559
+ {
1560
+ rb_require("amatch/version");
1561
+ rb_mAmatch = rb_define_module("Amatch");
1562
+ /* This module can be mixed into ::String or its subclasses to mix in the similarity methods directly. */
1563
+ rb_mAmatchStringMethods = rb_define_module_under(rb_mAmatch, "StringMethods");
1564
+
1565
+ /* Levenshtein */
1566
+ rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
1567
+ rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
1568
+ rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
1569
+ rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0);
1570
+ rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
1571
+ rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
1572
+ rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
1573
+ rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1574
+ rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1575
+
1576
+ /* Sellers */
1577
+ rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1578
+ rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
1579
+ rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
1580
+ rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0);
1581
+ rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
1582
+ rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
1583
+ rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
1584
+ rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
1585
+ rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
1586
+ rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
1587
+ rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
1588
+ rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
1589
+ rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
1590
+ rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
1591
+ rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1);
1592
+
1593
+ /* Hamming */
1594
+ rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
1595
+ rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
1596
+ rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
1597
+ rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
1598
+ rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
1599
+ rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
1600
+ rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
1601
+ rb_define_method(rb_mAmatchStringMethods, "hamming_similar", rb_str_hamming_similar, 1);
1602
+
1603
+ /* Pair Distance Metric / Dice Coefficient */
1604
+ rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
1605
+ rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
1606
+ rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
1607
+ rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
1608
+ rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
1609
+ rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
1610
+ rb_define_alias(rb_cPairDistance, "similar", "match");
1611
+ rb_define_method(rb_mAmatchStringMethods, "pair_distance_similar", rb_str_pair_distance_similar, -1);
1612
+
1613
+ /* Longest Common Subsequence */
1614
+ rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
1615
+ rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
1616
+ rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
1617
+ rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
1618
+ rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
1619
+ rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
1620
+ rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
1621
+ rb_define_method(rb_mAmatchStringMethods, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1622
+
1623
+ /* Longest Common Substring */
1624
+ rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
1625
+ rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
1626
+ rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
1627
+ rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
1628
+ rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
1629
+ rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
1630
+ rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1631
+ rb_define_method(rb_mAmatchStringMethods, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1632
+
1633
+ /* Jaro */
1634
+ rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
1635
+ rb_define_alloc_func(rb_cJaro, rb_Jaro_s_allocate);
1636
+ rb_define_method(rb_cJaro, "initialize", rb_Jaro_initialize, 1);
1637
+ rb_define_method(rb_cJaro, "pattern", rb_Jaro_pattern, 0);
1638
+ rb_define_method(rb_cJaro, "pattern=", rb_Jaro_pattern_set, 1);
1639
+ rb_define_method(rb_cJaro, "ignore_case", rb_Jaro_ignore_case, 0);
1640
+ rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
1641
+ rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
1642
+ rb_define_alias(rb_cJaro, "similar", "match");
1643
+ rb_define_method(rb_mAmatchStringMethods, "jaro_similar", rb_str_jaro_similar, 1);
1644
+
1645
+ /* Jaro-Winkler */
1646
+ rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
1647
+ rb_define_alloc_func(rb_cJaroWinkler, rb_JaroWinkler_s_allocate);
1648
+ rb_define_method(rb_cJaroWinkler, "initialize", rb_JaroWinkler_initialize, 1);
1649
+ rb_define_method(rb_cJaroWinkler, "pattern", rb_JaroWinkler_pattern, 0);
1650
+ rb_define_method(rb_cJaroWinkler, "pattern=", rb_JaroWinkler_pattern_set, 1);
1651
+ rb_define_method(rb_cJaroWinkler, "ignore_case", rb_JaroWinkler_ignore_case, 0);
1652
+ rb_define_method(rb_cJaroWinkler, "ignore_case=", rb_JaroWinkler_ignore_case_set, 1);
1653
+ rb_define_method(rb_cJaroWinkler, "scaling_factor", rb_JaroWinkler_scaling_factor, 0);
1654
+ rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
1655
+ rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
1656
+ rb_define_alias(rb_cJaroWinkler, "similar", "match");
1657
+ rb_define_method(rb_mAmatchStringMethods, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
1658
+
1659
+ id_split = rb_intern("split");
1660
+ id_to_f = rb_intern("to_f");
1661
+ }