amatch-rbx 0.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
# vim: set filetype=ruby et sw=2 ts=2:

# Build configuration for the amatch gem, expressed in the gem_hadar DSL.
require 'gem_hadar'

GemHadar do
  # Basic gem metadata.
  name        'amatch'
  author      'Florian Frank'
  email       'flori@ping.de'
  homepage    "http://github.com/flori/#{name}"
  summary     'Approximate String Matching library'
  description <<EOT
Amatch is a library for approximate string matching and searching in strings.
Several algorithms can be used to do this, and it's also possible to compute a
similarity metric number between 0.0 and 1.0 for two given strings.
EOT
  # The gem ships one command-line tool, bin/agrep.rb.
  executables << 'agrep.rb'
  bindir      'bin'
  test_dir    'tests'
  # Editor swap files, build output and platform droppings kept out of the package.
  ignore      '.*.sw[pon]', 'pkg', 'Gemfile.lock', '.AppleDouble', '.rbx'
  title       "#{name.camelize} - Approximate Matching"
  readme      'README.rdoc'
  # Both the pure-Ruby code (lib) and the compiled extension (ext) are load paths.
  require_paths %w[lib ext]
  dependency  'tins', '~>0.3'
  development_dependency 'test-unit', '~>2.3'
  development_dependency 'utils'
  development_dependency 'rake', '~>10', '<11.0'

  # Manual installation: copy the built extension and version.rb below
  # CONFIG["sitelibdir"] (presumably RbConfig's CONFIG hash -- confirm).
  install_library do
    libdir = CONFIG["sitelibdir"]
    # First ext/amatch.* file that is neither a C source nor an object file,
    # i.e. the compiled shared library.
    src, = Dir['ext/amatch.*'].reject { |x| x =~ /\.[co]$/ }
    install(src, File.join(libdir, File.basename(src)), :verbose => true)
    mkdir_p dst = File.join(libdir, 'amatch')
    install('lib/amatch/version.rb', File.join(dst, 'version.rb'), :verbose => true)
  end
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.12
Binary file
@@ -0,0 +1,79 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'amatch'
4
+ require 'getoptlong'
5
+
6
# Prints +msg+, a usage synopsis and one line per entry of +options+
# (triples of long name, short name, GetoptLong argument kind) to standard
# output, then terminates the process with exit status 0.
def usage(msg, options)
  puts msg, "Usage: #{File.basename($0)} [OPTIONS] PATTERN [FILE ...]", ""
  options.each do |long_name, short_name, arg_kind|
    placeholder = arg_kind == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : ''
    puts " #{short_name}, #{long_name} #{placeholder}"
  end
  puts "\nReport bugs to <flori@ping.de>."
  exit 0
end
15
+
16
# Adds a length-relative variant of #search to Amatch::Levenshtein.
class Amatch::Levenshtein
  # Returns the #search distance for +strings+ divided by the pattern length,
  # as a Float.
  def search_relative(strings)
    distance = search(strings).to_f
    distance / pattern.size
  end
end
21
+
22
# Command-line driver: parse the options, build a Levenshtein matcher for the
# pattern, then print every input line whose search distance is within
# $distance. Input comes from the files named on the command line, or from
# STDIN when no file is given.

$distance = 1        # largest accepted distance (relative when --relative)
$mode     = :search  # matcher method applied to each line

begin
  parser = GetoptLong.new
  options = [
    [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
    [ '--verbose',  '-v', GetoptLong::NO_ARGUMENT ],
    [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
  ]
  parser.set_options(*options)
  parser.each_option do |name, arg|
    case name.sub(/^--/, '')
    when 'distance'
      $distance = arg.to_f
    when 'relative'
      $mode = :search_relative
    when 'verbose'
      $verbose = 1
    when 'help'
      usage('You\'ve asked for it!', options)
    end
  end
rescue
  # Option parsing failed (e.g. an unknown switch): give up with a failure status.
  exit 1
end
pattern = ARGV.shift or usage('Pattern needed!', options)

matcher = Amatch::Levenshtein.new(pattern)
size = 0
start = Time.new
if ARGV.size > 0 then
  ARGV.each do |filename|
    begin
      # stat is inside the rescue so that a missing or unreadable file is
      # reported and skipped instead of aborting the whole run.
      next unless File.stat(filename).file?
      size += File.size(filename)
      # File.foreach closes the file when done; File.open(...).each_line
      # left the handle open until garbage collection.
      File.foreach(filename) do |line|
        if matcher.__send__($mode, line) <= $distance
          puts "#{filename}:#{line}"
        end
      end
    rescue
      STDERR.puts "Failure at #{filename}: #{$!} => Skipping!"
    end
  end
else
  STDIN.each_line do |line|
    # Count bytes, as File.size does for the file branch.
    size += line.bytesize
    if matcher.__send__($mode, line) <= $distance
      puts line
    end
  end
end
time = Time.new - start
$verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
  time, size / time / 1024
exit 0
@@ -0,0 +1,1661 @@
1
+ #include "ruby.h"
2
+ #include "pair.h"
3
+ #include <ctype.h>
4
+ #include "common.h"
5
+
6
+ /*
7
+ * Document-method: pattern
8
+ *
9
+ * call-seq: pattern -> pattern string
10
+ *
11
+ * Returns the current pattern string of this instance.
12
+ */
13
+
14
+ /*
15
+ * Document-method: pattern=
16
+ *
17
+ * call-seq: pattern=(pattern)
18
+ *
19
+ * Sets the current pattern string of this instance to <code>pattern</code>.
20
+ */
21
+
22
+
23
+ static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
24
+ rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
25
+ rb_cJaro, rb_cJaroWinkler;
26
+
27
+ static ID id_split, id_to_f;
28
+
29
/*
 * Declares a local `klass *amatch` and fills it with the C struct wrapped by
 * the Ruby object `self` (via Data_Get_Struct). It expands to a declaration
 * followed by a statement, so it has to be used where `self` is in scope and
 * before any other statement of the enclosing block.
 */
#define GET_STRUCT(klass)                 \
    klass *amatch;                        \
    Data_Get_Struct(self, klass, amatch);
32
+
33
/*
 * Defines `static type *type_allocate(void)`, which returns a freshly
 * allocated, zero-filled instance of `type`. The explicit (void) gives the
 * function a real prototype instead of an unspecified parameter list.
 */
#define DEF_ALLOCATOR(type)                   \
static type *type##_allocate(void)            \
{                                             \
    type *obj = ALLOC(type);                  \
    MEMZERO(obj, type, 1);                    \
    return obj;                               \
}
40
+
41
/*
 * Defines the allocation function and a C-level constructor for `klass`:
 *
 *   rb_<klass>_s_allocate(klass2)    wraps a zeroed `type` struct in a Ruby
 *                                    object of class klass2, with no mark
 *                                    function and rb_<klass>_free as the
 *                                    free function;
 *   rb_<klass>_new(klass2, pattern)  allocates such an object and runs
 *                                    rb_<klass>_initialize on it.
 *
 * rb_<klass>_free and rb_<klass>_initialize must already be defined where
 * this macro is expanded.
 */
#define DEF_CONSTRUCTOR(klass, type)                                        \
static VALUE rb_##klass##_s_allocate(VALUE klass2)                          \
{                                                                           \
    type *amatch = type##_allocate();                                       \
    return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch);       \
}                                                                           \
VALUE rb_##klass##_new(VALUE klass2, VALUE pattern)                         \
{                                                                           \
    VALUE obj = rb_##klass##_s_allocate(klass2);                            \
    rb_##klass##_initialize(obj, pattern);                                  \
    return obj;                                                             \
}
53
+
54
/*
 * Defines `static void rb_<klass>_free(type *amatch)`, the free function
 * handed to Data_Wrap_Struct. It wipes and releases the pattern buffer, then
 * wipes and releases the struct itself. The pattern is still NULL when the
 * object was allocated but never initialised, so it is only touched when set.
 */
#define DEF_RB_FREE(klass, type)                                    \
static void rb_##klass##_free(type *amatch)                         \
{                                                                   \
    if (amatch->pattern) {                                          \
        MEMZERO(amatch->pattern, char, amatch->pattern_len);        \
        xfree(amatch->pattern);                                     \
    }                                                               \
    MEMZERO(amatch, type, 1);                                       \
    xfree(amatch);                                                  \
}
62
+
63
/*
 * Defines the pattern accessors for `type`:
 *
 *   type_pattern_set(amatch, pattern)    copies the bytes of the Ruby String
 *                                        `pattern` into the struct (raises
 *                                        TypeError for non-strings);
 *   rb_<type>_pattern(self)              returns the pattern as a new String;
 *   rb_<type>_pattern_set(self, pattern) Ruby-level setter, returns nil.
 *
 * The new buffer is allocated and filled before the old one is released, so
 * a failing allocation cannot leave amatch->pattern pointing at freed memory.
 */
#define DEF_PATTERN_ACCESSOR(type)                                  \
static void type##_pattern_set(type *amatch, VALUE pattern)         \
{                                                                   \
    char *new_pattern;                                              \
    int new_len;                                                    \
    Check_Type(pattern, T_STRING);                                  \
    new_len = (int) RSTRING_LEN(pattern);                           \
    new_pattern = ALLOC_N(char, new_len);                           \
    MEMCPY(new_pattern, RSTRING_PTR(pattern), char, new_len);       \
    xfree(amatch->pattern);                                         \
    amatch->pattern = new_pattern;                                  \
    amatch->pattern_len = new_len;                                  \
}                                                                   \
static VALUE rb_##type##_pattern(VALUE self)                        \
{                                                                   \
    GET_STRUCT(type)                                                \
    return rb_str_new(amatch->pattern, amatch->pattern_len);        \
}                                                                   \
static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern)     \
{                                                                   \
    GET_STRUCT(type)                                                \
    type##_pattern_set(amatch, pattern);                            \
    return Qnil;                                                    \
}
84
+
85
/*
 * Defines `type_iterate_strings(amatch, strings, match_function)`.
 * A String argument is passed straight to match_function and its result is
 * returned. Anything else must be an Array whose elements are all Strings;
 * match_function is applied to each element and the results are returned in
 * a new Array. Raises TypeError for other argument or element types.
 * The index is a long, matching the type of RARRAY_LEN.
 */
#define DEF_ITERATE_STRINGS(type)                                   \
static VALUE type##_iterate_strings(type *amatch, VALUE strings,    \
    VALUE (*match_function) (type *amatch, VALUE strings))          \
{                                                                   \
    if (TYPE(strings) == T_STRING) {                                \
        return match_function(amatch, strings);                     \
    } else {                                                        \
        long i;                                                     \
        VALUE result;                                               \
        Check_Type(strings, T_ARRAY);                               \
        result = rb_ary_new2(RARRAY_LEN(strings));                  \
        for (i = 0; i < RARRAY_LEN(strings); i++) {                 \
            VALUE string = rb_ary_entry(strings, i);                \
            if (TYPE(string) != T_STRING) {                         \
                rb_raise(rb_eTypeError,                             \
                    "array has to contain only strings (%s given)", \
                    NIL_P(string) ?                                 \
                        "NilClass" :                                \
                        rb_class2name(CLASS_OF(string)));           \
            }                                                       \
            rb_ary_push(result, match_function(amatch, string));    \
        }                                                           \
        return result;                                              \
    }                                                               \
}
110
+
111
/*
 * Defines the Ruby method `function(self)`, which reads field `name` of the
 * wrapped `type` struct and returns it converted to a VALUE by `converter`.
 */
#define DEF_RB_READER(type, function, name, converter)              \
VALUE function(VALUE self)                                          \
{                                                                   \
    GET_STRUCT(type)                                                \
    return converter(amatch->name);                                 \
}
117
+
118
/*
 * Defines the Ruby method `function(self, value)`, which stores `value` in
 * field `name` of the wrapped `type` struct and returns nil.
 *
 *   caster     statement macro that coerces `value` in place (e.g. CAST2FLOAT)
 *   converter  turns the coerced VALUE into the C type `vtype`
 *   check      token sequence appended to the converted value, e.g. `>= 0`;
 *              when the resulting expression is false a TypeError is raised
 *              whose message quotes the check.
 */
#define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
VALUE function(VALUE self, VALUE value)                                     \
{                                                                           \
    vtype value_ ## vtype;                                                  \
    GET_STRUCT(type)                                                        \
    caster(value);                                                          \
    value_ ## vtype = converter(value);                                     \
    if (!(value_ ## vtype check))                                           \
        rb_raise(rb_eTypeError, "check of value " #check " failed");        \
    amatch->name = value_ ## vtype;                                         \
    return Qnil;                                                            \
}
130
+
131
+
132
/*
 * Coerces `obj` (an lvalue VALUE) to a Float in place. Non-Float objects
 * that respond to to_f are converted by calling it. Whatever the outcome,
 * the value is then type-checked, so an object without to_f, or one whose
 * to_f does not return a Float, raises TypeError instead of reaching
 * RFLOAT_VALUE with a non-Float. The do/while wrapper makes the macro behave
 * as a single statement; use it with a trailing semicolon.
 */
#define CAST2FLOAT(obj)                                             \
    do {                                                            \
        if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f))    \
            obj = rb_funcall(obj, id_to_f, 0);                      \
        Check_Type(obj, T_FLOAT);                                   \
    } while (0)
/* Extracts the C double from a Float VALUE. */
#define FLOAT2C(obj) (RFLOAT_VALUE(obj))
138
+
139
/*
 * Normalises `obj` (an lvalue VALUE) in place: false and nil become Qfalse,
 * everything else becomes Qtrue. The macro carries its own terminating
 * semicolon.
 */
#define CAST2BOOL(obj) \
    obj = RTEST(obj) ? Qtrue : Qfalse;
/* C truth value of a normalised boolean VALUE. */
#define BOOL2C(obj) ((obj) == Qtrue)
/* Ruby true/false for a C truth value. */
#define C2BOOL(obj) ((obj) ? Qtrue : Qfalse)
146
+
147
/*
 * Sets a_ptr/a_len and b_ptr/b_len from the pattern and the Ruby String
 * `string` so that a_* is never the longer of the two: the pattern becomes
 * `a` only when it is strictly shorter than the string, otherwise the string
 * does. Expects a_ptr, a_len, b_ptr, b_len, amatch and string in scope.
 */
#define OPTIMIZE_TIME                                   \
    if (amatch->pattern_len < RSTRING_LEN(string)) {    \
        a_ptr = amatch->pattern;                        \
        a_len = (int) amatch->pattern_len;              \
        b_ptr = RSTRING_PTR(string);                    \
        b_len = (int) RSTRING_LEN(string);              \
    } else {                                            \
        a_ptr = RSTRING_PTR(string);                    \
        a_len = (int) RSTRING_LEN(string);              \
        b_ptr = amatch->pattern;                        \
        b_len = (int) amatch->pattern_len;              \
    }
159
+
160
/*
 * Sets a_* to the pattern and b_* to the Ruby String `string`, always in
 * that order. Expects the same variables in scope as OPTIMIZE_TIME. The last
 * line ends in a continuation, so the blank line after it belongs to the
 * macro.
 */
#define DONT_OPTIMIZE                                   \
        a_ptr = amatch->pattern;                        \
        a_len = (int) amatch->pattern_len;              \
        b_ptr = RSTRING_PTR(string);                    \
        b_len = (int) RSTRING_LEN(string);              \

166
+ /*
167
+ * C structures of the Amatch classes
168
+ */
169
+
170
/* State of the matchers that only need a pattern. */
typedef struct GeneralStruct {
    char        *pattern;       /* copy of the pattern bytes, not NUL-terminated */
    int         pattern_len;    /* number of bytes in pattern */
} General;

DEF_ALLOCATOR(General)
DEF_PATTERN_ACCESSOR(General)
DEF_ITERATE_STRINGS(General)
178
+
179
/* State of the Sellers matcher: a pattern plus one weight per edit operation. */
typedef struct SellersStruct {
    char        *pattern;       /* copy of the pattern bytes, not NUL-terminated */
    int         pattern_len;    /* number of bytes in pattern */
    double      substitution;   /* cost of substituting a character */
    double      deletion;       /* cost of deleting a character */
    double      insertion;      /* cost of inserting a character */
} Sellers;

DEF_ALLOCATOR(Sellers)
DEF_PATTERN_ACCESSOR(Sellers)
DEF_ITERATE_STRINGS(Sellers)
190
+
191
+ static void Sellers_reset_weights(Sellers *self)
192
+ {
193
+ self->substitution = 1.0;
194
+ self->deletion = 1.0;
195
+ self->insertion = 1.0;
196
+ }
197
+
198
/* State of the pair distance matcher. */
typedef struct PairDistanceStruct {
    char        *pattern;               /* copy of the pattern bytes, not NUL-terminated */
    int         pattern_len;            /* number of bytes in pattern */
    PairArray   *pattern_pair_array;    /* pair array built from the pattern on
                                           first use; NULL until then */
} PairDistance;

DEF_ALLOCATOR(PairDistance)
DEF_PATTERN_ACCESSOR(PairDistance)
206
+
207
/* State of the Jaro matcher. */
typedef struct JaroStruct {
    char        *pattern;       /* copy of the pattern bytes, not NUL-terminated */
    int         pattern_len;    /* number of bytes in pattern */
    int         ignore_case;    /* non-zero: compare case-insensitively */
} Jaro;

DEF_ALLOCATOR(Jaro)
DEF_PATTERN_ACCESSOR(Jaro)
DEF_ITERATE_STRINGS(Jaro)
216
+
217
/* State of the Jaro-Winkler matcher. */
typedef struct JaroWinklerStruct {
    char        *pattern;       /* copy of the pattern bytes, not NUL-terminated */
    int         pattern_len;    /* number of bytes in pattern */
    int         ignore_case;    /* non-zero: compare case-insensitively */
    double      scaling_factor; /* weight given to each matching leading character */
} JaroWinkler;

DEF_ALLOCATOR(JaroWinkler)
DEF_PATTERN_ACCESSOR(JaroWinkler)
DEF_ITERATE_STRINGS(JaroWinkler)
227
+
228
+ /*
229
+ * Levenshtein edit distances are computed here:
230
+ */
231
+
232
/*
 * Dynamic-programming core shared by the Levenshtein functions. The caller
 * provides a_ptr/a_len, b_ptr/b_len, two rows v[0] and v[1] of b_len + 1
 * ints (already initialised), and the scratch variables i, j, c, p, weight.
 * One row is filled per character of `a`, alternating between the two
 * buffers. When the loop ends, p indexes the row written last (p stays 1
 * when a_len is 0), so callers read their result from v[p].
 */
#define COMPUTE_LEVENSHTEIN_DISTANCE                                        \
    for (i = 1, c = 0, p = 1; i <= a_len; i++) {                            \
        c = i % 2;                      /* current row */                   \
        p = (i + 1) % 2;                /* previous row */                  \
        v[c][0] = i;                    /* first column */                  \
        for (j = 1; j <= b_len; j++) {                                      \
            /* Bellman's principle of optimality: */                        \
            weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1);  \
            if (weight > v[p][j] + 1) {                                     \
                 weight = v[p][j] + 1;                                      \
            }                                                               \
            if (weight > v[c][j - 1] + 1) {                                 \
                weight = v[c][j - 1] + 1;                                   \
            }                                                               \
            v[c][j] = weight;                                               \
        }                                                                   \
        p = c;                                                              \
        c = (c + 1) % 2;                                                    \
    }
251
+
252
+ static VALUE Levenshtein_match(General *amatch, VALUE string)
253
+ {
254
+ VALUE result;
255
+ char *a_ptr, *b_ptr;
256
+ int a_len, b_len;
257
+ int *v[2], weight;
258
+ int i, j, c, p;
259
+
260
+ Check_Type(string, T_STRING);
261
+ DONT_OPTIMIZE
262
+
263
+ v[0] = ALLOC_N(int, b_len + 1);
264
+ v[1] = ALLOC_N(int, b_len + 1);
265
+ for (i = 0; i <= b_len; i++) {
266
+ v[0][i] = i;
267
+ v[1][i] = i;
268
+ }
269
+
270
+ COMPUTE_LEVENSHTEIN_DISTANCE
271
+
272
+ result = INT2FIX(v[p][b_len]);
273
+
274
+ xfree(v[0]);
275
+ xfree(v[1]);
276
+
277
+ return result;
278
+ }
279
+
280
+ static VALUE Levenshtein_similar(General *amatch, VALUE string)
281
+ {
282
+ VALUE result;
283
+ char *a_ptr, *b_ptr;
284
+ int a_len, b_len;
285
+ int *v[2], weight;
286
+ int i, j, c, p;
287
+
288
+ Check_Type(string, T_STRING);
289
+ DONT_OPTIMIZE
290
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
291
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
292
+ v[0] = ALLOC_N(int, b_len + 1);
293
+ v[1] = ALLOC_N(int, b_len + 1);
294
+ for (i = 0; i <= b_len; i++) {
295
+ v[0][i] = i;
296
+ v[1][i] = i;
297
+ }
298
+
299
+ COMPUTE_LEVENSHTEIN_DISTANCE
300
+
301
+ if (b_len > a_len) {
302
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
303
+ } else {
304
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
305
+ }
306
+ xfree(v[0]);
307
+ xfree(v[1]);
308
+ return result;
309
+ }
310
+
311
+ static VALUE Levenshtein_search(General *amatch, VALUE string)
312
+ {
313
+ VALUE result;
314
+ char *a_ptr, *b_ptr;
315
+ int a_len, b_len;
316
+ int *v[2], weight, min;
317
+ int i, j, c, p;
318
+
319
+ Check_Type(string, T_STRING);
320
+ DONT_OPTIMIZE
321
+
322
+ v[0] = ALLOC_N(int, b_len + 1);
323
+ v[1] = ALLOC_N(int, b_len + 1);
324
+ MEMZERO(v[0], int, b_len + 1);
325
+ MEMZERO(v[1], int, b_len + 1);
326
+
327
+ COMPUTE_LEVENSHTEIN_DISTANCE
328
+
329
+ for (i = 0, min = a_len; i <= b_len; i++) {
330
+ if (v[p][i] < min) min = v[p][i];
331
+ }
332
+
333
+ result = INT2FIX(min);
334
+
335
+ xfree(v[0]);
336
+ xfree(v[1]);
337
+
338
+ return result;
339
+ }
340
+
341
+
342
+ /*
343
+ * Sellers edit distances are computed here:
344
+ */
345
+
346
/*
 * Weighted counterpart of COMPUTE_LEVENSHTEIN_DISTANCE: the same two-row
 * scheme on rows of doubles, with the costs taken from amatch->substitution,
 * amatch->insertion and amatch->deletion. After the loop p indexes the row
 * written last (p stays 1 when a_len is 0).
 *
 * NOTE(review): the first column is charged with `deletion`, whereas the
 * same vertical step inside the table (v[p][j] -> v[c][j]) is charged with
 * `insertion`. This only matters when the two weights differ -- confirm
 * which one is intended.
 */
#define COMPUTE_SELLERS_DISTANCE                                            \
    for (i = 1, c = 0, p = 1; i <= a_len; i++) {                            \
        c = i % 2;                      /* current row */                   \
        p = (i + 1) % 2;                /* previous row */                  \
        v[c][0] = i * amatch->deletion; /* first column */                  \
        for (j = 1; j <= b_len; j++) {                                      \
            /* Bellman's principle of optimality: */                        \
            weight = v[p][j - 1] +                                          \
                (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution);  \
            if (weight > v[p][j] + amatch->insertion) {                     \
                 weight = v[p][j] + amatch->insertion;                      \
            }                                                               \
            if (weight > v[c][j - 1] + amatch->deletion) {                  \
                weight = v[c][j - 1] + amatch->deletion;                    \
            }                                                               \
            v[c][j] = weight;                                               \
        }                                                                   \
        p = c;                                                              \
        c = (c + 1) % 2;                                                    \
    }
366
+
367
+ static VALUE Sellers_match(Sellers *amatch, VALUE string)
368
+ {
369
+ VALUE result;
370
+ char *a_ptr, *b_ptr;
371
+ int a_len, b_len;
372
+ double *v[2], weight;
373
+ int i, j, c, p;
374
+
375
+ Check_Type(string, T_STRING);
376
+ DONT_OPTIMIZE
377
+
378
+ v[0] = ALLOC_N(double, b_len + 1);
379
+ v[1] = ALLOC_N(double, b_len + 1);
380
+ for (i = 0; i <= b_len; i++) {
381
+ v[0][i] = i * amatch->deletion;
382
+ v[1][i] = i * amatch->deletion;
383
+ }
384
+
385
+ COMPUTE_SELLERS_DISTANCE
386
+
387
+ result = rb_float_new(v[p][b_len]);
388
+ xfree(v[0]);
389
+ xfree(v[1]);
390
+ return result;
391
+ }
392
+
393
+ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
394
+ {
395
+ VALUE result;
396
+ char *a_ptr, *b_ptr;
397
+ int a_len, b_len;
398
+ double *v[2], weight, max_weight;
399
+ int i, j, c, p;
400
+
401
+ if (amatch->insertion >= amatch->deletion) {
402
+ if (amatch->substitution >= amatch->insertion) {
403
+ max_weight = amatch->substitution;
404
+ } else {
405
+ max_weight = amatch->insertion;
406
+ }
407
+ } else {
408
+ if (amatch->substitution >= amatch->deletion) {
409
+ max_weight = amatch->substitution;
410
+ } else {
411
+ max_weight = amatch->deletion;
412
+ }
413
+ }
414
+
415
+ Check_Type(string, T_STRING);
416
+ DONT_OPTIMIZE
417
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
418
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
419
+ v[0] = ALLOC_N(double, b_len + 1);
420
+ v[1] = ALLOC_N(double, b_len + 1);
421
+ for (i = 0; i <= b_len; i++) {
422
+ v[0][i] = i * amatch->deletion;
423
+ v[1][i] = i * amatch->deletion;
424
+ }
425
+
426
+ COMPUTE_SELLERS_DISTANCE
427
+
428
+ if (b_len > a_len) {
429
+ result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
430
+ } else {
431
+ result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
432
+ }
433
+ xfree(v[0]);
434
+ xfree(v[1]);
435
+ return result;
436
+ }
437
+
438
+ static VALUE Sellers_search(Sellers *amatch, VALUE string)
439
+ {
440
+ VALUE result;
441
+ char *a_ptr, *b_ptr;
442
+ int a_len, b_len;
443
+ double *v[2], weight, min;
444
+ int i, j, c, p;
445
+
446
+ Check_Type(string, T_STRING);
447
+ DONT_OPTIMIZE
448
+
449
+ v[0] = ALLOC_N(double, b_len + 1);
450
+ v[1] = ALLOC_N(double, b_len + 1);
451
+ MEMZERO(v[0], double, b_len + 1);
452
+ MEMZERO(v[1], double, b_len + 1);
453
+
454
+ COMPUTE_SELLERS_DISTANCE
455
+
456
+ for (i = 0, min = a_len; i <= b_len; i++) {
457
+ if (v[p][i] < min) min = v[p][i];
458
+ }
459
+ result = rb_float_new(min);
460
+ xfree(v[0]);
461
+ xfree(v[1]);
462
+
463
+ return result;
464
+ }
465
+
466
+ /*
467
+ * Pair distances are computed here:
468
+ */
469
+
470
/*
 * Returns the pair distance between the pattern and `string` as a Float.
 *
 * When `regexp` is non-nil or `use_regexp` is set, both the pattern and the
 * string are tokenised by calling String#split(regexp); otherwise each is
 * treated as one single token. The pattern's pair array is built on the
 * first call and only reactivated on later calls; the string's pair array
 * is built for this call and destroyed before returning.
 *
 * NOTE(review): the cached pattern pair array is reused whatever `regexp`
 * is passed on later calls -- presumably it is reset elsewhere when the
 * pattern or the tokenisation changes; confirm.
 *
 * Raises TypeError unless `string` is a String.
 */
static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
{
    double result;
    VALUE tokens;
    PairArray *pair_array;

    Check_Type(string, T_STRING);
    if (!NIL_P(regexp) || use_regexp) {
        /* Tokenise pattern and string with the caller's separator. */
        tokens = rb_funcall(
            rb_str_new(amatch->pattern, amatch->pattern_len),
            id_split, 1, regexp
        );
        if (!amatch->pattern_pair_array) {
            amatch->pattern_pair_array = PairArray_new(tokens);
        } else {
            pair_array_reactivate(amatch->pattern_pair_array);
        }
        tokens = rb_funcall(string, id_split, 1, regexp);
        pair_array = PairArray_new(tokens);
    } else {
        /* No tokenising: wrap each whole string in a one-element array. */
        VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
        tokens = rb_ary_new4(1, &tmp);
        if (!amatch->pattern_pair_array) {
            amatch->pattern_pair_array = PairArray_new(tokens);
        } else {
            pair_array_reactivate(amatch->pattern_pair_array);
        }
        tokens = rb_ary_new4(1, &string);
        pair_array = PairArray_new(tokens);
    }
    result = pair_array_match(amatch->pattern_pair_array, pair_array);
    pair_array_destroy(pair_array);
    return rb_float_new(result);
}
504
+
505
+ /*
506
+ * Hamming distances are computed here:
507
+ */
508
+
509
/*
 * Counts the positions at which `a` and `b` differ and stores the count in
 * `result`. The count starts at b_len - a_len, so surplus characters of `b`
 * count as mismatches. The i >= b_len branch handles an `a` that is longer
 * than `b`; the callers in this file set the inputs up with OPTIMIZE_TIME,
 * which never produces that case.
 */
#define COMPUTE_HAMMING_DISTANCE                            \
    for (i = 0, result = b_len - a_len; i < a_len; i++) {   \
        if (i >= b_len) {                                   \
            result +=  a_len - b_len;                       \
            break;                                          \
        }                                                   \
        if (b_ptr[i] != a_ptr[i]) result++;                 \
    }
517
+
518
+ static VALUE Hamming_match(General *amatch, VALUE string)
519
+ {
520
+ char *a_ptr, *b_ptr;
521
+ int a_len, b_len;
522
+ int i, result;
523
+
524
+ Check_Type(string, T_STRING);
525
+ OPTIMIZE_TIME
526
+ COMPUTE_HAMMING_DISTANCE
527
+ return INT2FIX(result);
528
+ }
529
+
530
+ static VALUE Hamming_similar(General *amatch, VALUE string)
531
+ {
532
+ char *a_ptr, *b_ptr;
533
+ int a_len, b_len;
534
+ int i, result;
535
+
536
+ Check_Type(string, T_STRING);
537
+ OPTIMIZE_TIME
538
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
539
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
540
+ COMPUTE_HAMMING_DISTANCE
541
+ return rb_float_new(1.0 - ((double) result) / b_len);
542
+ }
543
+
544
+ /*
545
+ * Longest Common Subsequence computation
546
+ */
547
+
548
/*
 * Computes the length of the longest common subsequence of `a` and `b` into
 * `result`. Two rows of b_len + 1 ints are allocated here, filled from the
 * ends of both inputs towards their starts (the border cells i == a_len and
 * j == b_len are set to 0 inside the loop), and freed again before the
 * macro ends. The caller provides a_ptr/a_len, b_ptr/b_len, l[2], i, j, c, p
 * and result.
 */
#define COMPUTE_LONGEST_SUBSEQUENCE                         \
    l[0] = ALLOC_N(int, b_len + 1);                         \
    l[1] = ALLOC_N(int, b_len + 1);                         \
    for (i = a_len, c = 0, p = 1; i >= 0; i--) {            \
        for (j = b_len; j >= 0; j--) {                      \
            if (i == a_len || j == b_len) {                 \
                l[c][j] = 0;                                \
            } else if (a_ptr[i] == b_ptr[j]) {              \
                l[c][j] = 1 + l[p][j + 1];                  \
            } else {                                        \
                int x = l[p][j], y = l[c][j + 1];           \
                if (x > y) l[c][j] = x; else l[c][j] = y;   \
            }                                               \
        }                                                   \
        p = c;                                              \
        c = (c + 1) % 2;                                    \
    }                                                       \
    result = l[p][0];                                       \
    xfree(l[0]);                                            \
    xfree(l[1]);
568
+
569
+
570
+ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
571
+ {
572
+ char *a_ptr, *b_ptr;
573
+ int a_len, b_len;
574
+ int result, c, p, i, j, *l[2];
575
+
576
+ Check_Type(string, T_STRING);
577
+ OPTIMIZE_TIME
578
+
579
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
580
+ COMPUTE_LONGEST_SUBSEQUENCE
581
+ return INT2FIX(result);
582
+ }
583
+
584
+ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
585
+ {
586
+ char *a_ptr, *b_ptr;
587
+ int a_len, b_len;
588
+ int result, c, p, i, j, *l[2];
589
+
590
+ Check_Type(string, T_STRING);
591
+ OPTIMIZE_TIME
592
+
593
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
594
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
595
+ COMPUTE_LONGEST_SUBSEQUENCE
596
+ return rb_float_new(((double) result) / b_len);
597
+ }
598
+
599
+ /*
600
+ * Longest Common Substring computation
601
+ */
602
+
603
/*
 * Computes the length of the longest common substring of `a` and `b` into
 * `result`. l[c][j] holds the length of the common run ending at a[i] and
 * b[j]; `result` tracks the largest run seen. Two zeroed rows of b_len ints
 * are allocated and freed inside the macro, so b_len should be non-zero (the
 * callers in this file return early for empty inputs). The caller provides
 * a_ptr/a_len, b_ptr/b_len, l[2], i, j, c, p and result.
 */
#define COMPUTE_LONGEST_SUBSTRING                           \
    l[0] = ALLOC_N(int, b_len);                             \
    MEMZERO(l[0], int, b_len);                              \
    l[1] = ALLOC_N(int, b_len);                             \
    MEMZERO(l[1], int, b_len);                              \
    result = 0;                                             \
    for (i = 0, c = 0, p = 1; i < a_len; i++) {             \
        for (j = 0; j < b_len; j++) {                       \
            if (a_ptr[i] == b_ptr[j]) {                     \
                l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1];     \
                if (l[c][j] > result) result = l[c][j];     \
            } else {                                        \
                l[c][j] = 0;                                \
            }                                               \
        }                                                   \
        p = c;                                              \
        c = (c + 1) % 2;                                    \
    }                                                       \
    xfree(l[0]);                                            \
    xfree(l[1]);
623
+
624
+ static VALUE LongestSubstring_match(General *amatch, VALUE string)
625
+ {
626
+ char *a_ptr, *b_ptr;
627
+ int a_len, b_len;
628
+ int result, c, p, i, j, *l[2];
629
+
630
+ Check_Type(string, T_STRING);
631
+ OPTIMIZE_TIME
632
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
633
+ COMPUTE_LONGEST_SUBSTRING
634
+ return INT2FIX(result);
635
+ }
636
+
637
+ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
638
+ {
639
+ char *a_ptr, *b_ptr;
640
+ int a_len, b_len;
641
+ int result, c, p, i, j, *l[2];
642
+
643
+ Check_Type(string, T_STRING);
644
+ OPTIMIZE_TIME
645
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
646
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
647
+ COMPUTE_LONGEST_SUBSTRING
648
+ return rb_float_new(((double) result) / b_len);
649
+ }
650
+
651
+ /*
652
+ * Jaro computation
653
+ */
654
+
655
/*
 * Computes the Jaro similarity of `a` and `b` into `result` (a double).
 *
 * l[0] and l[1] flag the characters of `a` and `b` that have been matched.
 * A character of `a` may match an equal, still unmatched character of `b`
 * at most max_dist positions away. m counts the matches, and t ends up as
 * half the number of matched characters that appear in a different order.
 *
 * max_dist is clamped at 0: for two one-character inputs the formula yields
 * -1, which would leave the search window empty and score identical
 * one-character strings as 0.0.
 *
 * Both lengths must be non-zero. The caller provides a_ptr/a_len,
 * b_ptr/b_len, l[2], max_dist, m, t, i, j, k, low, high and result; the two
 * flag arrays are allocated and freed inside the macro.
 */
#define COMPUTE_JARO                                                                \
    l[0] = ALLOC_N(int, a_len);                                                     \
    MEMZERO(l[0], int, a_len);                                                      \
    l[1] = ALLOC_N(int, b_len);                                                     \
    MEMZERO(l[1], int, b_len);                                                      \
    max_dist = ((a_len > b_len ? a_len : b_len) / 2) - 1;                           \
    if (max_dist < 0) max_dist = 0;                                                 \
    m = 0;                                                                          \
    for (i = 0; i < a_len; i++) {                                                   \
        low = (i > max_dist ? i - max_dist : 0);                                    \
        high = (i + max_dist < b_len ? i + max_dist : b_len - 1);                   \
        for (j = low; j <= high; j++) {                                             \
            if (!l[1][j] && a_ptr[i] == b_ptr[j]) {                                 \
                l[0][i] = 1;                                                        \
                l[1][j] = 1;                                                        \
                m++;                                                                \
                break;                                                              \
            }                                                                       \
        }                                                                           \
    }                                                                               \
    if (m == 0) {                                                                   \
        result = 0.0;                                                               \
    } else {                                                                        \
        k = t = 0;                                                                  \
        for (i = 0; i < a_len; i++) {                                               \
            if (l[0][i]) {                                                          \
                /* next matched character of b, in order */                         \
                for (j = k; j < b_len; j++) {                                       \
                    if (l[1][j]) {                                                  \
                        k = j + 1;                                                  \
                        break;                                                      \
                    }                                                               \
                }                                                                   \
                if (a_ptr[i] != b_ptr[j]) {                                         \
                    t++;                                                            \
                }                                                                   \
            }                                                                       \
        }                                                                           \
        t = t / 2;                                                                  \
        result = (((double)m)/a_len + ((double)m)/b_len + ((double)(m-t))/m)/3.0;   \
    }                                                                               \
    xfree(l[0]);                                                                    \
    xfree(l[1]);
696
+
697
+
698
/*
 * Replaces a_ptr and b_ptr with freshly allocated, case-folded copies of the
 * two inputs, so that later comparisons ignore case. Despite the name, the
 * copies are folded to upper case. The caller must xfree() a_ptr and b_ptr
 * afterwards. The macro declares locals and therefore has to open a block.
 *
 * Characters are cast to unsigned char before being handed to the <ctype.h>
 * functions: passing a negative plain char (any byte >= 0x80 where char is
 * signed) is undefined behaviour.
 */
#define LOWERCASE_STRINGS                                                   \
     char *ying, *yang;                                                     \
     ying = ALLOC_N(char, a_len);                                           \
     MEMCPY(ying, a_ptr, char, a_len);                                      \
     a_ptr = ying;                                                          \
     yang = ALLOC_N(char, b_len);                                           \
     MEMCPY(yang, b_ptr, char, b_len);                                      \
     b_ptr = yang;                                                          \
     for (i = 0; i < a_len; i++) {                                          \
         if (islower((unsigned char) a_ptr[i]))                             \
             a_ptr[i] = (char) toupper((unsigned char) a_ptr[i]);           \
     }                                                                      \
     for (i = 0; i < b_len; i++) {                                          \
         if (islower((unsigned char) b_ptr[i]))                             \
             b_ptr[i] = (char) toupper((unsigned char) b_ptr[i]);           \
     }
712
+
713
+ static VALUE Jaro_match(Jaro *amatch, VALUE string)
714
+ {
715
+ char *a_ptr, *b_ptr;
716
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high;
717
+ int *l[2];
718
+ double result;
719
+
720
+ Check_Type(string, T_STRING);
721
+ OPTIMIZE_TIME
722
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
723
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
724
+ if (amatch->ignore_case) {
725
+ LOWERCASE_STRINGS
726
+ }
727
+ COMPUTE_JARO
728
+ if (amatch->ignore_case) {
729
+ xfree(a_ptr);
730
+ xfree(b_ptr);
731
+ }
732
+ return rb_float_new(result);
733
+ }
734
+
735
+ /*
736
+ * Jaro-Winkler computation
737
+ */
738
+
739
+ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
740
+ {
741
+ char *a_ptr, *b_ptr;
742
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high, n;
743
+ int *l[2];
744
+ double result;
745
+
746
+ Check_Type(string, T_STRING);
747
+ OPTIMIZE_TIME
748
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
749
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
750
+ if (amatch->ignore_case) {
751
+ LOWERCASE_STRINGS
752
+ }
753
+ COMPUTE_JARO
754
+ n = 0;
755
+ for (i = 0; i < (a_len >= 4 ? 4 : a_len); i++) {
756
+ if (a_ptr[i] == b_ptr[i]) {
757
+ n++;
758
+ } else {
759
+ break;
760
+ }
761
+ }
762
+ result = result + n*amatch->scaling_factor*(1-result);
763
+ if (amatch->ignore_case) {
764
+ xfree(a_ptr);
765
+ xfree(b_ptr);
766
+ }
767
+ return rb_float_new(result);
768
+ }
769
+
770
+ /*
771
+ * Ruby API
772
+ */
773
+
774
+ /*
775
+ * Document-class: Amatch::Levenshtein
776
+ *
777
+ * The Levenshtein edit distance is defined as the minimal costs involved to
778
+ * transform one string into another by using three elementary operations:
779
+ * deletion, insertion and substitution of a character. To transform "water"
780
+ * into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
781
+ * -> "n": "winer" and delete "r": "wine". The edit distance between "water"
782
+ * and "wine" is 3, because you have to apply three operations. The edit
783
+ * distance between "wine" and "wine" is 0 of course: no operation is
784
+ * necessary for the transformation -- they're already the same string. It's
785
+ * easy to see that more similar strings have smaller edit distances than
786
+ * strings that differ a lot.
787
+ */
788
+
789
+ DEF_RB_FREE(Levenshtein, General)
790
+
791
+ /*
792
+ * call-seq: new(pattern)
793
+ *
794
+ * Creates a new Amatch::Levenshtein instance from <code>pattern</code>.
795
+ */
796
+ static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern)
797
+ {
798
+ GET_STRUCT(General)
799
+ General_pattern_set(amatch, pattern);
800
+ return self;
801
+ }
802
+
803
+ DEF_CONSTRUCTOR(Levenshtein, General)
804
+
805
/*
 * call-seq: match(strings) -> results
 *
 * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
 * against <code>strings</code>. It returns the number of operations, the
 * Levenshtein distance. <code>strings</code> has to be either a String or an
 * Array of Strings. The returned <code>results</code> is either a Fixnum or
 * an Array of Fixnums respectively.
 */
static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
{
    GET_STRUCT(General)
    return General_iterate_strings(amatch, strings, Levenshtein_match);
}
819
+
820
/*
 * call-seq: similar(strings) -> results
 *
 * Uses this Amatch::Levenshtein instance to match  Amatch::Levenshtein#pattern
 * against <code>strings</code>, and compute a Levenshtein distance metric
 * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
 * <code>strings</code> has to be either a String or an Array of Strings. The
 * returned <code>results</code> is either a Float or an Array of Floats
 * respectively.
 */
static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
{
    GET_STRUCT(General)
    return General_iterate_strings(amatch, strings, Levenshtein_similar);
}
835
+
836
+ /*
837
+ * call-seq: levenshtein_similar(strings) -> results
838
+ *
839
+ * If called on a String, this string is used as a Amatch::Levenshtein#pattern
840
+ * to match against <code>strings</code>. It returns a Levenshtein distance
841
+ * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
842
+ * match. <code>strings</code> has to be either a String or an Array of
843
+ * Strings. The returned <code>results</code> is either a Float or an Array of
844
+ * Floats respectively.
845
+ */
846
+ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
847
+ {
848
+ VALUE amatch = rb_Levenshtein_new(rb_cLevenshtein, self);
849
+ return rb_Levenshtein_similar(amatch, strings);
850
+ }
851
+
852
/*
 * call-seq: search(strings) -> results
 *
 * Searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
 * edit distance (the sum of character operations) as a Fixnum value, by
 * greedily trimming prefixes or postfixes of the match. <code>strings</code>
 * has to be either a String or an Array of Strings. The returned
 * <code>results</code> is either a Fixnum or an Array of Fixnums respectively.
 */
static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
{
    GET_STRUCT(General)
    return General_iterate_strings(amatch, strings, Levenshtein_search);
}
866
+
867
+ /*
868
+ * Document-class: Amatch::Sellers
869
+ *
870
 * The Sellers edit distance is very similar to the Levenshtein edit distance.
 * The difference is that you can also specify different weights for every
 * operation, to prefer special operations over others. This extension of the
 * Levenshtein edit distance is also known under the name Needleman-Wunsch
 * distance.
875
+ */
876
+
877
+ DEF_RB_FREE(Sellers, Sellers)
878
+
879
+ /*
880
+ * Document-method: substitution
881
+ *
882
+ * call-seq: substitution -> weight
883
+ *
884
+ * Returns the weight of the substitution operation, that is used to compute
885
+ * the Sellers distance.
886
+ */
887
+ DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
888
+ rb_float_new)
889
+
890
+ /*
891
+ * Document-method: deletion
892
+ *
893
+ * call-seq: deletion -> weight
894
+ *
895
+ * Returns the weight of the deletion operation, that is used to compute
896
+ * the Sellers distance.
897
+ */
898
+ DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
899
+ rb_float_new)
900
+
901
+ /*
902
+ * Document-method: insertion
903
+ *
904
+ * call-seq: insertion -> weight
905
+ *
906
+ * Returns the weight of the insertion operation, that is used to compute
907
+ * the Sellers distance.
908
+ */
909
+ DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
910
+ rb_float_new)
911
+
912
+ /*
913
+ * Document-method: substitution=
914
+ *
915
+ * call-seq: substitution=(weight)
916
+ *
917
+ * Sets the weight of the substitution operation, that is used to compute
918
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
919
+ * should be a Float value >= 0.0.
920
+ */
921
+ DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
922
+ double, CAST2FLOAT, FLOAT2C, >= 0)
923
+
924
+ /*
925
+ * Document-method: deletion=
926
+ *
927
+ * call-seq: deletion=(weight)
928
+ *
929
+ * Sets the weight of the deletion operation, that is used to compute
930
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
931
+ * should be a Float value >= 0.0.
932
+ */
933
+ DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
934
+ double, CAST2FLOAT, FLOAT2C, >= 0)
935
+
936
+ /*
937
+ * Document-method: insertion=
938
+ *
939
+ * call-seq: insertion=(weight)
940
+ *
941
+ * Sets the weight of the insertion operation, that is used to compute
942
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
943
+ * should be a Float value >= 0.0.
944
+ */
945
+ DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
946
+ double, CAST2FLOAT, FLOAT2C, >= 0)
947
+
948
+ /*
949
+ * Resets all weights (substitution, deletion, and insertion) to 1.0.
950
+ */
951
+ static VALUE rb_Sellers_reset_weights(VALUE self)
952
+ {
953
+ GET_STRUCT(Sellers)
954
+ Sellers_reset_weights(amatch);
955
+ return self;
956
+ }
957
+
958
+ /*
959
+ * call-seq: new(pattern)
960
+ *
961
+ * Creates a new Amatch::Sellers instance from <code>pattern</code>,
962
+ * with all weights initially set to 1.0.
963
+ */
964
+ static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern)
965
+ {
966
+ GET_STRUCT(Sellers)
967
+ Sellers_pattern_set(amatch, pattern);
968
+ Sellers_reset_weights(amatch);
969
+ return self;
970
+ }
971
+
972
+ DEF_CONSTRUCTOR(Sellers, Sellers)
973
+
974
+ /*
975
+ * Document-method: pattern
976
+ *
977
+ * call-seq: pattern -> pattern string
978
+ *
979
+ * Returns the current pattern string of this Amatch::Sellers instance.
980
+ */
981
+
982
+ /*
983
+ * Document-method: pattern=
984
+ *
985
+ * call-seq: pattern=(pattern)
986
+ *
987
+ * Sets the current pattern string of this Amatch::Sellers instance to
988
+ * <code>pattern</code>.
989
+ */
990
+
991
+ /*
992
+ * call-seq: match(strings) -> results
993
+ *
994
+ * Uses this Amatch::Sellers instance to match Sellers#pattern against
995
+ * <code>strings</code>, while taking into account the given weights. It
996
+ * returns the number of weighted character operations, the Sellers distance.
997
+ * <code>strings</code> has to be either a String or an Array of Strings. The
998
+ * returned <code>results</code> is either a Float or an Array of Floats
999
+ * respectively.
1000
+ */
1001
+ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
1002
+ {
1003
+ GET_STRUCT(Sellers)
1004
+ return Sellers_iterate_strings(amatch, strings, Sellers_match);
1005
+ }
1006
+
1007
+ /*
1008
+ * call-seq: similar(strings) -> results
1009
+ *
1010
+ * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
1011
+ * against <code>strings</code> (taking into account the given weights), and
1012
+ * compute a Sellers distance metric number between 0.0 for very unsimilar
1013
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1014
+ * String or an Array of Strings. The returned <code>results</code> is either
1015
+ * a Float or an Array of Floats
1016
+ * respectively.
1017
+ */
1018
+ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
1019
+ {
1020
+ GET_STRUCT(Sellers)
1021
+ return Sellers_iterate_strings(amatch, strings, Sellers_similar);
1022
+ }
1023
+
1024
+ /*
1025
+ * call-seq: search(strings) -> results
1026
+ *
1027
+ * searches Sellers#pattern in <code>strings</code> and returns the edit
1028
+ * distance (the sum of weighted character operations) as a Float value, by
1029
+ * greedy trimming prefixes or postfixes of the match. <code>strings</code> has
1030
+ * to be either a String or an Array of Strings. The returned
1031
+ * <code>results</code> is either a Float or an Array of Floats respectively.
1032
+ */
1033
+ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
1034
+ {
1035
+ GET_STRUCT(Sellers)
1036
+ return Sellers_iterate_strings(amatch, strings, Sellers_search);
1037
+ }
1038
+
1039
+ /*
1040
+ * Document-class: Amatch::PairDistance
1041
+ *
1042
+ * The pair distance between two strings is based on the number of adjacent
1043
+ * character pairs, that are contained in both strings. The similarity
1044
+ * metric of two strings s1 and s2 is
1045
+ * 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
1046
+ * If it is 1.0 the two strings are an exact match, if less than 1.0 they
1047
+ * are more dissimilar. The advantage of considering adjacent characters, is to
1048
+ * take account not only of the characters, but also of the character ordering
1049
+ * in the original strings.
1050
+ *
1051
+ * This metric is very capable to find similarities in natural languages.
1052
+ * It is explained in more detail in Simon White's article "How to Strike a
1053
+ * Match", located at this url:
1054
+ * http://www.catalysoft.com/articles/StrikeAMatch.html
1055
+ * It is also very similar (a special case) to the method described under
1056
+ * http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
1057
+ * for Approximate String Processing."
1058
+ */
1059
+ DEF_RB_FREE(PairDistance, PairDistance)
1060
+
1061
+ /*
1062
+ * call-seq: new(pattern)
1063
+ *
1064
+ * Creates a new Amatch::PairDistance instance from <code>pattern</code>.
1065
+ */
1066
+ static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
1067
+ {
1068
+ GET_STRUCT(PairDistance)
1069
+ PairDistance_pattern_set(amatch, pattern);
1070
+ return self;
1071
+ }
1072
+
1073
+ DEF_CONSTRUCTOR(PairDistance, PairDistance)
1074
+
1075
+ /*
1076
+ * call-seq: match(strings, regexp = /\s+/) -> results
1077
+ *
1078
+ * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
1079
+ * <code>strings</code>. It returns the pair distance measure, that is a
1080
+ * returned value of 1.0 is an exact match, partial matches are lower
1081
+ * values, while 0.0 means no match at all.
1082
+ *
1083
+ * <code>strings</code> has to be either a String or an
1084
+ * Array of Strings. The argument <code>regexp</code> is used to split the
1085
+ * pattern and strings into tokens first. It defaults to /\s+/. If the
1086
+ * splitting should be omitted, call the method with nil as <code>regexp</code>
1087
+ * explicitly.
1088
+ *
1089
+ * The returned <code>results</code> is either a Float or an Array of Floats
1090
+ * respectively. A TypeError is raised for any other argument or Array element.
1091
+ */
1092
+ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
1093
+ {
1094
+ VALUE result, strings, regexp = Qnil;
1095
+ int use_regexp;
1096
+ GET_STRUCT(PairDistance)
1097
+
1098
+ rb_scan_args(argc, argv, "11", &strings, &regexp);
1099
+ use_regexp = NIL_P(regexp) && argc != 2; /* set only when regexp was omitted, not when nil was passed explicitly */
1100
+ if (TYPE(strings) == T_STRING) {
1101
+ result = PairDistance_match(amatch, strings, regexp, use_regexp);
1102
+ } else {
1103
+ long i; /* RARRAY_LEN and rb_ary_entry work in long, so the index must too */
1104
+ Check_Type(strings, T_ARRAY);
1105
+ result = rb_ary_new2(RARRAY_LEN(strings));
1106
+ for (i = 0; i < RARRAY_LEN(strings); i++) {
1107
+ VALUE string = rb_ary_entry(strings, i);
1108
+ if (TYPE(string) != T_STRING) {
1109
+ rb_raise(rb_eTypeError,
1110
+ "array has to contain only strings (%s given)",
1111
+ NIL_P(string) ?
1112
+ "NilClass" :
1113
+ rb_class2name(CLASS_OF(string)));
1114
+ }
1115
+ rb_ary_push(result,
1116
+ PairDistance_match(amatch, string, regexp, use_regexp));
1117
+ }
1118
+ }
1119
+ pair_array_destroy(amatch->pattern_pair_array); /* discard the pattern's pair array; presumably rebuilt by PairDistance_match on the next call */
1120
+ amatch->pattern_pair_array = NULL;
1121
+ return result;
1122
+ }
1123
+
1124
+ /*
1125
+ * call-seq: pair_distance_similar(strings, regexp = nil) -> results
1126
+ *
1127
+ * If called on a String, this string is used as a Amatch::PairDistance#pattern
1128
+ * to match against <code>strings</code>, using /\s+/ as the tokenizing regular
1129
+ * expression unless a non-nil <code>regexp</code> is given. It returns a pair
1130
+ * distance metric number between 0.0 for very unsimilar strings and 1.0 for an
1131
+ * exact match. <code>strings</code> has to be either a String or an Array of Strings.
1132
+ *
1133
+ * The returned <code>results</code> is either a Float or an Array of Floats
1134
+ * respectively.
1135
+ */
1136
+ static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
1137
+ {
1138
+ VALUE amatch, string, regexp = Qnil;
1139
+ rb_scan_args(argc, argv, "11", &string, &regexp);
1140
+ amatch = rb_PairDistance_new(rb_cPairDistance, self);
1141
+ if (NIL_P(regexp)) {
1142
+ return rb_PairDistance_match(1, &string, amatch); /* one argument, so match applies its default regexp */
1143
+ } else {
1144
+ VALUE args[2]; /* room for two VALUEs; the former alloca(2) reserved only 2 bytes */
1145
+ args[0] = string;
1146
+ args[1] = regexp;
1147
+ return rb_PairDistance_match(2, args, amatch);
1148
+ }
1149
+ }
1150
+
1151
+ /*
1152
+ * Document-class: Amatch::Hamming
1153
+ *
1154
+ * This class computes the Hamming distance between two strings.
1155
+ *
1156
+ * The Hamming distance between two strings is the number of characters, that
1157
+ * are different. Thus a hamming distance of 0 means an exact
1158
+ * match, a hamming distance of 1 means one character is different, and so on.
1159
+ * If one string is longer than the other string, the missing characters are
1160
+ * counted as different characters.
1161
+ */
1162
+
1163
+ DEF_RB_FREE(Hamming, General)
1164
+
1165
+ /*
1166
+ * call-seq: new(pattern)
1167
+ *
1168
+ * Creates a new Amatch::Hamming instance from <code>pattern</code>.
1169
+ */
1170
+ static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
1171
+ {
1172
+ GET_STRUCT(General)
1173
+ General_pattern_set(amatch, pattern);
1174
+ return self;
1175
+ }
1176
+
1177
+ DEF_CONSTRUCTOR(Hamming, General)
1178
+
1179
+ /*
1180
+ * call-seq: match(strings) -> results
1181
+ *
1182
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1183
+ * <code>strings</code>, that is compute the hamming distance between
1184
+ * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
1185
+ * be either a String or an Array of Strings. The returned <code>results</code>
1186
+ * is either a Fixnum or an Array of Fixnums respectively.
1187
+ */
1188
+ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1189
+ {
1190
+ GET_STRUCT(General)
1191
+ return General_iterate_strings(amatch, strings, Hamming_match);
1192
+ }
1193
+
1194
+ /*
1195
+ * call-seq: similar(strings) -> results
1196
+ *
1197
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1198
+ * <code>strings</code>, and compute a Hamming distance metric number between
1199
+ * 0.0 for very unsimilar strings and 1.0 for an exact match.
1200
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1201
+ * returned <code>results</code> is either a Float or an Array of Floats
1202
+ * respectively.
1203
+ */
1204
+ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1205
+ {
1206
+ GET_STRUCT(General)
1207
+ return General_iterate_strings(amatch, strings, Hamming_similar);
1208
+ }
1209
+
1210
+ /*
1211
+ * call-seq: hamming_similar(strings) -> results
1212
+ *
1213
+ * If called on a String, this string is used as a Amatch::Hamming#pattern to
1214
+ * match against <code>strings</code>. It returns a Hamming distance metric
1215
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1216
+ * <code>strings</code>
1217
+ * has to be either a String or an Array of Strings. The returned
1218
+ * <code>results</code> is either a Float or an Array of Floats respectively.
1219
+ */
1220
+ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
1221
+ {
1222
+ VALUE amatch = rb_Hamming_new(rb_cHamming, self);
1223
+ return rb_Hamming_similar(amatch, strings);
1224
+ }
1225
+
1226
+
1227
+ /*
1228
+ * Document-class: Amatch::LongestSubsequence
1229
+ *
1230
+ * This class computes the length of the longest subsequence common to two
1231
+ * strings. A subsequence doesn't have to be contiguous. The longer the common
1232
+ * subsequence is, the more similar the two strings will be.
1233
+ *
1234
+ * The longest common subsequence between "test" and "test" is of length 4,
1235
+ * because "test" itself is this subsequence. The longest common subsequence
1236
+ * between "test" and "east" is "e", "s", "t" and the length of the
1237
+ * sequence is 3.
1238
+ */
1239
+ DEF_RB_FREE(LongestSubsequence, General)
1240
+
1241
+ /*
1242
+ * call-seq: new(pattern)
1243
+ *
1244
+ * Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
1245
+ */
1246
+ static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
1247
+ {
1248
+ GET_STRUCT(General)
1249
+ General_pattern_set(amatch, pattern);
1250
+ return self;
1251
+ }
1252
+
1253
+ DEF_CONSTRUCTOR(LongestSubsequence, General)
1254
+
1255
+ /*
1256
+ * call-seq: match(strings) -> results
1257
+ *
1258
+ * Uses this Amatch::LongestSubsequence instance to match
1259
+ * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1260
+ * length of the longest common subsequence. <code>strings</code> has to be
1261
+ * either a String or an Array of Strings. The returned <code>results</code>
1262
+ * is either a Fixnum or an Array of Fixnums respectively.
1263
+ */
1264
+ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1265
+ {
1266
+ GET_STRUCT(General)
1267
+ return General_iterate_strings(amatch, strings, LongestSubsequence_match);
1268
+ }
1269
+
1270
+ /*
1271
+ * call-seq: similar(strings) -> results
1272
+ *
1273
+ * Uses this Amatch::LongestSubsequence instance to match
1274
+ * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1275
+ * a longest subsequence distance metric number between 0.0 for very unsimilar
1276
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1277
+ * String or an Array of Strings. The returned <code>results</code> is either
1278
+ * a Float or an Array of Floats respectively.
1279
+ */
1280
+ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1281
+ {
1282
+ GET_STRUCT(General)
1283
+ return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
1284
+ }
1285
+
1286
+ /*
1287
+ * call-seq: longest_subsequence_similar(strings) -> results
1288
+ *
1289
+ * If called on a String, this string is used as a
1290
+ * Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
1291
+ * returns a longest subsequence distance metric number between 0.0 for very
1292
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1293
+ * either a String or an Array of Strings. The returned <code>results</code>
1294
+ * is either a Float or an Array of Floats respectively.
1295
+ */
1296
+ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1297
+ {
1298
+ VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
1299
+ return rb_LongestSubsequence_similar(amatch, strings);
1300
+ }
1301
+
1302
+ /*
1303
+ * Document-class: Amatch::LongestSubstring
1304
+ *
1305
+ * The longest common substring is the longest substring, that is part of
1306
+ * two strings. A substring is contiguous, while a subsequence need not
1307
+ * be. The longer the common substring is, the more similar the two strings
1308
+ * will be.
1309
+ *
1310
+ * The longest common substring between 'string' and 'string' is 'string'
1311
+ * again, thus the longest common substring length is 6. The longest common
1312
+ * substring between 'string' and 'storing' is 'ring', thus the longest common
1313
+ * substring length is 4.
1314
+ */
1315
+
1316
+ DEF_RB_FREE(LongestSubstring, General)
1317
+
1318
+ /*
1319
+ * call-seq: new(pattern)
1320
+ *
1321
+ * Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
1322
+ */
1323
+ static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
1324
+ {
1325
+ GET_STRUCT(General)
1326
+ General_pattern_set(amatch, pattern);
1327
+ return self;
1328
+ }
1329
+
1330
+ DEF_CONSTRUCTOR(LongestSubstring, General)
1331
+
1332
+ /*
1333
+ * call-seq: match(strings) -> results
1334
+ *
1335
+ * Uses this Amatch::LongestSubstring instance to match
1336
+ * LongestSubstring#pattern against <code>strings</code>, that is compute the
1337
+ * length of the longest common substring. <code>strings</code> has to be
1338
+ * either a String or an Array of Strings. The returned <code>results</code>
1339
+ * is either a Fixnum or an Array of Fixnums respectively.
1340
+ */
1341
+ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1342
+ {
1343
+ GET_STRUCT(General)
1344
+ return General_iterate_strings(amatch, strings, LongestSubstring_match);
1345
+ }
1346
+
1347
+ /*
1348
+ * call-seq: similar(strings) -> results
1349
+ *
1350
+ * Uses this Amatch::LongestSubstring instance to match
1351
+ * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1352
+ * longest substring distance metric number between 0.0 for very unsimilar
1353
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1354
+ * String or an Array of Strings. The returned <code>results</code> is either
1355
+ * a Float or an Array of Floats
1356
+ * respectively.
1357
+ */
1358
+ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
1359
+ {
1360
+ GET_STRUCT(General)
1361
+ return General_iterate_strings(amatch, strings, LongestSubstring_similar);
1362
+ }
1363
+
1364
+ /*
1365
+ * call-seq: longest_substring_similar(strings) -> results
1366
+ *
1367
+ * If called on a String, this string is used as a
1368
+ * Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
1369
+ * returns a longest substring distance metric number between 0.0 for very
1370
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1371
+ * either a String or an Array of Strings. The returned <code>results</code>
1372
+ * is either a Float or an Array of Floats respectively.
1373
+ */
1374
+ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1375
+ {
1376
+ VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
1377
+ return rb_LongestSubstring_similar(amatch, strings);
1378
+ }
1379
+
1380
+ /*
1381
+ * Document-class: Amatch::Jaro
1382
+ *
1383
+ * This class computes the Jaro metric for two strings.
1384
+ * The Jaro metric computes the similarity between 0 (no match)
1385
+ * and 1 (exact match) by looking for matching and transposed characters.
1386
+ */
1387
+ DEF_RB_FREE(Jaro, Jaro)
1388
+
1389
+ /*
1390
+ * Document-method: ignore_case
1391
+ *
1392
+ * call-seq: ignore_case -> true/false
1393
+ *
1394
+ * Returns whether case is ignored when computing matching characters.
1395
+ */
1396
+ DEF_RB_READER(Jaro, rb_Jaro_ignore_case, ignore_case, C2BOOL)
1397
+
1398
+ /*
1399
+ * Document-method: ignore_case=
1400
+ *
1401
+ * call-seq: ignore_case=(true/false)
1402
+ *
1403
+ * Sets whether case is ignored when computing matching characters.
1404
+ */
1405
+ DEF_RB_WRITER(Jaro, rb_Jaro_ignore_case_set, ignore_case,
1406
+ int, CAST2BOOL, BOOL2C, != Qundef)
1407
+
1408
+ /*
1409
+ * call-seq: new(pattern)
1410
+ *
1411
+ * Creates a new Amatch::Jaro instance from <code>pattern</code>, with ignore_case initially enabled.
1412
+ */
1413
+ static VALUE rb_Jaro_initialize(VALUE self, VALUE pattern)
1414
+ {
1415
+ GET_STRUCT(Jaro)
1416
+ Jaro_pattern_set(amatch, pattern);
1417
+ amatch->ignore_case = 1; /* ignoring case is the default; change it via ignore_case= */
1418
+ return self;
1419
+ }
1420
+
1421
+ DEF_CONSTRUCTOR(Jaro, Jaro)
1422
+
1423
+ /*
1424
+ * call-seq: match(strings) -> results
1425
+ *
1426
+ * Uses this Amatch::Jaro instance to match
1427
+ * Jaro#pattern against <code>strings</code>, that is compute the
1428
+ * jaro metric with the strings. <code>strings</code> has to be
1429
+ * either a String or an Array of Strings. The returned <code>results</code>
1430
+ * is either a Float or an Array of Floats respectively.
1431
+ */
1432
+ static VALUE rb_Jaro_match(VALUE self, VALUE strings)
1433
+ {
1434
+ GET_STRUCT(Jaro)
1435
+ return Jaro_iterate_strings(amatch, strings, Jaro_match);
1436
+ }
1437
+
1438
+ /*
1439
+ * call-seq: jaro_similar(strings) -> results
1440
+ *
1441
+ * If called on a String, this string is used as a
1442
+ * Amatch::Jaro#pattern to match against <code>strings</code>. It
1443
+ * returns a Jaro metric number between 0.0 for very
1444
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1445
+ * either a String or an Array of Strings. The returned <code>results</code>
1446
+ * is either a Float or an Array of Floats respectively.
1447
+ */
1448
+ static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
1449
+ {
1450
+ VALUE amatch = rb_Jaro_new(rb_cJaro, self);
1451
+ return rb_Jaro_match(amatch, strings);
1452
+ }
1453
+
1454
+ /*
1455
+ * Document-class: Amatch::JaroWinkler
1456
+ *
1457
+ * This class computes the Jaro-Winkler metric for two strings.
1458
+ * The Jaro-Winkler metric computes the similarity between 0 (no match)
1459
+ * and 1 (exact match) by looking for matching and transposed characters.
1460
+ *
1461
+ * It is a variant of the Jaro metric, with additional weighting towards
1462
+ * common prefixes.
1463
+ */
1464
+ DEF_RB_FREE(JaroWinkler, JaroWinkler)
1465
+
1466
+ /*
1467
+ * Document-method: ignore_case
1468
+ *
1469
+ * call-seq: ignore_case -> true/false
1470
+ *
1471
+ * Returns whether case is ignored when computing matching characters.
1472
+ * Default is true.
1473
+ */
1474
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_ignore_case, ignore_case, C2BOOL)
1475
+
1476
+ /*
1477
+ * Document-method: scaling_factor
1478
+ *
1479
+ * call-seq: scaling_factor -> weight
1480
+ *
1481
+ * The scaling factor is how much weight to give common prefixes.
1482
+ * Default is 0.1.
1483
+ */
1484
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_scaling_factor, scaling_factor, rb_float_new)
1485
+
1486
+ /*
1487
+ * Document-method: ignore_case=
1488
+ *
1489
+ * call-seq: ignore_case=(true/false)
1490
+ *
1491
+ * Sets whether case is ignored when computing matching characters.
1492
+ */
1493
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_ignore_case_set, ignore_case,
1494
+ int, CAST2BOOL, BOOL2C, != Qundef)
1495
+
1496
+ /*
1497
+ * Document-method: scaling_factor=
1498
+ *
1499
+ * call-seq: scaling_factor=(weight)
1500
+ *
1501
+ * Sets the weight to give common prefixes.
1502
+ */
1503
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_scaling_factor_set, scaling_factor,
1504
+ double, CAST2FLOAT, FLOAT2C, >= 0)
1505
+
1506
+ /*
1507
+ * call-seq: new(pattern)
1508
+ *
1509
+ * Creates a new Amatch::JaroWinkler instance from <code>pattern</code>, with ignore_case enabled and scaling_factor set to 0.1.
1510
+ */
1511
+ static VALUE rb_JaroWinkler_initialize(VALUE self, VALUE pattern)
1512
+ {
1513
+ GET_STRUCT(JaroWinkler)
1514
+ JaroWinkler_pattern_set(amatch, pattern);
1515
+ amatch->ignore_case = 1; /* ignoring case is the default; change it via ignore_case= */
1516
+ amatch->scaling_factor = 0.1; /* default weight given to common prefixes */
1517
+ return self;
1518
+ }
1519
+
1520
+ DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
1521
+
1522
+ /*
1523
+ * call-seq: match(strings) -> results
1524
+ *
1525
+ * Uses this Amatch::JaroWinkler instance to match
1526
+ * JaroWinkler#pattern against <code>strings</code>, that is compute the
1527
+ * Jaro-Winkler metric with the strings. <code>strings</code> has to be
1528
+ * either a String or an Array of Strings. The returned <code>results</code>
1529
+ * is either a Float or an Array of Floats respectively.
1530
+ */
1531
+ static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
1532
+ {
1533
+ GET_STRUCT(JaroWinkler)
1534
+ return JaroWinkler_iterate_strings(amatch, strings, JaroWinkler_match);
1535
+ }
1536
+
1537
+ /*
1538
+ * call-seq: jarowinkler_similar(strings) -> results
1539
+ *
1540
+ * If called on a String, this string is used as a
1541
+ * Amatch::JaroWinkler#pattern to match against <code>strings</code>. It
1542
+ * returns a Jaro-Winkler metric number between 0.0 for very
1543
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1544
+ * either a String or an Array of Strings. The returned <code>results</code>
1545
+ * is either a Float or an Array of Floats respectively.
1546
+ */
1547
+ static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
1548
+ {
1549
+ VALUE amatch = rb_JaroWinkler_new(rb_cJaroWinkler, self); /* temporary matcher must be an Amatch::JaroWinkler, not an Amatch::Jaro */
1550
+ return rb_JaroWinkler_match(amatch, strings);
1551
+ }
1552
+
1553
+ /*
1554
+ * This is the namespace module that includes all other classes, modules, and
1555
+ * constants.
1556
+ */
1557
+
1558
+ void Init_amatch_ext()
1559
+ {
1560
+ rb_require("amatch/version");
1561
+ rb_mAmatch = rb_define_module("Amatch");
1562
+ /* This module can be mixed into ::String or its subclasses to mix in the similarity methods directly. */
1563
+ rb_mAmatchStringMethods = rb_define_module_under(rb_mAmatch, "StringMethods");
1564
+
1565
+ /* Levenshtein */
1566
+ rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
1567
+ rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
1568
+ rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
1569
+ rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0);
1570
+ rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
1571
+ rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
1572
+ rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
1573
+ rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1574
+ rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1575
+
1576
+ /* Sellers */
1577
+ rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1578
+ rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
1579
+ rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
1580
+ rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0);
1581
+ rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
1582
+ rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
1583
+ rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
1584
+ rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
1585
+ rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
1586
+ rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
1587
+ rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
1588
+ rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
1589
+ rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
1590
+ rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
1591
+ rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1);
1592
+
1593
+ /* Hamming */
1594
+ rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
1595
+ rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
1596
+ rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
1597
+ rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
1598
+ rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
1599
+ rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
1600
+ rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
1601
+ rb_define_method(rb_mAmatchStringMethods, "hamming_similar", rb_str_hamming_similar, 1);
1602
+
1603
+ /* Pair Distance Metric / Dice Coefficient */
1604
+ rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
1605
+ rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
1606
+ rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
1607
+ rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
1608
+ rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
1609
+ rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
1610
+ rb_define_alias(rb_cPairDistance, "similar", "match");
1611
+ rb_define_method(rb_mAmatchStringMethods, "pair_distance_similar", rb_str_pair_distance_similar, -1);
1612
+
1613
+ /* Longest Common Subsequence */
1614
+ rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
1615
+ rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
1616
+ rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
1617
+ rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
1618
+ rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
1619
+ rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
1620
+ rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
1621
+ rb_define_method(rb_mAmatchStringMethods, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1622
+
1623
+ /* Longest Common Substring */
1624
+ rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
1625
+ rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
1626
+ rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
1627
+ rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
1628
+ rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
1629
+ rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
1630
+ rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1631
+ rb_define_method(rb_mAmatchStringMethods, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1632
+
1633
+ /* Jaro */
1634
+ rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
1635
+ rb_define_alloc_func(rb_cJaro, rb_Jaro_s_allocate);
1636
+ rb_define_method(rb_cJaro, "initialize", rb_Jaro_initialize, 1);
1637
+ rb_define_method(rb_cJaro, "pattern", rb_Jaro_pattern, 0);
1638
+ rb_define_method(rb_cJaro, "pattern=", rb_Jaro_pattern_set, 1);
1639
+ rb_define_method(rb_cJaro, "ignore_case", rb_Jaro_ignore_case, 0);
1640
+ rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
1641
+ rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
1642
+ rb_define_alias(rb_cJaro, "similar", "match");
1643
+ rb_define_method(rb_mAmatchStringMethods, "jaro_similar", rb_str_jaro_similar, 1);
1644
+
1645
+ /* Jaro-Winkler */
1646
+ rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
1647
+ rb_define_alloc_func(rb_cJaroWinkler, rb_JaroWinkler_s_allocate);
1648
+ rb_define_method(rb_cJaroWinkler, "initialize", rb_JaroWinkler_initialize, 1);
1649
+ rb_define_method(rb_cJaroWinkler, "pattern", rb_JaroWinkler_pattern, 0);
1650
+ rb_define_method(rb_cJaroWinkler, "pattern=", rb_JaroWinkler_pattern_set, 1);
1651
+ rb_define_method(rb_cJaroWinkler, "ignore_case", rb_JaroWinkler_ignore_case, 0);
1652
+ rb_define_method(rb_cJaroWinkler, "ignore_case=", rb_JaroWinkler_ignore_case_set, 1);
1653
+ rb_define_method(rb_cJaroWinkler, "scaling_factor", rb_JaroWinkler_scaling_factor, 0);
1654
+ rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
1655
+ rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
1656
+ rb_define_alias(rb_cJaroWinkler, "similar", "match");
1657
+ rb_define_method(rb_mAmatchStringMethods, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
1658
+
1659
+ id_split = rb_intern("split");
1660
+ id_to_f = rb_intern("to_f");
1661
+ }