amatch 0.2.5-x86-mswin32

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,141 @@
1
+ # vim: set filetype=ruby et sw=2 ts=2:
2
+
3
# Optional packaging helpers. The gem and cross-compilation tasks further down
# are only defined when both libraries could be loaded.
begin
  require 'rake/gempackagetask'
  require 'rake/extensiontask'
rescue LoadError
  # Not available: the packaging tasks are skipped below.
end

require 'rbconfig'
# Newer Rubies no longer provide the top-level Config alias; RbConfig exposes
# the same CONFIG hash. Config is only used as a fallback on old interpreters.
include(defined?(RbConfig) ? RbConfig : Config)

require 'rake/clean'
CLEAN.include 'coverage', 'doc'
require 'rake/testtask'

# First make program that answers to "-v", unless overridden via ENV['MAKE'].
MAKE          = ENV['MAKE'] || %w[gmake make].find { |c| system(c, '-v') }
PKG_NAME      = 'amatch'
PKG_VERSION   = File.read('VERSION').chomp
PKG_FILES     = FileList["**/*"].exclude(/^(pkg|coverage|doc|tmp)/)
PKG_DOC_FILES = [ "ext/amatch.c" ].concat(Dir['lib/**/*.rb']) << 'README'

desc "Run unit tests"
task :test => :compile_ext do
  sh %{testrb -Iext:lib tests/test_*.rb}
end

desc "Compiling library"
task :compile_ext do
  cd 'ext' do
    ruby %{extconf.rb}
    sh MAKE
  end
end

desc "Installing library"
task :install => :test do
  # Take the first ext/amatch.* file that is neither C source nor object file.
  src, = Dir['ext/amatch.*'].reject { |x| /\.[co]$/.match x }
  # Fail with a readable message instead of a TypeError from File.basename(nil).
  src or raise "no compiled amatch extension found in ext/"
  filename = File.basename(src)
  dst = File.join(CONFIG["sitelibdir"], filename)
  install(src, dst, :verbose => true)
end

desc "Removing generated files"
task :clean do
  cd 'ext' do
    # NOTE(review): extconf.rb is presumably rerun so that a Makefile with a
    # distclean target exists -- confirm before simplifying.
    ruby 'extconf.rb'
    sh "#{MAKE} distclean" if File.exist?('Makefile')
  end
end

desc "Build the documentation"
task :doc do
  sh "rdoc -m README -t '#{PKG_NAME} - Approximate Matching' #{PKG_DOC_FILES * ' '}"
end

if defined?(Gem) and defined?(Rake::GemPackageTask) and
  defined?(Rake::ExtensionTask)
then
  # Source text of the gem specification. It is both written to
  # amatch.gemspec (task :gemspec) and evaluated below. The description
  # heredoc and its EOF terminator must stay at column 0 because the nested
  # <<EOF is not an indented heredoc.
  spec_src = <<-GEM
    Gem::Specification.new do |s|
      s.name = '#{PKG_NAME}'
      s.version = '#{PKG_VERSION}'
      s.summary = "Approximate String Matching library"
      s.description = <<EOF
Amatch is a library for approximate string matching and searching in strings.
Several algorithms can be used to do this, and it's also possible to compute a
similarity metric number between 0.0 and 1.0 for two given strings.
EOF

      s.files = #{PKG_FILES.sort.inspect}

      s.extensions << "ext/extconf.rb"

      s.require_paths << 'ext' << 'lib'

      s.bindir = "bin"
      s.executables = ["agrep.rb"]
      s.default_executable = "agrep.rb"

      s.has_rdoc = true
      s.extra_rdoc_files.concat #{PKG_DOC_FILES.sort.inspect}
      s.rdoc_options << '--main' << 'README' <<
        '--title' << "#{PKG_NAME} - Approximate Matching"
      s.test_files.concat Dir['tests/test_*.rb']

      s.author = "Florian Frank"
      s.email = "flori@ping.de"
      s.homepage = "http://amatch.rubyforge.org"
      s.rubyforge_project = '#{PKG_NAME}'
    end
  GEM

  desc 'Create a gemspec file'
  task :gemspec do
    File.open("#{PKG_NAME}.gemspec", 'w') do |f|
      f.puts spec_src
    end
  end

  # spec_src is generated in this file, not read from external input.
  spec = eval(spec_src)
  Rake::GemPackageTask.new(spec) do |pkg|
    pkg.need_tar = true
    pkg.package_files = PKG_FILES
  end

  Rake::ExtensionTask.new do |ext|
    ext.name = PKG_NAME
    ext.gem_spec = spec
    ext.cross_compile = true
    ext.cross_platform = 'i386-mswin32'
    ext.ext_dir = 'ext'
    ext.lib_dir = 'lib'
  end
end

desc m = "Writing version information for #{PKG_VERSION}"
task :version do
  puts m
  File.open(File.join('lib', 'amatch', 'version.rb'), 'w') do |v|
    v.puts <<EOT
module Amatch
  # Amatch version
  VERSION = '#{PKG_VERSION}'
  VERSION_ARRAY = VERSION.split(/\\./).map { |x| x.to_i } # :nodoc:
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
  VERSION_BUILD = VERSION_ARRAY[2] # :nodoc:
end
EOT
  end
end


desc "Default task"
task :default => [ :version, :gemspec, :test ]

desc "Build all gems and archives for a new release."
task :release => [ :clean, :version, :gemspec, :cross, :native, :gem ] do
  # Re-invoke this rake program for the native gem and the source packages.
  system "#$0 clean native gem"
  system "#$0 clean package"
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.5
@@ -0,0 +1,31 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'amatch'
3
+ s.version = '0.2.5'
4
+ s.summary = "Approximate String Matching library"
5
+ s.description = <<EOF
6
+ Amatch is a library for approximate string matching and searching in strings.
7
+ Several algorithms can be used to do this, and it's also possible to compute a
8
+ similarity metric number between 0.0 and 1.0 for two given strings.
9
+ EOF
10
+
11
+ s.files = ["CHANGES", "COPYING", "README", "Rakefile", "VERSION", "amatch.gemspec", "bin", "bin/agrep.rb", "ext", "ext/amatch.c", "ext/common.h", "ext/extconf.rb", "ext/pair.c", "ext/pair.h", "install.rb", "lib", "lib/amatch", "lib/amatch/version.rb", "tests", "tests/test_hamming.rb", "tests/test_jaro.rb", "tests/test_jaro_winkler.rb", "tests/test_levenshtein.rb", "tests/test_longest_subsequence.rb", "tests/test_longest_substring.rb", "tests/test_pair_distance.rb", "tests/test_sellers.rb"]
12
+
13
+ s.extensions << "ext/extconf.rb"
14
+
15
+ s.require_paths << 'ext' << 'lib'
16
+
17
+ s.bindir = "bin"
18
+ s.executables = ["agrep.rb"]
19
+ s.default_executable = "agrep.rb"
20
+
21
+ s.has_rdoc = true
22
+ s.extra_rdoc_files.concat ["README", "ext/amatch.c", "lib/amatch/version.rb"]
23
+ s.rdoc_options << '--main' << 'README' <<
24
+ '--title' << "amatch - Approximate Matching"
25
+ s.test_files.concat Dir['tests/test_*.rb']
26
+
27
+ s.author = "Florian Frank"
28
+ s.email = "flori@ping.de"
29
+ s.homepage = "http://amatch.rubyforge.org"
30
+ s.rubyforge_project = 'amatch'
31
+ end
@@ -0,0 +1,79 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'amatch'
4
+ require 'getoptlong'
5
+
6
# Prints +msg+, a usage summary built from the option table +options+ and the
# bug report address, then terminates the process.
#
# msg::     headline printed before the usage line
# options:: array of [long_name, short_name, argument_flag] triples
# status::  exit status of the process; defaults to 0 as before
def usage(msg, options, status = 0)
  puts msg, "Usage: #{File.basename($0)} [OPTIONS] PATTERN [FILE ...]", ""
  options.each do |o|
    puts " " + o[1] + ", " + o[0] + " " +
      (o[2] == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : '')
  end
  puts "\nReport bugs to <flori@ping.de>."
  exit status
end

class Amatch::Levenshtein
  # Like #search, but the distance is divided by the pattern length.
  def search_relative(strings)
    search(strings).to_f / pattern.size
  end
end

$distance = 1
$mode     = :search
$verbose  = false # set explicitly so the final check never reads an unset global

options = [
  [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
  [ '--verbose',  '-v', GetoptLong::NO_ARGUMENT ],
  [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
]

begin
  parser = GetoptLong.new
  parser.set_options(*options)
  parser.each_option do |name, arg|
    case name.sub(/^--/, '')
    when 'distance'
      $distance = arg.to_f
    when 'relative'
      $mode = :search_relative
    when 'verbose'
      $verbose = true
    when 'help'
      usage('You\'ve asked for it!', options)
    end
  end
rescue GetoptLong::Error
  # Only option parsing errors end the program here with status 1.
  exit 1
end

# A missing pattern is an error, so report it with a non-zero exit status.
pattern = ARGV.shift or usage('Pattern needed!', options, 1)

matcher = Amatch::Levenshtein.new(pattern)
size    = 0
start   = Time.new
if ARGV.size > 0
  ARGV.each do |filename|
    begin
      # The stat call sits inside the rescue so that a missing or unreadable
      # path is reported and skipped like any other per-file failure.
      next unless File.stat(filename).file?
      size += File.size(filename)
      # File.foreach closes the file when the block finishes or raises.
      File.foreach(filename) do |line|
        if matcher.__send__($mode, line) <= $distance
          puts "#{filename}:#{line}"
        end
      end
    rescue => e
      STDERR.puts "Failure at #{filename}: #{e} => Skipping!"
    end
  end
else
  STDIN.each_line do |line|
    size += line.size
    if matcher.__send__($mode, line) <= $distance
      puts line
    end
  end
end
time = Time.new - start
$verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
  time, size / time / 1024
exit 0
@@ -0,0 +1,1641 @@
1
+ #include "ruby.h"
2
+ #include "pair.h"
3
+ #include <ctype.h>
4
+ #include "common.h"
5
+
6
+ /*
7
+ * Document-method: pattern
8
+ *
9
+ * call-seq: pattern -> pattern string
10
+ *
11
+ * Returns the current pattern string of this instance.
12
+ */
13
+
14
+ /*
15
+ * Document-method: pattern=
16
+ *
17
+ * call-seq: pattern=(pattern)
18
+ *
19
+ * Sets the current pattern string of this instance to <code>pattern</code>.
20
+ */
21
+
22
+
23
+ static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
24
+ rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
25
+ rb_cJaro, rb_cJaroWinkler;
26
+
27
+ static ID id_split, id_to_f;
28
+
29
+ #define GET_STRUCT(klass) \
30
+ klass *amatch; \
31
+ Data_Get_Struct(self, klass, amatch);
32
+
33
+ #define DEF_ALLOCATOR(type) \
34
+ static type *type##_allocate() \
35
+ { \
36
+ type *obj = ALLOC(type); \
37
+ MEMZERO(obj, type, 1); \
38
+ return obj; \
39
+ }
40
+
41
+ #define DEF_CONSTRUCTOR(klass, type) \
42
+ static VALUE rb_##klass##_s_allocate(VALUE klass2) \
43
+ { \
44
+ type *amatch = type##_allocate(); \
45
+ return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch); \
46
+ } \
47
+ VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
48
+ { \
49
+ VALUE obj = rb_##klass##_s_allocate(klass2); \
50
+ rb_##klass##_initialize(obj, pattern); \
51
+ return obj; \
52
+ }
53
+
54
/*
 * DEF_RB_FREE(klass, type) defines rb_<klass>_free(), the free function that
 * DEF_CONSTRUCTOR passes to Data_Wrap_Struct for objects backed by a <type>
 * struct. It wipes and releases the pattern buffer and then the struct.
 *
 * Both were obtained through ALLOC/ALLOC_N, so they are released with xfree
 * rather than the C library's free.
 */
#define DEF_RB_FREE(klass, type)                                    \
static void rb_##klass##_free(type *amatch)                         \
{                                                                   \
    /* pattern is still NULL if no pattern was ever assigned */     \
    if (amatch->pattern) {                                          \
        MEMZERO(amatch->pattern, char, amatch->pattern_len);        \
        xfree(amatch->pattern);                                     \
    }                                                               \
    MEMZERO(amatch, type, 1);                                       \
    xfree(amatch);                                                  \
}
62
+
63
/*
 * DEF_PATTERN_ACCESSOR(type) defines three functions for a <type> struct:
 *
 *   <type>_pattern_set(amatch, pattern)   - stores a private copy of the Ruby
 *                                           String pattern in the struct
 *                                           (raises TypeError for non-Strings)
 *   rb_<type>_pattern(self)               - returns the pattern as a new String
 *   rb_<type>_pattern_set(self, pattern)  - Ruby-level setter, returns nil
 */
#define DEF_PATTERN_ACCESSOR(type)                                      \
static void type##_pattern_set(type *amatch, VALUE pattern)             \
{                                                                       \
    char *copy;                                                         \
    Check_Type(pattern, T_STRING);                                      \
    /* Copy first: if ALLOC_N raises, the old pattern stays valid. */   \
    copy = ALLOC_N(char, RSTRING_LEN(pattern));                         \
    MEMCPY(copy, RSTRING_PTR(pattern), char, RSTRING_LEN(pattern));     \
    /* The old buffer came from ALLOC_N, so release it with xfree. */   \
    if (amatch->pattern) xfree(amatch->pattern);                        \
    amatch->pattern = copy;                                             \
    amatch->pattern_len = RSTRING_LEN(pattern);                         \
}                                                                       \
static VALUE rb_##type##_pattern(VALUE self)                            \
{                                                                       \
    GET_STRUCT(type)                                                    \
    return rb_str_new(amatch->pattern, amatch->pattern_len);            \
}                                                                       \
static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern)         \
{                                                                       \
    GET_STRUCT(type)                                                    \
    type##_pattern_set(amatch, pattern);                                \
    return Qnil;                                                        \
}
84
+
85
+ #define DEF_ITERATE_STRINGS(type) \
86
+ static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
87
+ VALUE (*match_function) (type *amatch, VALUE strings)) \
88
+ { \
89
+ if (TYPE(strings) == T_STRING) { \
90
+ return match_function(amatch, strings); \
91
+ } else { \
92
+ Check_Type(strings, T_ARRAY); \
93
+ int i; \
94
+ VALUE result = rb_ary_new2(RARRAY_LEN(strings)); \
95
+ for (i = 0; i < RARRAY_LEN(strings); i++) { \
96
+ VALUE string = rb_ary_entry(strings, i); \
97
+ if (TYPE(string) != T_STRING) { \
98
+ rb_raise(rb_eTypeError, \
99
+ "array has to contain only strings (%s given)", \
100
+ NIL_P(string) ? \
101
+ "NilClass" : \
102
+ rb_class2name(CLASS_OF(string))); \
103
+ } \
104
+ rb_ary_push(result, match_function(amatch, string)); \
105
+ } \
106
+ return result; \
107
+ } \
108
+ }
109
+
110
+ #define DEF_RB_READER(type, function, name, converter) \
111
+ VALUE function(VALUE self) \
112
+ { \
113
+ GET_STRUCT(type) \
114
+ return converter(amatch->name); \
115
+ }
116
+
117
+ #define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
118
+ VALUE function(VALUE self, VALUE value) \
119
+ { \
120
+ vtype value_ ## vtype; \
121
+ GET_STRUCT(type) \
122
+ caster(value); \
123
+ value_ ## vtype = converter(value); \
124
+ if (!(value_ ## vtype check)) \
125
+ rb_raise(rb_eTypeError, "check of value " #check " failed"); \
126
+ amatch->name = value_ ## vtype; \
127
+ return Qnil; \
128
+ }
129
+
130
+
131
+ #define CAST2FLOAT(obj) \
132
+ if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f)) \
133
+ obj = rb_funcall(obj, id_to_f, 0, 0); \
134
+ else \
135
+ Check_Type(obj, T_FLOAT)
136
+ #define FLOAT2C(obj) (RFLOAT_VALUE(obj))
137
+
138
+ #define CAST2BOOL(obj) \
139
+ if (obj == Qfalse || obj == Qnil) \
140
+ obj = Qfalse; \
141
+ else \
142
+ obj = Qtrue;
143
+ #define BOOL2C(obj) (obj == Qtrue)
144
+ #define C2BOOL(obj) (obj ? Qtrue : Qfalse)
145
+
146
+ #define OPTIMIZE_TIME \
147
+ if (amatch->pattern_len < RSTRING_LEN(string)) { \
148
+ a_ptr = amatch->pattern; \
149
+ a_len = amatch->pattern_len; \
150
+ b_ptr = RSTRING_PTR(string); \
151
+ b_len = RSTRING_LEN(string); \
152
+ } else { \
153
+ a_ptr = RSTRING_PTR(string); \
154
+ a_len = RSTRING_LEN(string); \
155
+ b_ptr = amatch->pattern; \
156
+ b_len = amatch->pattern_len; \
157
+ }
158
+
159
+ #define DONT_OPTIMIZE \
160
+ a_ptr = amatch->pattern; \
161
+ a_len = amatch->pattern_len; \
162
+ b_ptr = RSTRING_PTR(string); \
163
+ b_len = RSTRING_LEN(string); \
164
+
165
+ /*
166
+ * C structures of the Amatch classes
167
+ */
168
+
169
+ typedef struct GeneralStruct {
170
+ char *pattern;
171
+ int pattern_len;
172
+ } General;
173
+
174
+ DEF_ALLOCATOR(General)
175
+ DEF_PATTERN_ACCESSOR(General)
176
+ DEF_ITERATE_STRINGS(General)
177
+
178
+ typedef struct SellersStruct {
179
+ char *pattern;
180
+ int pattern_len;
181
+ double substitution;
182
+ double deletion;
183
+ double insertion;
184
+ } Sellers;
185
+
186
+ DEF_ALLOCATOR(Sellers)
187
+ DEF_PATTERN_ACCESSOR(Sellers)
188
+ DEF_ITERATE_STRINGS(Sellers)
189
+
190
+ static void Sellers_reset_weights(Sellers *self)
191
+ {
192
+ self->substitution = 1.0;
193
+ self->deletion = 1.0;
194
+ self->insertion = 1.0;
195
+ }
196
+
197
+ typedef struct PairDistanceStruct {
198
+ char *pattern;
199
+ int pattern_len;
200
+ PairArray *pattern_pair_array;
201
+ } PairDistance;
202
+
203
+ DEF_ALLOCATOR(PairDistance)
204
+ DEF_PATTERN_ACCESSOR(PairDistance)
205
+
206
+ typedef struct JaroStruct {
207
+ char *pattern;
208
+ int pattern_len;
209
+ int ignore_case;
210
+ } Jaro;
211
+
212
+ DEF_ALLOCATOR(Jaro)
213
+ DEF_PATTERN_ACCESSOR(Jaro)
214
+ DEF_ITERATE_STRINGS(Jaro)
215
+
216
+ typedef struct JaroWinklerStruct {
217
+ char *pattern;
218
+ int pattern_len;
219
+ int ignore_case;
220
+ float scaling_factor;
221
+ } JaroWinkler;
222
+
223
+ DEF_ALLOCATOR(JaroWinkler)
224
+ DEF_PATTERN_ACCESSOR(JaroWinkler)
225
+ DEF_ITERATE_STRINGS(JaroWinkler)
226
+
227
+ /*
228
+ * Levenshtein edit distances are computed here:
229
+ */
230
+
231
+ #define COMPUTE_LEVENSHTEIN_DISTANCE \
232
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
233
+ c = i % 2; /* current row */ \
234
+ p = (i + 1) % 2; /* previous row */ \
235
+ v[c][0] = i; /* first column */ \
236
+ for (j = 1; j <= b_len; j++) { \
237
+ /* Bellman's principle of optimality: */ \
238
+ weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
239
+ if (weight > v[p][j] + 1) { \
240
+ weight = v[p][j] + 1; \
241
+ } \
242
+ if (weight > v[c][j - 1] + 1) { \
243
+ weight = v[c][j - 1] + 1; \
244
+ } \
245
+ v[c][j] = weight; \
246
+ } \
247
+ p = c; \
248
+ c = (c + 1) % 2; \
249
+ }
250
+
251
+ static VALUE Levenshtein_match(General *amatch, VALUE string)
252
+ {
253
+ VALUE result;
254
+ char *a_ptr, *b_ptr;
255
+ int a_len, b_len;
256
+ int *v[2], weight;
257
+ int i, j, c, p;
258
+
259
+ Check_Type(string, T_STRING);
260
+ DONT_OPTIMIZE
261
+
262
+ v[0] = ALLOC_N(int, b_len + 1);
263
+ v[1] = ALLOC_N(int, b_len + 1);
264
+ for (i = 0; i <= b_len; i++) {
265
+ v[0][i] = i;
266
+ v[1][i] = i;
267
+ }
268
+
269
+ COMPUTE_LEVENSHTEIN_DISTANCE
270
+
271
+ result = INT2FIX(v[p][b_len]);
272
+
273
+ free(v[0]);
274
+ free(v[1]);
275
+
276
+ return result;
277
+ }
278
+
279
+ static VALUE Levenshtein_similar(General *amatch, VALUE string)
280
+ {
281
+ VALUE result;
282
+ char *a_ptr, *b_ptr;
283
+ int a_len, b_len;
284
+ int *v[2], weight;
285
+ int i, j, c, p;
286
+
287
+ Check_Type(string, T_STRING);
288
+ DONT_OPTIMIZE
289
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
290
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
291
+ v[0] = ALLOC_N(int, b_len + 1);
292
+ v[1] = ALLOC_N(int, b_len + 1);
293
+ for (i = 0; i <= b_len; i++) {
294
+ v[0][i] = i;
295
+ v[1][i] = i;
296
+ }
297
+
298
+ COMPUTE_LEVENSHTEIN_DISTANCE
299
+
300
+ if (b_len > a_len) {
301
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
302
+ } else {
303
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
304
+ }
305
+ free(v[0]);
306
+ free(v[1]);
307
+ return result;
308
+ }
309
+
310
+ static VALUE Levenshtein_search(General *amatch, VALUE string)
311
+ {
312
+ VALUE result;
313
+ char *a_ptr, *b_ptr;
314
+ int a_len, b_len;
315
+ int *v[2], weight, min;
316
+ int i, j, c, p;
317
+
318
+ Check_Type(string, T_STRING);
319
+ DONT_OPTIMIZE
320
+
321
+ v[0] = ALLOC_N(int, b_len + 1);
322
+ v[1] = ALLOC_N(int, b_len + 1);
323
+ MEMZERO(v[0], int, b_len + 1);
324
+ MEMZERO(v[1], int, b_len + 1);
325
+
326
+ COMPUTE_LEVENSHTEIN_DISTANCE
327
+
328
+ for (i = 0, min = a_len; i <= b_len; i++) {
329
+ if (v[p][i] < min) min = v[p][i];
330
+ }
331
+
332
+ result = INT2FIX(min);
333
+
334
+ free(v[0]);
335
+ free(v[1]);
336
+
337
+ return result;
338
+ }
339
+
340
+
341
+ /*
342
+ * Sellers edit distances are computed here:
343
+ */
344
+
345
+ #define COMPUTE_SELLERS_DISTANCE \
346
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
347
+ c = i % 2; /* current row */ \
348
+ p = (i + 1) % 2; /* previous row */ \
349
+ v[c][0] = i * amatch->deletion; /* first column */ \
350
+ for (j = 1; j <= b_len; j++) { \
351
+ /* Bellman's principle of optimality: */ \
352
+ weight = v[p][j - 1] + \
353
+ (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution); \
354
+ if (weight > v[p][j] + amatch->insertion) { \
355
+ weight = v[p][j] + amatch->insertion; \
356
+ } \
357
+ if (weight > v[c][j - 1] + amatch->deletion) { \
358
+ weight = v[c][j - 1] + amatch->deletion; \
359
+ } \
360
+ v[c][j] = weight; \
361
+ } \
362
+ p = c; \
363
+ c = (c + 1) % 2; \
364
+ }
365
+
366
+ static VALUE Sellers_match(Sellers *amatch, VALUE string)
367
+ {
368
+ VALUE result;
369
+ char *a_ptr, *b_ptr;
370
+ int a_len, b_len;
371
+ double *v[2], weight;
372
+ int i, j, c, p;
373
+
374
+ Check_Type(string, T_STRING);
375
+ DONT_OPTIMIZE
376
+
377
+ v[0] = ALLOC_N(double, b_len + 1);
378
+ v[1] = ALLOC_N(double, b_len + 1);
379
+ for (i = 0; i <= b_len; i++) {
380
+ v[0][i] = i * amatch->deletion;
381
+ v[1][i] = i * amatch->deletion;
382
+ }
383
+
384
+ COMPUTE_SELLERS_DISTANCE
385
+
386
+ result = rb_float_new(v[p][b_len]);
387
+ free(v[0]);
388
+ free(v[1]);
389
+ return result;
390
+ }
391
+
392
+ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
393
+ {
394
+ VALUE result;
395
+ char *a_ptr, *b_ptr;
396
+ int a_len, b_len;
397
+ double *v[2], weight, max_weight;
398
+ int i, j, c, p;
399
+
400
+ if (amatch->insertion >= amatch->deletion) {
401
+ if (amatch->substitution >= amatch->insertion) {
402
+ max_weight = amatch->substitution;
403
+ } else {
404
+ max_weight = amatch->insertion;
405
+ }
406
+ } else {
407
+ if (amatch->substitution >= amatch->deletion) {
408
+ max_weight = amatch->substitution;
409
+ } else {
410
+ max_weight = amatch->deletion;
411
+ }
412
+ }
413
+
414
+ Check_Type(string, T_STRING);
415
+ DONT_OPTIMIZE
416
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
417
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
418
+ v[0] = ALLOC_N(double, b_len + 1);
419
+ v[1] = ALLOC_N(double, b_len + 1);
420
+ for (i = 0; i <= b_len; i++) {
421
+ v[0][i] = i * amatch->deletion;
422
+ v[1][i] = i * amatch->deletion;
423
+ }
424
+
425
+ COMPUTE_SELLERS_DISTANCE
426
+
427
+ if (b_len > a_len) {
428
+ result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
429
+ } else {
430
+ result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
431
+ }
432
+ free(v[0]);
433
+ free(v[1]);
434
+ return result;
435
+ }
436
+
437
+ static VALUE Sellers_search(Sellers *amatch, VALUE string)
438
+ {
439
+ VALUE result;
440
+ char *a_ptr, *b_ptr;
441
+ int a_len, b_len;
442
+ double *v[2], weight, min;
443
+ int i, j, c, p;
444
+
445
+ Check_Type(string, T_STRING);
446
+ DONT_OPTIMIZE
447
+
448
+ v[0] = ALLOC_N(double, b_len + 1);
449
+ v[1] = ALLOC_N(double, b_len + 1);
450
+ MEMZERO(v[0], double, b_len + 1);
451
+ MEMZERO(v[1], double, b_len + 1);
452
+
453
+ COMPUTE_SELLERS_DISTANCE
454
+
455
+ for (i = 0, min = a_len; i <= b_len; i++) {
456
+ if (v[p][i] < min) min = v[p][i];
457
+ }
458
+ result = rb_float_new(min);
459
+ free(v[0]);
460
+ free(v[1]);
461
+
462
+ return result;
463
+ }
464
+
465
+ /*
466
+ * Pair distances are computed here:
467
+ */
468
+
469
+ static VALUE PairDistance_match(
470
+ PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
471
+ {
472
+ double result;
473
+ VALUE tokens;
474
+ PairArray *pair_array;
475
+
476
+ Check_Type(string, T_STRING);
477
+ if (!NIL_P(regexp) || use_regexp) {
478
+ tokens = rb_funcall(
479
+ rb_str_new(amatch->pattern, amatch->pattern_len),
480
+ id_split, 1, regexp
481
+ );
482
+ if (!amatch->pattern_pair_array) {
483
+ amatch->pattern_pair_array = PairArray_new(tokens);
484
+ } else {
485
+ pair_array_reactivate(amatch->pattern_pair_array);
486
+ }
487
+ tokens = rb_funcall(string, id_split, 1, regexp);
488
+ pair_array = PairArray_new(tokens);
489
+ } else {
490
+ VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
491
+ tokens = rb_ary_new4(1, &tmp);
492
+ if (!amatch->pattern_pair_array) {
493
+ amatch->pattern_pair_array = PairArray_new(tokens);
494
+ } else {
495
+ pair_array_reactivate(amatch->pattern_pair_array);
496
+ }
497
+ tokens = rb_ary_new4(1, &string);
498
+ pair_array = PairArray_new(tokens);
499
+ }
500
+ result = pair_array_match(amatch->pattern_pair_array, pair_array);
501
+ pair_array_destroy(pair_array);
502
+ return rb_float_new(result);
503
+ }
504
+
505
+ /*
506
+ * Hamming distances are computed here:
507
+ */
508
+
509
+ #define COMPUTE_HAMMING_DISTANCE \
510
+ for (i = 0, result = b_len - a_len; i < a_len; i++) { \
511
+ if (i >= b_len) { \
512
+ result += a_len - b_len; \
513
+ break; \
514
+ } \
515
+ if (b_ptr[i] != a_ptr[i]) result++; \
516
+ }
517
+
518
+ static VALUE Hamming_match(General *amatch, VALUE string)
519
+ {
520
+ char *a_ptr, *b_ptr;
521
+ int a_len, b_len;
522
+ int i, result;
523
+
524
+ Check_Type(string, T_STRING);
525
+ OPTIMIZE_TIME
526
+ COMPUTE_HAMMING_DISTANCE
527
+ return INT2FIX(result);
528
+ }
529
+
530
+ static VALUE Hamming_similar(General *amatch, VALUE string)
531
+ {
532
+ char *a_ptr, *b_ptr;
533
+ int a_len, b_len;
534
+ int i, result;
535
+
536
+ Check_Type(string, T_STRING);
537
+ OPTIMIZE_TIME
538
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
539
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
540
+ COMPUTE_HAMMING_DISTANCE
541
+ return rb_float_new(1.0 - ((double) result) / b_len);
542
+ }
543
+
544
+ /*
545
+ * Longest Common Subsequence computation
546
+ */
547
+
548
/*
 * COMPUTE_LONGEST_SUBSEQUENCE stores the length of the longest common
 * subsequence of a_ptr[0..a_len) and b_ptr[0..b_len) in result.
 *
 * Expects in scope: a_ptr, b_ptr, a_len, b_len, result, c, p, i, j and
 * int *l[2]. The table is filled from the string ends towards the front with
 * two rolling rows; both rows are allocated and released inside the macro.
 */
#define COMPUTE_LONGEST_SUBSEQUENCE                             \
    l[0] = ALLOC_N(int, b_len + 1);                             \
    l[1] = ALLOC_N(int, b_len + 1);                             \
    for (i = a_len, c = 0, p = 1; i >= 0; i--) {                \
        for (j = b_len; j >= 0; j--) {                          \
            if (i == a_len || j == b_len) {                     \
                l[c][j] = 0;                                    \
            } else if (a_ptr[i] == b_ptr[j]) {                  \
                l[c][j] = 1 + l[p][j + 1];                      \
            } else {                                            \
                int x = l[p][j], y = l[c][j + 1];               \
                if (x > y) l[c][j] = x; else l[c][j] = y;       \
            }                                                   \
        }                                                       \
        p = c;                                                  \
        c = (c + 1) % 2;                                        \
    }                                                           \
    result = l[p][0];                                           \
    /* ALLOC_N memory must be released with xfree */            \
    xfree(l[0]);                                                \
    xfree(l[1]);
568
+
569
+
570
+ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
571
+ {
572
+ char *a_ptr, *b_ptr;
573
+ int a_len, b_len;
574
+ int result, c, p, i, j, *l[2];
575
+
576
+ Check_Type(string, T_STRING);
577
+ OPTIMIZE_TIME
578
+
579
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
580
+ COMPUTE_LONGEST_SUBSEQUENCE
581
+ return INT2FIX(result);
582
+ }
583
+
584
+ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
585
+ {
586
+ char *a_ptr, *b_ptr;
587
+ int a_len, b_len;
588
+ int result, c, p, i, j, *l[2];
589
+
590
+ Check_Type(string, T_STRING);
591
+ OPTIMIZE_TIME
592
+
593
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
594
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
595
+ COMPUTE_LONGEST_SUBSEQUENCE
596
+ return rb_float_new(((double) result) / b_len);
597
+ }
598
+
599
+ /*
600
+ * Longest Common Substring computation
601
+ */
602
+
603
/*
 * COMPUTE_LONGEST_SUBSTRING stores the length of the longest common
 * substring of a_ptr[0..a_len) and b_ptr[0..b_len) in result.
 *
 * Expects in scope: a_ptr, b_ptr, a_len, b_len, result, c, p, i, j and
 * int *l[2]. l[c][j] is the length of the common run ending at a_ptr[i] and
 * b_ptr[j]; both rows are allocated and released inside the macro.
 */
#define COMPUTE_LONGEST_SUBSTRING                               \
    l[0] = ALLOC_N(int, b_len);                                 \
    MEMZERO(l[0], int, b_len);                                  \
    l[1] = ALLOC_N(int, b_len);                                 \
    MEMZERO(l[1], int, b_len);                                  \
    result = 0;                                                 \
    for (i = 0, c = 0, p = 1; i < a_len; i++) {                 \
        for (j = 0; j < b_len; j++) {                           \
            if (a_ptr[i] == b_ptr[j]) {                         \
                l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1];         \
                if (l[c][j] > result) result = l[c][j];         \
            } else {                                            \
                l[c][j] = 0;                                    \
            }                                                   \
        }                                                       \
        p = c;                                                  \
        c = (c + 1) % 2;                                        \
    }                                                           \
    /* ALLOC_N memory must be released with xfree */            \
    xfree(l[0]);                                                \
    xfree(l[1]);
623
+
624
+ static VALUE LongestSubstring_match(General *amatch, VALUE string)
625
+ {
626
+ char *a_ptr, *b_ptr;
627
+ int a_len, b_len;
628
+ int result, c, p, i, j, *l[2];
629
+
630
+ Check_Type(string, T_STRING);
631
+ OPTIMIZE_TIME
632
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
633
+ COMPUTE_LONGEST_SUBSTRING
634
+ return INT2FIX(result);
635
+ }
636
+
637
+ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
638
+ {
639
+ char *a_ptr, *b_ptr;
640
+ int a_len, b_len;
641
+ int result, c, p, i, j, *l[2];
642
+
643
+ Check_Type(string, T_STRING);
644
+ OPTIMIZE_TIME
645
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
646
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
647
+ COMPUTE_LONGEST_SUBSTRING
648
+ return rb_float_new(((double) result) / b_len);
649
+ }
650
+
651
+ /*
652
+ * Jaro computation
653
+ */
654
+
655
/*
 * COMPUTE_JARO stores the Jaro similarity (0.0 .. 1.0) of a_ptr[0..a_len)
 * and b_ptr[0..b_len) in result. Both lengths must be greater than zero.
 *
 * Expects in scope: a_ptr, b_ptr, a_len, b_len, max_dist, m, t, i, j, k,
 * low, high, int *l[2] and double result.
 *
 * l[0] / l[1] flag the characters of a / b that take part in a match, m
 * counts the matches and t the transpositions. Both flag arrays are
 * allocated and released inside the macro.
 */
#define COMPUTE_JARO                                                        \
    l[0] = ALLOC_N(int, a_len);                                             \
    MEMZERO(l[0], int, a_len);                                              \
    l[1] = ALLOC_N(int, b_len);                                             \
    MEMZERO(l[1], int, b_len);                                              \
    max_dist = ((a_len > b_len ? a_len : b_len) / 2) - 1;                   \
    /* one-character strings would otherwise get a window of -1 */          \
    if (max_dist < 0) max_dist = 0;                                         \
    m = 0;                                                                  \
    for (i = 0; i < a_len; i++) {                                           \
        low = (i > max_dist ? i - max_dist : 0);                            \
        /* clamp to the last valid index of b, not to its length */         \
        high = (i + max_dist < b_len ? i + max_dist : b_len - 1);           \
        for (j = low; j <= high; j++) {                                     \
            if (!l[1][j] && a_ptr[i] == b_ptr[j]) {                         \
                l[0][i] = 1;                                                \
                l[1][j] = 1;                                                \
                m++;                                                        \
                break;                                                      \
            }                                                               \
        }                                                                   \
    }                                                                       \
    if (m == 0) {                                                           \
        result = 0.0;                                                       \
    } else {                                                                \
        k = t = 0;                                                          \
        for (i = 0; i < a_len; i++) {                                       \
            if (l[0][i]) {                                                  \
                /* next flagged character of b, in order */                 \
                for (j = k; j < b_len; j++) {                               \
                    if (l[1][j]) {                                          \
                        k = j + 1;                                          \
                        break;                                              \
                    }                                                       \
                }                                                           \
                if (a_ptr[i] != b_ptr[j]) {                                 \
                    t++;                                                    \
                }                                                           \
            }                                                               \
        }                                                                   \
        t = t / 2;                                                          \
        result = (((double)m)/a_len + ((double)m)/b_len +                   \
                  ((double)(m-t))/m)/3.0;                                   \
    }                                                                       \
    /* the flag arrays were never released before */                        \
    xfree(l[0]);                                                            \
    xfree(l[1]);
694
+
695
/*
 * LOWERCASE_STRINGS replaces a_ptr and b_ptr by freshly allocated copies in
 * which every lower-case character has been converted with toupper (despite
 * the macro's name), so that a later comparison ignores case. The original
 * buffers are left untouched; the copies are released by FREE_STRINGS.
 *
 * Expects in scope: a_ptr, b_ptr, a_len, b_len and i. Declares ying and yang,
 * so it has to be expanded at the start of its own block.
 */
#define LOWERCASE_STRINGS                                                   \
    char *ying = ALLOC_N(char, a_len);                                      \
    char *yang = ALLOC_N(char, b_len);                                      \
    MEMCPY(ying, a_ptr, char, a_len);                                       \
    MEMCPY(yang, b_ptr, char, b_len);                                       \
    a_ptr = ying;                                                           \
    b_ptr = yang;                                                           \
    /* <ctype.h> functions need an unsigned char value: a negative */       \
    /* plain char (bytes >= 0x80) is undefined behaviour.          */       \
    for (i = 0; i < a_len; i++) {                                           \
        a_ptr[i] = (char) toupper((unsigned char) a_ptr[i]);                \
    }                                                                       \
    for (i = 0; i < b_len; i++) {                                           \
        b_ptr[i] = (char) toupper((unsigned char) b_ptr[i]);                \
    }
708
+
709
+ #define FREE_STRINGS \
710
+ xfree(a_ptr); \
711
+ xfree(b_ptr);
712
+
713
+ static VALUE Jaro_match(Jaro *amatch, VALUE string)
714
+ {
715
+ char *a_ptr, *b_ptr;
716
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high;
717
+ int *l[2];
718
+ double result;
719
+
720
+ Check_Type(string, T_STRING);
721
+ OPTIMIZE_TIME
722
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
723
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
724
+ if (amatch->ignore_case) {
725
+ LOWERCASE_STRINGS
726
+ }
727
+ COMPUTE_JARO
728
+ if (amatch->ignore_case) {
729
+ FREE_STRINGS
730
+ }
731
+ return rb_float_new(result);
732
+ }
733
+
734
+ /*
735
+ * Jaro-Winkler computation
736
+ */
737
+
738
+ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
739
+ {
740
+ char *a_ptr, *b_ptr;
741
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high, n;
742
+ int *l[2];
743
+ double result;
744
+
745
+ Check_Type(string, T_STRING);
746
+ OPTIMIZE_TIME
747
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
748
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
749
+ if (amatch->ignore_case) {
750
+ LOWERCASE_STRINGS
751
+ }
752
+ COMPUTE_JARO
753
+ n = 0;
754
+ for (i = 0; i < (a_len >= 4 ? 4 : a_len); i++) {
755
+ if (a_ptr[i] == b_ptr[i]) {
756
+ n++;
757
+ } else {
758
+ break;
759
+ }
760
+ }
761
+ result = result + n*amatch->scaling_factor*(1-result);
762
+ if (amatch->ignore_case) {
763
+ FREE_STRINGS
764
+ }
765
+ return rb_float_new(result);
766
+ }
767
+
768
+ /*
769
+ * Ruby API
770
+ */
771
+
772
+ /*
773
+ * Document-class: Amatch::Levenshtein
774
+ *
775
+ * The Levenshtein edit distance is defined as the minimal costs involved to
776
+ * transform one string into another by using three elementary operations:
777
+ * deletion, insertion and substitution of a character. To transform "water"
778
+ * into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
779
+ * -> "n": "winer" and delete "r": "wine". The edit distance between "water"
780
+ * and "wine" is 3, because you have to apply three operations. The edit
781
+ * distance between "wine" and "wine" is 0 of course: no operation is
782
+ * necessary for the transformation -- they're already the same string. It's
783
+ * easy to see that more similar strings have smaller edit distances than
784
+ * strings that differ a lot.
785
+ */
786
+
787
+ DEF_RB_FREE(Levenshtein, General)
788
+
789
+ /*
790
+ * call-seq: new(pattern)
791
+ *
792
+ * Creates a new Amatch::Levenshtein instance from <code>pattern</code>.
793
+ */
794
+ static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern)
795
+ {
796
+ GET_STRUCT(General)
797
+ General_pattern_set(amatch, pattern);
798
+ return self;
799
+ }
800
+
801
+ DEF_CONSTRUCTOR(Levenshtein, General)
802
+
803
+ /*
804
+ * call-seq: match(strings) -> results
805
+ *
806
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
807
+ * against <code>strings</code>. It returns the number of operations, the
808
+ * Levenshtein distance. <code>strings</code> has to be either a String or an
809
+ * Array of Strings. The returned <code>results</code> are either a Fixnum or
810
+ * an Array of Fixnums respectively.
811
+ */
812
+ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
813
+ {
814
+ GET_STRUCT(General)
815
+ return General_iterate_strings(amatch, strings, Levenshtein_match);
816
+ }
817
+
818
+ /*
819
+ * call-seq: similar(strings) -> results
820
+ *
821
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
822
+ * against <code>strings</code>, and compute a Levenshtein distance metric
823
+ * number between 0.0 for very dissimilar strings and 1.0 for an exact match.
824
+ * <code>strings</code> has to be either a String or an Array of Strings. The
825
+ * returned <code>results</code> are either a Float or an Array of Floats
826
+ * respectively.
827
+ */
828
+ static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
829
+ {
830
+ GET_STRUCT(General)
831
+ return General_iterate_strings(amatch, strings, Levenshtein_similar);
832
+ }
833
+
834
+ /*
835
+ * call-seq: levenshtein_similar(strings) -> results
836
+ *
837
+ * If called on a String, this string is used as a Amatch::Levenshtein#pattern
838
+ * to match against <code>strings</code>. It returns a Levenshtein distance
839
+ * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
840
+ * match. <code>strings</code> has to be either a String or an Array of
841
+ * Strings. The returned <code>results</code> are either a Float or an Array of
842
+ * Floats respectively.
843
+ */
844
+ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
845
+ {
846
+ VALUE amatch = rb_Levenshtein_new(rb_cLevenshtein, self);
847
+ return rb_Levenshtein_similar(amatch, strings);
848
+ }
849
+
850
+ /*
851
+ * call-seq: search(strings) -> results
852
+ *
853
+ * searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
854
+ * edit distance (the sum of character operations) as a Fixnum value, by greedy
855
+ * trimming prefixes or postfixes of the match. <code>strings</code> has
856
+ * to be either a String or an Array of Strings. The returned
857
+ * <code>results</code> are either a Float or an Array of Floats respectively.
858
+ */
859
+ static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
860
+ {
861
+ GET_STRUCT(General)
862
+ return General_iterate_strings(amatch, strings, Levenshtein_search);
863
+ }
864
+
865
+ /*
866
+ * Document-class: Amatch::Sellers
867
+ *
868
+ * The Sellers edit distance is very similar to the Levenshtein edit distance.
869
+ * The difference is, that you can also specify different weights for every
870
+ * operation to prefer special operations over others. This weighted extension
871
+ * of the edit distance is also known as the Needleman-Wunsch
872
+ * distance.
873
+ */
874
+
875
+ DEF_RB_FREE(Sellers, Sellers)
876
+
877
+ /*
878
+ * Document-method: substitution
879
+ *
880
+ * call-seq: substitution -> weight
881
+ *
882
+ * Returns the weight of the substitution operation, that is used to compute
883
+ * the Sellers distance.
884
+ */
885
+ DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
886
+ rb_float_new)
887
+
888
+ /*
889
+ * Document-method: deletion
890
+ *
891
+ * call-seq: deletion -> weight
892
+ *
893
+ * Returns the weight of the deletion operation, that is used to compute
894
+ * the Sellers distance.
895
+ */
896
+ DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
897
+ rb_float_new)
898
+
899
+ /*
900
+ * Document-method: insertion
901
+ *
902
+ * call-seq: insertion -> weight
903
+ *
904
+ * Returns the weight of the insertion operation, that is used to compute
905
+ * the Sellers distance.
906
+ */
907
+ DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
908
+ rb_float_new)
909
+
910
+ /*
911
+ * Document-method: substitution=
912
+ *
913
+ * call-seq: substitution=(weight)
914
+ *
915
+ * Sets the weight of the substitution operation, that is used to compute
916
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
917
+ * should be a Float value >= 0.0.
918
+ */
919
+ DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
920
+ double, CAST2FLOAT, FLOAT2C, >= 0)
921
+
922
+ /*
923
+ * Document-method: deletion=
924
+ *
925
+ * call-seq: deletion=(weight)
926
+ *
927
+ * Sets the weight of the deletion operation, that is used to compute
928
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
929
+ * should be a Float value >= 0.0.
930
+ */
931
+ DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
932
+ double, CAST2FLOAT, FLOAT2C, >= 0)
933
+
934
+ /*
935
+ * Document-method: insertion=
936
+ *
937
+ * call-seq: insertion=(weight)
938
+ *
939
+ * Sets the weight of the insertion operation, that is used to compute
940
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
941
+ * should be a Float value >= 0.0.
942
+ */
943
+ DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
944
+ double, CAST2FLOAT, FLOAT2C, >= 0)
945
+
946
+ /*
947
+ * Resets all weights (substitution, deletion, and insertion) to 1.0.
948
+ */
949
+ static VALUE rb_Sellers_reset_weights(VALUE self)
950
+ {
951
+ GET_STRUCT(Sellers)
952
+ Sellers_reset_weights(amatch);
953
+ return self;
954
+ }
955
+
956
+ /*
957
+ * call-seq: new(pattern)
958
+ *
959
+ * Creates a new Amatch::Sellers instance from <code>pattern</code>,
960
+ * with all weights initially set to 1.0.
961
+ */
962
+ static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern)
963
+ {
964
+ GET_STRUCT(Sellers)
965
+ Sellers_pattern_set(amatch, pattern);
966
+ Sellers_reset_weights(amatch);
967
+ return self;
968
+ }
969
+
970
+ DEF_CONSTRUCTOR(Sellers, Sellers)
971
+
972
+ /*
973
+ * Document-method: pattern
974
+ *
975
+ * call-seq: pattern -> pattern string
976
+ *
977
+ * Returns the current pattern string of this Amatch::Sellers instance.
978
+ */
979
+
980
+ /*
981
+ * Document-method: pattern=
982
+ *
983
+ * call-seq: pattern=(pattern)
984
+ *
985
+ * Sets the current pattern string of this Amatch::Sellers instance to
986
+ * <code>pattern</code>.
987
+ */
988
+
989
+ /*
990
+ * call-seq: match(strings) -> results
991
+ *
992
+ * Uses this Amatch::Sellers instance to match Sellers#pattern against
993
+ * <code>strings</code>, while taking into account the given weights. It
994
+ * returns the number of weighted character operations, the Sellers distance.
995
+ * <code>strings</code> has to be either a String or an Array of Strings. The
996
+ * returned <code>results</code> are either a Float or an Array of Floats
997
+ * respectively.
998
+ */
999
+ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
1000
+ {
1001
+ GET_STRUCT(Sellers)
1002
+ return Sellers_iterate_strings(amatch, strings, Sellers_match);
1003
+ }
1004
+
1005
+ /*
1006
+ * call-seq: similar(strings) -> results
1007
+ *
1008
+ * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
1009
+ * against <code>strings</code> (taking into account the given weights), and
1010
+ * compute a Sellers distance metric number between 0.0 for very unsimilar
1011
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1012
+ * String or an Array of Strings. The returned <code>results</code> are either
1013
+ * a Fixnum or an Array of Fixnums
1014
+ * respectively.
1015
+ */
1016
+ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
1017
+ {
1018
+ GET_STRUCT(Sellers)
1019
+ return Sellers_iterate_strings(amatch, strings, Sellers_similar);
1020
+ }
1021
+
1022
+ /*
1023
+ * call-seq: search(strings) -> results
1024
+ *
1025
+ * searches Sellers#pattern in <code>strings</code> and returns the edit
1026
+ * distance (the sum of weighted character operations) as a Float value, by
1027
+ * greedy trimming prefixes or postfixes of the match. <code>strings</code> has
1028
+ * to be either a String or an Array of Strings. The returned
1029
+ * <code>results</code> are either a Float or an Array of Floats respectively.
1030
+ */
1031
+ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
1032
+ {
1033
+ GET_STRUCT(Sellers)
1034
+ return Sellers_iterate_strings(amatch, strings, Sellers_search);
1035
+ }
1036
+
1037
+ /*
1038
+ * Document-class: Amatch::PairDistance
1039
+ *
1040
+ * The pair distance between two strings is based on the number of adjacent
1041
+ * character pairs, that are contained in both strings. The similarity
1042
+ * metric of two strings s1 and s2 is
1043
+ * 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
1044
+ * If it is 1.0 the two strings are an exact match, if less than 1.0 they
1045
+ * are more dissimilar. The advantage of considering adjacent characters, is to
1046
+ * take account not only of the characters, but also of the character ordering
1047
+ * in the original strings.
1048
+ *
1049
+ * This metric is very capable of finding similarities in natural languages.
1050
+ * It is explained in more detail in Simon White's article "How to Strike a
1051
+ * Match", located at this url:
1052
+ * http://www.catalysoft.com/articles/StrikeAMatch.html
1053
+ * It is also very similar (a special case) to the method described under
1054
+ * http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
1055
+ * for Approximate String Processing."
1056
+ */
1057
+ DEF_RB_FREE(PairDistance, PairDistance)
1058
+
1059
+ /*
1060
+ * call-seq: new(pattern)
1061
+ *
1062
+ * Creates a new Amatch::PairDistance instance from <code>pattern</code>.
1063
+ */
1064
+ static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
1065
+ {
1066
+ GET_STRUCT(PairDistance)
1067
+ PairDistance_pattern_set(amatch, pattern);
1068
+ return self;
1069
+ }
1070
+
1071
+ DEF_CONSTRUCTOR(PairDistance, PairDistance)
1072
+
1073
+ /*
1074
+ * call-seq: match(strings, regexp = /\s+/) -> results
1075
+ *
1076
+ * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
1077
+ * <code>strings</code>. It returns the pair distance measure, that is a
1078
+ * returned value of 1.0 is an exact match, partial matches are lower
1079
+ * values, while 0.0 means no match at all.
1080
+ *
1081
+ * <code>strings</code> has to be either a String or an
1082
+ * Array of Strings. The argument <code>regexp</code> is used to split the
1083
+ * pattern and strings into tokens first. It defaults to /\s+/. If the
1084
+ * splitting should be omitted, call the method with nil as <code>regexp</code>
1085
+ * explicitly.
1086
+ *
1087
+ * The returned <code>results</code> are either a Float or an
1088
+ * Array of Floats respectively.
1089
+ */
1090
+ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
1091
+ {
1092
+ VALUE result, strings, regexp = Qnil;
1093
+ int use_regexp;
1094
+ GET_STRUCT(PairDistance)
1095
+
1096
+ rb_scan_args(argc, argv, "11", &strings, &regexp);
1097
+ use_regexp = NIL_P(regexp) && argc != 2;
1098
+ if (TYPE(strings) == T_STRING) {
1099
+ result = PairDistance_match(amatch, strings, regexp, use_regexp);
1100
+ } else {
1101
+ Check_Type(strings, T_ARRAY);
1102
+ int i;
1103
+ result = rb_ary_new2(RARRAY_LEN(strings));
1104
+ for (i = 0; i < RARRAY_LEN(strings); i++) {
1105
+ VALUE string = rb_ary_entry(strings, i);
1106
+ if (TYPE(string) != T_STRING) {
1107
+ rb_raise(rb_eTypeError,
1108
+ "array has to contain only strings (%s given)",
1109
+ NIL_P(string) ?
1110
+ "NilClass" :
1111
+ rb_class2name(CLASS_OF(string)));
1112
+ }
1113
+ rb_ary_push(result,
1114
+ PairDistance_match(amatch, string, regexp, use_regexp));
1115
+ }
1116
+ }
1117
+ pair_array_destroy(amatch->pattern_pair_array);
1118
+ amatch->pattern_pair_array = NULL;
1119
+ return result;
1120
+ }
1121
+
1122
+ /*
1123
+ * call-seq: pair_distance_similar(strings) -> results
1124
+ *
1125
+ * If called on a String, this string is used as a Amatch::PairDistance#pattern
1126
+ * to match against <code>strings</code> using /\s+/ as the tokenizing regular
1127
+ * expression. It returns a pair distance metric number between 0.0 for very
1128
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1129
+ * either a String or an Array of Strings. The returned <code>results</code>
1130
+ * are either a Float or an Array of Floats respectively.
1131
+ */
1132
+ static VALUE rb_str_pair_distance_similar(VALUE self, VALUE strings)
1133
+ {
1134
+ VALUE amatch = rb_PairDistance_new(rb_cPairDistance, self);
1135
+ return rb_PairDistance_match(1, &strings, amatch);
1136
+ }
1137
+
1138
+ /*
1139
+ * Document-class: Amatch::Hamming
1140
+ *
1141
+ * This class computes the Hamming distance between two strings.
1142
+ *
1143
+ * The Hamming distance between two strings is the number of characters, that
1144
+ * are different. Thus a hamming distance of 0 means an exact
1145
+ * match, a hamming distance of 1 means one character is different, and so on.
1146
+ * If one string is longer than the other string, the missing characters are
1147
+ * counted as different characters.
1148
+ */
1149
+
1150
+ DEF_RB_FREE(Hamming, General)
1151
+
1152
+ /*
1153
+ * call-seq: new(pattern)
1154
+ *
1155
+ * Creates a new Amatch::Hamming instance from <code>pattern</code>.
1156
+ */
1157
+ static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
1158
+ {
1159
+ GET_STRUCT(General)
1160
+ General_pattern_set(amatch, pattern);
1161
+ return self;
1162
+ }
1163
+
1164
+ DEF_CONSTRUCTOR(Hamming, General)
1165
+
1166
+ /*
1167
+ * call-seq: match(strings) -> results
1168
+ *
1169
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1170
+ * <code>strings</code>, that is compute the hamming distance between
1171
+ * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
1172
+ * be either a String or an Array of Strings. The returned <code>results</code>
1173
+ * are either a Fixnum or an Array of Fixnums respectively.
1174
+ */
1175
+ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1176
+ {
1177
+ GET_STRUCT(General)
1178
+ return General_iterate_strings(amatch, strings, Hamming_match);
1179
+ }
1180
+
1181
+ /*
1182
+ * call-seq: similar(strings) -> results
1183
+ *
1184
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1185
+ * <code>strings</code>, and compute a Hamming distance metric number between
1186
+ * 0.0 for very unsimilar strings and 1.0 for an exact match.
1187
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1188
+ * returned <code>results</code> are either a Fixnum or an Array of Fixnums
1189
+ * respectively.
1190
+ */
1191
+ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1192
+ {
1193
+ GET_STRUCT(General)
1194
+ return General_iterate_strings(amatch, strings, Hamming_similar);
1195
+ }
1196
+
1197
+ /*
1198
+ * call-seq: hamming_similar(strings) -> results
1199
+ *
1200
+ * If called on a String, this string is used as a Amatch::Hamming#pattern to
1201
+ * match against <code>strings</code>. It returns a Hamming distance metric
1202
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1203
+ * <code>strings</code>
1204
+ * has to be either a String or an Array of Strings. The returned
1205
+ * <code>results</code> are either a Float or an Array of Floats respectively.
1206
+ */
1207
+ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
1208
+ {
1209
+ VALUE amatch = rb_Hamming_new(rb_cHamming, self);
1210
+ return rb_Hamming_similar(amatch, strings);
1211
+ }
1212
+
1213
+
1214
+ /*
1215
+ * Document-class: Amatch::LongestSubsequence
1216
+ *
1217
+ * This class computes the length of the longest subsequence common to two
1218
+ * strings. A subsequence doesn't have to be contiguous. The longer the common
1219
+ * subsequence is, the more similar the two strings will be.
1220
+ *
1221
+ * The longest common subsequence between "test" and "test" is of length 4,
1222
+ * because "test" itself is this subsequence. The longest common subsequence
1223
+ * between "test" and "east" is "e", "s", "t" and the length of the
1224
+ * sequence is 3.
1225
+ */
1226
+ DEF_RB_FREE(LongestSubsequence, General)
1227
+
1228
+ /*
1229
+ * call-seq: new(pattern)
1230
+ *
1231
+ * Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
1232
+ */
1233
+ static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
1234
+ {
1235
+ GET_STRUCT(General)
1236
+ General_pattern_set(amatch, pattern);
1237
+ return self;
1238
+ }
1239
+
1240
+ DEF_CONSTRUCTOR(LongestSubsequence, General)
1241
+
1242
+ /*
1243
+ * call-seq: match(strings) -> results
1244
+ *
1245
+ * Uses this Amatch::LongestSubsequence instance to match
1246
+ * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1247
+ * length of the longest common subsequence. <code>strings</code> has to be
1248
+ * either a String or an Array of Strings. The returned <code>results</code>
1249
+ * are either a Fixnum or an Array of Fixnums respectively.
1250
+ */
1251
+ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1252
+ {
1253
+ GET_STRUCT(General)
1254
+ return General_iterate_strings(amatch, strings, LongestSubsequence_match);
1255
+ }
1256
+
1257
+ /*
1258
+ * call-seq: similar(strings) -> results
1259
+ *
1260
+ * Uses this Amatch::LongestSubsequence instance to match
1261
+ * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1262
+ * a longest substring distance metric number between 0.0 for very unsimilar
1263
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1264
+ * String or an Array of Strings. The returned <code>results</code> are either
1265
+ * a Fixnum or an Array of Fixnums
1266
+ */
1267
+ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1268
+ {
1269
+ GET_STRUCT(General)
1270
+ return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
1271
+ }
1272
+
1273
+ /*
1274
+ * call-seq: longest_subsequence_similar(strings) -> results
1275
+ *
1276
+ * If called on a String, this string is used as a
1277
+ * Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
1278
+ * returns a longest subsequence distance metric number between 0.0 for very
1279
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1280
+ * either a String or an Array of Strings. The returned <code>results</code>
1281
+ * are either a Float or an Array of Floats respectively.
1282
+ */
1283
+ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1284
+ {
1285
+ VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
1286
+ return rb_LongestSubsequence_similar(amatch, strings);
1287
+ }
1288
+
1289
+ /*
1290
+ * Document-class: Amatch::LongestSubstring
1291
+ *
1292
+ * The longest common substring is the longest substring, that is part of
1293
+ * two strings. A substring is contiguous, while a subsequence need not
1294
+ * be. The longer the common substring is, the more similar the two strings
1295
+ * will be.
1296
+ *
1297
+ * The longest common substring between 'string' and 'string' is 'string'
1298
+ * again, thus the longest common substring length is 6. The longest common
1299
+ * substring between 'string' and 'storing' is 'ring', thus the longest common
1300
+ * substring length is 4.
1301
+ */
1302
+
1303
+ DEF_RB_FREE(LongestSubstring, General)
1304
+
1305
+ /*
1306
+ * call-seq: new(pattern)
1307
+ *
1308
+ * Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
1309
+ */
1310
+ static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
1311
+ {
1312
+ GET_STRUCT(General)
1313
+ General_pattern_set(amatch, pattern);
1314
+ return self;
1315
+ }
1316
+
1317
+ DEF_CONSTRUCTOR(LongestSubstring, General)
1318
+
1319
+ /*
1320
+ * call-seq: match(strings) -> results
1321
+ *
1322
+ * Uses this Amatch::LongestSubstring instance to match
1323
+ * LongestSubstring#pattern against <code>strings</code>, that is compute the
1324
+ * length of the longest common substring. <code>strings</code> has to be
1325
+ * either a String or an Array of Strings. The returned <code>results</code>
1326
+ * are either a Fixnum or an Array of Fixnums respectively.
1327
+ */
1328
+ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1329
+ {
1330
+ GET_STRUCT(General)
1331
+ return General_iterate_strings(amatch, strings, LongestSubstring_match);
1332
+ }
1333
+
1334
+ /*
1335
+ * call-seq: similar(strings) -> results
1336
+ *
1337
+ * Uses this Amatch::LongestSubstring instance to match
1338
+ * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1339
+ * longest substring distance metric number between 0.0 for very unsimilar
1340
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1341
+ * String or an Array of Strings. The returned <code>results</code> are either
1342
+ * a Fixnum or an Array of Fixnums
1343
+ * respectively.
1344
+ */
1345
+ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
1346
+ {
1347
+ GET_STRUCT(General)
1348
+ return General_iterate_strings(amatch, strings, LongestSubstring_similar);
1349
+ }
1350
+
1351
+ /*
1352
+ * call-seq: longest_substring_similar(strings) -> results
1353
+ *
1354
+ * If called on a String, this string is used as a
1355
+ * Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
1356
+ * returns a longest substring distance metric number between 0.0 for very
1357
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1358
+ * either a String or an Array of Strings. The returned <code>results</code>
1359
+ * are either a Float or an Array of Floats respectively.
1360
+ */
1361
+ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1362
+ {
1363
+ VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
1364
+ return rb_LongestSubstring_similar(amatch, strings);
1365
+ }
1366
+
1367
+ /*
1368
+ * Document-class: Amatch::Jaro
1369
+ *
1370
+ * This class computes the Jaro metric for two strings.
1371
+ * The Jaro metric computes the similarity between 0 (no match)
1372
+ * and 1 (exact match) by looking for matching and transposed characters.
1373
+ */
1374
+ DEF_RB_FREE(Jaro, Jaro)
1375
+
1376
+ /*
1377
+ * Document-method: ignore_case
1378
+ *
1379
+ * call-seq: ignore_case -> true/false
1380
+ *
1381
+ * Returns whether case is ignored when computing matching characters.
1382
+ */
1383
+ DEF_RB_READER(Jaro, rb_Jaro_ignore_case, ignore_case, C2BOOL)
1384
+
1385
+ /*
1386
+ * Document-method: ignore_case=
1387
+ *
1388
+ * call-seq: ignore_case=(true/false)
1389
+ *
1390
+ * Sets whether case is ignored when computing matching characters.
1391
+ */
1392
+ DEF_RB_WRITER(Jaro, rb_Jaro_ignore_case_set, ignore_case,
1393
+ int, CAST2BOOL, BOOL2C, != Qundef)
1394
+
1395
+ /*
1396
+ * call-seq: new(pattern)
1397
+ *
1398
+ * Creates a new Amatch::Jaro instance from <code>pattern</code>.
1399
+ */
1400
+ static VALUE rb_Jaro_initialize(VALUE self, VALUE pattern)
1401
+ {
1402
+ GET_STRUCT(Jaro)
1403
+ Jaro_pattern_set(amatch, pattern);
1404
+ amatch->ignore_case = 1;
1405
+ return self;
1406
+ }
1407
+
1408
+ DEF_CONSTRUCTOR(Jaro, Jaro)
1409
+
1410
+ /*
1411
+ * call-seq: match(strings) -> results
1412
+ *
1413
+ * Uses this Amatch::Jaro instance to match
1414
+ * Jaro#pattern against <code>strings</code>, that is compute the
1415
+ * jaro metric with the strings. <code>strings</code> has to be
1416
+ * either a String or an Array of Strings. The returned <code>results</code>
1417
+ * are either a Float or an Array of Floats respectively.
1418
+ */
1419
+ static VALUE rb_Jaro_match(VALUE self, VALUE strings)
1420
+ {
1421
+ GET_STRUCT(Jaro)
1422
+ return Jaro_iterate_strings(amatch, strings, Jaro_match);
1423
+ }
1424
+
1425
+ /*
1426
+ * call-seq: jaro_similar(strings) -> results
1427
+ *
1428
+ * If called on a String, this string is used as a
1429
+ * Amatch::Jaro#pattern to match against <code>strings</code>. It
1430
+ * returns a Jaro metric number between 0.0 for very
1431
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1432
+ * either a String or an Array of Strings. The returned <code>results</code>
1433
+ * are either a Float or an Array of Floats respectively.
1434
+ */
1435
+ static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
1436
+ {
1437
+ VALUE amatch = rb_Jaro_new(rb_cJaro, self);
1438
+ return rb_Jaro_match(amatch, strings);
1439
+ }
1440
+
1441
+ /*
1442
+ * Document-class: Amatch::JaroWinkler
1443
+ *
1444
+ * This class computes the Jaro-Winkler metric for two strings.
1445
+ * The Jaro-Winkler metric computes the similarity between 0 (no match)
1446
+ * and 1 (exact match) by looking for matching and transposed characters.
1447
+ *
1448
+ * It is a variant of the Jaro metric, with additional weighting towards
1449
+ * common prefixes.
1450
+ */
1451
+ DEF_RB_FREE(JaroWinkler, JaroWinkler)
1452
+
1453
+ /*
1454
+ * Document-method: ignore_case
1455
+ *
1456
+ * call-seq: ignore_case -> true/false
1457
+ *
1458
+ * Returns whether case is ignored when computing matching characters.
1459
+ * Default is true.
1460
+ */
1461
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_ignore_case, ignore_case, C2BOOL)
1462
+
1463
+ /*
1464
+ * Document-method: scaling_factor
1465
+ *
1466
+ * call-seq: scaling_factor -> weight
1467
+ *
1468
+ * The scaling factor is how much weight to give common prefixes.
1469
+ * Default is 0.1.
1470
+ */
1471
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_scaling_factor, scaling_factor, rb_float_new)
1472
+
1473
+ /*
1474
+ * Document-method: ignore_case=
1475
+ *
1476
+ * call-seq: ignore_case=(true/false)
1477
+ *
1478
+ * Sets whether case is ignored when computing matching characters.
1479
+ */
1480
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_ignore_case_set, ignore_case,
1481
+ int, CAST2BOOL, BOOL2C, != Qundef)
1482
+
1483
+ /*
1484
+ * Document-method: scaling_factor=
1485
+ *
1486
+ * call-seq: scaling_factor=(weight)
1487
+ *
1488
+ * Sets the weight to give common prefixes.
1489
+ */
1490
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_scaling_factor_set, scaling_factor,
1491
+ double, CAST2FLOAT, FLOAT2C, >= 0)
1492
+
1493
+ /*
1494
+ * call-seq: new(pattern)
1495
+ *
1496
+ * Creates a new Amatch::JaroWinkler instance from <code>pattern</code>.
1497
+ */
1498
+ static VALUE rb_JaroWinkler_initialize(VALUE self, VALUE pattern)
1499
+ {
1500
+ GET_STRUCT(JaroWinkler)
1501
+ JaroWinkler_pattern_set(amatch, pattern);
1502
+ amatch->ignore_case = 1;
1503
+ amatch->scaling_factor = 0.1;
1504
+ return self;
1505
+ }
1506
+
1507
+ DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
1508
+
1509
+ /*
1510
+ * call-seq: match(strings) -> results
1511
+ *
1512
+ * Uses this Amatch::Jaro instance to match
1513
+ * Jaro#pattern against <code>strings</code>, that is compute the
1514
+ * jaro metric with the strings. <code>strings</code> has to be
1515
+ * either a String or an Array of Strings. The returned <code>results</code>
1516
+ * are either a Float or an Array of Floats respectively.
1517
+ */
1518
+ static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
1519
+ {
1520
+ GET_STRUCT(JaroWinkler)
1521
+ return JaroWinkler_iterate_strings(amatch, strings, JaroWinkler_match);
1522
+ }
1523
+
1524
+ /*
1525
+ * call-seq: jarowinkler_similar(strings) -> results
1526
+ *
1527
+ * If called on a String, this string is used as a
1528
+ * Amatch::JaroWinkler#pattern to match against <code>strings</code>. It
1529
+ * returns a Jaro-Winkler metric number between 0.0 for very
1530
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1531
+ * either a String or an Array of Strings. The returned <code>results</code>
1532
+ * are either a Float or an Array of Floats respectively.
1533
+ */
1534
+ static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
1535
+ {
1536
+ VALUE amatch = rb_JaroWinkler_new(rb_cJaro, self);
1537
+ return rb_JaroWinkler_match(amatch, strings);
1538
+ }
1539
+
1540
+ void Init_amatch()
1541
+ {
1542
+ rb_require("amatch/version");
1543
+ rb_mAmatch = rb_define_module("Amatch");
1544
+
1545
+ /* Levenshtein */
1546
+ rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
1547
+ rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
1548
+ rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
1549
+ rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0);
1550
+ rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
1551
+ rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
1552
+ rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
1553
+ rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1554
+ rb_define_method(rb_cString, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1555
+
1556
+ /* Sellers */
1557
+ rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1558
+ rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
1559
+ rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
1560
+ rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0);
1561
+ rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
1562
+ rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
1563
+ rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
1564
+ rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
1565
+ rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
1566
+ rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
1567
+ rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
1568
+ rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
1569
+ rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
1570
+ rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
1571
+ rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1);
1572
+
1573
+ /* Hamming */
1574
+ rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
1575
+ rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
1576
+ rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
1577
+ rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
1578
+ rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
1579
+ rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
1580
+ rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
1581
+ rb_define_method(rb_cString, "hamming_similar", rb_str_hamming_similar, 1);
1582
+
1583
+ /* Pair Distance Metric */
1584
+ rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
1585
+ rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
1586
+ rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
1587
+ rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
1588
+ rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
1589
+ rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
1590
+ rb_define_alias(rb_cPairDistance, "similar", "match");
1591
+ rb_define_method(rb_cString, "pair_distance_similar", rb_str_pair_distance_similar, 1);
1592
+
1593
+ /* Longest Common Subsequence */
1594
+ rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
1595
+ rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
1596
+ rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
1597
+ rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
1598
+ rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
1599
+ rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
1600
+ rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
1601
+ rb_define_method(rb_cString, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1602
+
1603
+ /* Longest Common Substring */
1604
+ rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
1605
+ rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
1606
+ rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
1607
+ rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
1608
+ rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
1609
+ rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
1610
+ rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1611
+ rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1612
+
1613
+ /* Jaro */
1614
+ rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
1615
+ rb_define_alloc_func(rb_cJaro, rb_Jaro_s_allocate);
1616
+ rb_define_method(rb_cJaro, "initialize", rb_Jaro_initialize, 1);
1617
+ rb_define_method(rb_cJaro, "pattern", rb_Jaro_pattern, 0);
1618
+ rb_define_method(rb_cJaro, "pattern=", rb_Jaro_pattern_set, 1);
1619
+ rb_define_method(rb_cJaro, "ignore_case", rb_Jaro_ignore_case, 0);
1620
+ rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
1621
+ rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
1622
+ rb_define_alias(rb_cJaro, "similar", "match");
1623
+ rb_define_method(rb_cString, "jaro_similar", rb_str_jaro_similar, 1);
1624
+
1625
+ /* Jaro-Winkler */
1626
+ rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
1627
+ rb_define_alloc_func(rb_cJaroWinkler, rb_JaroWinkler_s_allocate);
1628
+ rb_define_method(rb_cJaroWinkler, "initialize", rb_JaroWinkler_initialize, 1);
1629
+ rb_define_method(rb_cJaroWinkler, "pattern", rb_JaroWinkler_pattern, 0);
1630
+ rb_define_method(rb_cJaroWinkler, "pattern=", rb_JaroWinkler_pattern_set, 1);
1631
+ rb_define_method(rb_cJaroWinkler, "ignore_case", rb_JaroWinkler_ignore_case, 0);
1632
+ rb_define_method(rb_cJaroWinkler, "ignore_case=", rb_JaroWinkler_ignore_case_set, 1);
1633
+ rb_define_method(rb_cJaroWinkler, "scaling_factor", rb_JaroWinkler_scaling_factor, 0);
1634
+ rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
1635
+ rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
1636
+ rb_define_alias(rb_cJaroWinkler, "similar", "match");
1637
+ rb_define_method(rb_cString, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
1638
+
1639
+ id_split = rb_intern("split");
1640
+ id_to_f = rb_intern("to_f");
1641
+ }