amatch 0.2.5-x86-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,141 @@
1
+ # vim: set filetype=ruby et sw=2 ts=2:
2
+
3
+ begin
4
+ require 'rake/gempackagetask'
5
+ require 'rake/extensiontask'
6
+ rescue LoadError
7
+ end
8
+
9
+ require 'rbconfig'
10
+ include Config
11
+
12
+ require 'rake/clean'
13
+ CLEAN.include 'coverage', 'doc'
14
+ require 'rake/testtask'
15
+
16
+ MAKE = ENV['MAKE'] || %w[gmake make].find { |c| system(c, '-v') }
17
+ PKG_NAME = 'amatch'
18
+ PKG_VERSION = File.read('VERSION').chomp
19
+ PKG_FILES = FileList["**/*"].exclude(/^(pkg|coverage|doc|tmp)/)
20
+ PKG_DOC_FILES = [ "ext/amatch.c" ].concat(Dir['lib/**/*.rb']) << 'README'
21
+
22
+ desc "Run unit tests"
23
+ task :test => :compile_ext do
24
+ sh %{testrb -Iext:lib tests/test_*.rb}
25
+ end
26
+
27
+ desc "Compiling library"
28
+ task :compile_ext do
29
+ cd 'ext' do
30
+ ruby %{extconf.rb}
31
+ sh MAKE
32
+ end
33
+ end
34
+
35
+ desc "Installing library"
36
+ task :install => :test do
37
+ src, = Dir['ext/amatch.*'].reject { |x| /\.[co]$/.match x }
38
+ filename = File.basename(src)
39
+ dst = File.join(CONFIG["sitelibdir"], filename)
40
+ install(src, dst, :verbose => true)
41
+ end
42
+
43
+ desc "Removing generated files"
44
+ task :clean do
45
+ cd 'ext' do
46
+ ruby 'extconf.rb'
47
+ sh "#{MAKE} distclean" if File.exist?('Makefile')
48
+ end
49
+ end
50
+
51
+ desc "Build the documentation"
52
+ task :doc do
53
+ sh "rdoc -m README -t '#{PKG_NAME} - Approximate Matching' #{PKG_DOC_FILES * ' '}"
54
+ end
55
+
56
+ if defined?(Gem) and defined?(Rake::GemPackageTask) and
57
+ defined?(Rake::ExtensionTask)
58
+ then
59
+ spec_src = <<-GEM
60
+ Gem::Specification.new do |s|
61
+ s.name = '#{PKG_NAME}'
62
+ s.version = '#{PKG_VERSION}'
63
+ s.summary = "Approximate String Matching library"
64
+ s.description = <<EOF
65
+ Amatch is a library for approximate string matching and searching in strings.
66
+ Several algorithms can be used to do this, and it's also possible to compute a
67
+ similarity metric number between 0.0 and 1.0 for two given strings.
68
+ EOF
69
+
70
+ s.files = #{PKG_FILES.sort.inspect}
71
+
72
+ s.extensions << "ext/extconf.rb"
73
+
74
+ s.require_paths << 'ext' << 'lib'
75
+
76
+ s.bindir = "bin"
77
+ s.executables = ["agrep.rb"]
78
+ s.default_executable = "agrep.rb"
79
+
80
+ s.has_rdoc = true
81
+ s.extra_rdoc_files.concat #{PKG_DOC_FILES.sort.inspect}
82
+ s.rdoc_options << '--main' << 'README' <<
83
+ '--title' << "#{PKG_NAME} - Approximate Matching"
84
+ s.test_files.concat Dir['tests/test_*.rb']
85
+
86
+ s.author = "Florian Frank"
87
+ s.email = "flori@ping.de"
88
+ s.homepage = "http://amatch.rubyforge.org"
89
+ s.rubyforge_project = '#{PKG_NAME}'
90
+ end
91
+ GEM
92
+
93
+ desc 'Create a gemspec file'
94
+ task :gemspec do
95
+ File.open("#{PKG_NAME}.gemspec", 'w') do |f|
96
+ f.puts spec_src
97
+ end
98
+ end
99
+
100
+ spec = eval(spec_src)
101
+ Rake::GemPackageTask.new(spec) do |pkg|
102
+ pkg.need_tar = true
103
+ pkg.package_files = PKG_FILES
104
+ end
105
+
106
+ Rake::ExtensionTask.new do |ext|
107
+ ext.name = PKG_NAME
108
+ ext.gem_spec = spec
109
+ ext.cross_compile = true
110
+ ext.cross_platform = 'i386-mswin32'
111
+ ext.ext_dir = 'ext'
112
+ ext.lib_dir = 'lib'
113
+ end
114
+ end
115
+
116
+ desc m = "Writing version information for #{PKG_VERSION}"
117
+ task :version do
118
+ puts m
119
+ File.open(File.join('lib', 'amatch', 'version.rb'), 'w') do |v|
120
+ v.puts <<EOT
121
+ module Amatch
122
+ # Amatch version
123
+ VERSION = '#{PKG_VERSION}'
124
+ VERSION_ARRAY = VERSION.split(/\\./).map { |x| x.to_i } # :nodoc:
125
+ VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
126
+ VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
127
+ VERSION_BUILD = VERSION_ARRAY[2] # :nodoc:
128
+ end
129
+ EOT
130
+ end
131
+ end
132
+
133
+
134
+ desc "Default task"
135
+ task :default => [ :version, :gemspec, :test ]
136
+
137
+ desc "Build all gems and archives for a new release."
138
+ task :release => [ :clean, :version, :gemspec, :cross, :native, :gem ] do
139
+ system "#$0 clean native gem"
140
+ system "#$0 clean package"
141
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.5
@@ -0,0 +1,31 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'amatch'
3
+ s.version = '0.2.5'
4
+ s.summary = "Approximate String Matching library"
5
+ s.description = <<EOF
6
+ Amatch is a library for approximate string matching and searching in strings.
7
+ Several algorithms can be used to do this, and it's also possible to compute a
8
+ similarity metric number between 0.0 and 1.0 for two given strings.
9
+ EOF
10
+
11
+ s.files = ["CHANGES", "COPYING", "README", "Rakefile", "VERSION", "amatch.gemspec", "bin", "bin/agrep.rb", "ext", "ext/amatch.c", "ext/common.h", "ext/extconf.rb", "ext/pair.c", "ext/pair.h", "install.rb", "lib", "lib/amatch", "lib/amatch/version.rb", "tests", "tests/test_hamming.rb", "tests/test_jaro.rb", "tests/test_jaro_winkler.rb", "tests/test_levenshtein.rb", "tests/test_longest_subsequence.rb", "tests/test_longest_substring.rb", "tests/test_pair_distance.rb", "tests/test_sellers.rb"]
12
+
13
+ s.extensions << "ext/extconf.rb"
14
+
15
+ s.require_paths << 'ext' << 'lib'
16
+
17
+ s.bindir = "bin"
18
+ s.executables = ["agrep.rb"]
19
+ s.default_executable = "agrep.rb"
20
+
21
+ s.has_rdoc = true
22
+ s.extra_rdoc_files.concat ["README", "ext/amatch.c", "lib/amatch/version.rb"]
23
+ s.rdoc_options << '--main' << 'README' <<
24
+ '--title' << "amatch - Approximate Matching"
25
+ s.test_files.concat Dir['tests/test_*.rb']
26
+
27
+ s.author = "Florian Frank"
28
+ s.email = "flori@ping.de"
29
+ s.homepage = "http://amatch.rubyforge.org"
30
+ s.rubyforge_project = 'amatch'
31
+ end
@@ -0,0 +1,79 @@
1
+ #! /usr/bin/env ruby
2
+
3
+ require 'amatch'
4
+ require 'getoptlong'
5
+
6
+ def usage(msg, options)
7
+ puts msg, "Usage: #{File.basename($0)} [OPTIONS] PATTERN [FILE ...]", ""
8
+ options.each do |o|
9
+ puts " " + o[1] + ", " + o[0] + " " +
10
+ (o[2] == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : '')
11
+ end
12
+ puts "\nReport bugs to <flori@ping.de>."
13
+ exit 0
14
+ end
15
+
16
+ class Amatch::Levenshtein
17
+ def search_relative(strings)
18
+ search(strings).to_f / pattern.size
19
+ end
20
+ end
21
+
22
+ $distance = 1
23
+ $mode = :search
24
+ begin
25
+ parser = GetoptLong.new
26
+ options = [
27
+ [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
28
+ [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
29
+ [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
30
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
31
+ ]
32
+ parser.set_options(*options)
33
+ parser.each_option do |name, arg|
34
+ name = name.sub(/^--/, '')
35
+ case name
36
+ when 'distance'
37
+ $distance = arg.to_f
38
+ when 'relative'
39
+ $mode = :search_relative
40
+ when 'verbose'
41
+ $verbose = 1
42
+ when 'help'
43
+ usage('You\'ve asked for it!', options)
44
+ end
45
+ end
46
+ rescue
47
+ exit 1
48
+ end
49
+ pattern = ARGV.shift or usage('Pattern needed!', options)
50
+
51
+ matcher = Amatch::Levenshtein.new(pattern)
52
+ size = 0
53
+ start = Time.new
54
+ if ARGV.size > 0 then
55
+ ARGV.each do |filename|
56
+ File.stat(filename).file? or next
57
+ size += File.size(filename)
58
+ begin
59
+ File.open(filename, 'r').each_line do |line|
60
+ if matcher.__send__($mode, line) <= $distance
61
+ puts "#{filename}:#{line}"
62
+ end
63
+ end
64
+ rescue
65
+ STDERR.puts "Failure at #{filename}: #{$!} => Skipping!"
66
+ end
67
+ end
68
+ else
69
+ STDIN.each_line do |line|
70
+ size += line.size
71
+ if matcher.__send__($mode, line) <= $distance
72
+ puts line
73
+ end
74
+ end
75
+ end
76
+ time = Time.new - start
77
+ $verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
78
+ time, size / time / 1024
79
+ exit 0
@@ -0,0 +1,1641 @@
1
+ #include "ruby.h"
2
+ #include "pair.h"
3
+ #include <ctype.h>
4
+ #include "common.h"
5
+
6
+ /*
7
+ * Document-method: pattern
8
+ *
9
+ * call-seq: pattern -> pattern string
10
+ *
11
+ * Returns the current pattern string of this instance.
12
+ */
13
+
14
+ /*
15
+ * Document-method: pattern=
16
+ *
17
+ * call-seq: pattern=(pattern)
18
+ *
19
+ * Sets the current pattern string of this instance to <code>pattern</code>.
20
+ */
21
+
22
+
23
+ static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
24
+ rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
25
+ rb_cJaro, rb_cJaroWinkler;
26
+
27
+ static ID id_split, id_to_f;
28
+
29
+ #define GET_STRUCT(klass) \
30
+ klass *amatch; \
31
+ Data_Get_Struct(self, klass, amatch);
32
+
33
+ #define DEF_ALLOCATOR(type) \
34
+ static type *type##_allocate() \
35
+ { \
36
+ type *obj = ALLOC(type); \
37
+ MEMZERO(obj, type, 1); \
38
+ return obj; \
39
+ }
40
+
41
+ #define DEF_CONSTRUCTOR(klass, type) \
42
+ static VALUE rb_##klass##_s_allocate(VALUE klass2) \
43
+ { \
44
+ type *amatch = type##_allocate(); \
45
+ return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch); \
46
+ } \
47
+ VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
48
+ { \
49
+ VALUE obj = rb_##klass##_s_allocate(klass2); \
50
+ rb_##klass##_initialize(obj, pattern); \
51
+ return obj; \
52
+ }
53
+
54
+ #define DEF_RB_FREE(klass, type) \
55
+ static void rb_##klass##_free(type *amatch) \
56
+ { \
57
+ MEMZERO(amatch->pattern, char, amatch->pattern_len); \
58
+ free(amatch->pattern); \
59
+ MEMZERO(amatch, type, 1); \
60
+ free(amatch); \
61
+ }
62
+
63
+ #define DEF_PATTERN_ACCESSOR(type) \
64
+ static void type##_pattern_set(type *amatch, VALUE pattern) \
65
+ { \
66
+ Check_Type(pattern, T_STRING); \
67
+ free(amatch->pattern); \
68
+ amatch->pattern_len = RSTRING_LEN(pattern); \
69
+ amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
70
+ MEMCPY(amatch->pattern, RSTRING_PTR(pattern), char, \
71
+ RSTRING_LEN(pattern)); \
72
+ } \
73
+ static VALUE rb_##type##_pattern(VALUE self) \
74
+ { \
75
+ GET_STRUCT(type) \
76
+ return rb_str_new(amatch->pattern, amatch->pattern_len); \
77
+ } \
78
+ static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern) \
79
+ { \
80
+ GET_STRUCT(type) \
81
+ type##_pattern_set(amatch, pattern); \
82
+ return Qnil; \
83
+ }
84
+
85
+ #define DEF_ITERATE_STRINGS(type) \
86
+ static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
87
+ VALUE (*match_function) (type *amatch, VALUE strings)) \
88
+ { \
89
+ if (TYPE(strings) == T_STRING) { \
90
+ return match_function(amatch, strings); \
91
+ } else { \
92
+ Check_Type(strings, T_ARRAY); \
93
+ int i; \
94
+ VALUE result = rb_ary_new2(RARRAY_LEN(strings)); \
95
+ for (i = 0; i < RARRAY_LEN(strings); i++) { \
96
+ VALUE string = rb_ary_entry(strings, i); \
97
+ if (TYPE(string) != T_STRING) { \
98
+ rb_raise(rb_eTypeError, \
99
+ "array has to contain only strings (%s given)", \
100
+ NIL_P(string) ? \
101
+ "NilClass" : \
102
+ rb_class2name(CLASS_OF(string))); \
103
+ } \
104
+ rb_ary_push(result, match_function(amatch, string)); \
105
+ } \
106
+ return result; \
107
+ } \
108
+ }
109
+
110
+ #define DEF_RB_READER(type, function, name, converter) \
111
+ VALUE function(VALUE self) \
112
+ { \
113
+ GET_STRUCT(type) \
114
+ return converter(amatch->name); \
115
+ }
116
+
117
+ #define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
118
+ VALUE function(VALUE self, VALUE value) \
119
+ { \
120
+ vtype value_ ## vtype; \
121
+ GET_STRUCT(type) \
122
+ caster(value); \
123
+ value_ ## vtype = converter(value); \
124
+ if (!(value_ ## vtype check)) \
125
+ rb_raise(rb_eTypeError, "check of value " #check " failed"); \
126
+ amatch->name = value_ ## vtype; \
127
+ return Qnil; \
128
+ }
129
+
130
+
131
+ #define CAST2FLOAT(obj) \
132
+ if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f)) \
133
+ obj = rb_funcall(obj, id_to_f, 0, 0); \
134
+ else \
135
+ Check_Type(obj, T_FLOAT)
136
+ #define FLOAT2C(obj) (RFLOAT_VALUE(obj))
137
+
138
+ #define CAST2BOOL(obj) \
139
+ if (obj == Qfalse || obj == Qnil) \
140
+ obj = Qfalse; \
141
+ else \
142
+ obj = Qtrue;
143
+ #define BOOL2C(obj) (obj == Qtrue)
144
+ #define C2BOOL(obj) (obj ? Qtrue : Qfalse)
145
+
146
+ #define OPTIMIZE_TIME \
147
+ if (amatch->pattern_len < RSTRING_LEN(string)) { \
148
+ a_ptr = amatch->pattern; \
149
+ a_len = amatch->pattern_len; \
150
+ b_ptr = RSTRING_PTR(string); \
151
+ b_len = RSTRING_LEN(string); \
152
+ } else { \
153
+ a_ptr = RSTRING_PTR(string); \
154
+ a_len = RSTRING_LEN(string); \
155
+ b_ptr = amatch->pattern; \
156
+ b_len = amatch->pattern_len; \
157
+ }
158
+
159
+ #define DONT_OPTIMIZE \
160
+ a_ptr = amatch->pattern; \
161
+ a_len = amatch->pattern_len; \
162
+ b_ptr = RSTRING_PTR(string); \
163
+ b_len = RSTRING_LEN(string); \
164
+
165
+ /*
166
+ * C structures of the Amatch classes
167
+ */
168
+
169
+ typedef struct GeneralStruct {
170
+ char *pattern;
171
+ int pattern_len;
172
+ } General;
173
+
174
+ DEF_ALLOCATOR(General)
175
+ DEF_PATTERN_ACCESSOR(General)
176
+ DEF_ITERATE_STRINGS(General)
177
+
178
+ typedef struct SellersStruct {
179
+ char *pattern;
180
+ int pattern_len;
181
+ double substitution;
182
+ double deletion;
183
+ double insertion;
184
+ } Sellers;
185
+
186
+ DEF_ALLOCATOR(Sellers)
187
+ DEF_PATTERN_ACCESSOR(Sellers)
188
+ DEF_ITERATE_STRINGS(Sellers)
189
+
190
+ static void Sellers_reset_weights(Sellers *self)
191
+ {
192
+ self->substitution = 1.0;
193
+ self->deletion = 1.0;
194
+ self->insertion = 1.0;
195
+ }
196
+
197
+ typedef struct PairDistanceStruct {
198
+ char *pattern;
199
+ int pattern_len;
200
+ PairArray *pattern_pair_array;
201
+ } PairDistance;
202
+
203
+ DEF_ALLOCATOR(PairDistance)
204
+ DEF_PATTERN_ACCESSOR(PairDistance)
205
+
206
+ typedef struct JaroStruct {
207
+ char *pattern;
208
+ int pattern_len;
209
+ int ignore_case;
210
+ } Jaro;
211
+
212
+ DEF_ALLOCATOR(Jaro)
213
+ DEF_PATTERN_ACCESSOR(Jaro)
214
+ DEF_ITERATE_STRINGS(Jaro)
215
+
216
+ typedef struct JaroWinklerStruct {
217
+ char *pattern;
218
+ int pattern_len;
219
+ int ignore_case;
220
+ float scaling_factor;
221
+ } JaroWinkler;
222
+
223
+ DEF_ALLOCATOR(JaroWinkler)
224
+ DEF_PATTERN_ACCESSOR(JaroWinkler)
225
+ DEF_ITERATE_STRINGS(JaroWinkler)
226
+
227
+ /*
228
+ * Levenshtein edit distances are computed here:
229
+ */
230
+
231
+ #define COMPUTE_LEVENSHTEIN_DISTANCE \
232
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
233
+ c = i % 2; /* current row */ \
234
+ p = (i + 1) % 2; /* previous row */ \
235
+ v[c][0] = i; /* first column */ \
236
+ for (j = 1; j <= b_len; j++) { \
237
+ /* Bellman's principle of optimality: */ \
238
+ weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
239
+ if (weight > v[p][j] + 1) { \
240
+ weight = v[p][j] + 1; \
241
+ } \
242
+ if (weight > v[c][j - 1] + 1) { \
243
+ weight = v[c][j - 1] + 1; \
244
+ } \
245
+ v[c][j] = weight; \
246
+ } \
247
+ p = c; \
248
+ c = (c + 1) % 2; \
249
+ }
250
+
251
+ static VALUE Levenshtein_match(General *amatch, VALUE string)
252
+ {
253
+ VALUE result;
254
+ char *a_ptr, *b_ptr;
255
+ int a_len, b_len;
256
+ int *v[2], weight;
257
+ int i, j, c, p;
258
+
259
+ Check_Type(string, T_STRING);
260
+ DONT_OPTIMIZE
261
+
262
+ v[0] = ALLOC_N(int, b_len + 1);
263
+ v[1] = ALLOC_N(int, b_len + 1);
264
+ for (i = 0; i <= b_len; i++) {
265
+ v[0][i] = i;
266
+ v[1][i] = i;
267
+ }
268
+
269
+ COMPUTE_LEVENSHTEIN_DISTANCE
270
+
271
+ result = INT2FIX(v[p][b_len]);
272
+
273
+ free(v[0]);
274
+ free(v[1]);
275
+
276
+ return result;
277
+ }
278
+
279
+ static VALUE Levenshtein_similar(General *amatch, VALUE string)
280
+ {
281
+ VALUE result;
282
+ char *a_ptr, *b_ptr;
283
+ int a_len, b_len;
284
+ int *v[2], weight;
285
+ int i, j, c, p;
286
+
287
+ Check_Type(string, T_STRING);
288
+ DONT_OPTIMIZE
289
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
290
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
291
+ v[0] = ALLOC_N(int, b_len + 1);
292
+ v[1] = ALLOC_N(int, b_len + 1);
293
+ for (i = 0; i <= b_len; i++) {
294
+ v[0][i] = i;
295
+ v[1][i] = i;
296
+ }
297
+
298
+ COMPUTE_LEVENSHTEIN_DISTANCE
299
+
300
+ if (b_len > a_len) {
301
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
302
+ } else {
303
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
304
+ }
305
+ free(v[0]);
306
+ free(v[1]);
307
+ return result;
308
+ }
309
+
310
+ static VALUE Levenshtein_search(General *amatch, VALUE string)
311
+ {
312
+ VALUE result;
313
+ char *a_ptr, *b_ptr;
314
+ int a_len, b_len;
315
+ int *v[2], weight, min;
316
+ int i, j, c, p;
317
+
318
+ Check_Type(string, T_STRING);
319
+ DONT_OPTIMIZE
320
+
321
+ v[0] = ALLOC_N(int, b_len + 1);
322
+ v[1] = ALLOC_N(int, b_len + 1);
323
+ MEMZERO(v[0], int, b_len + 1);
324
+ MEMZERO(v[1], int, b_len + 1);
325
+
326
+ COMPUTE_LEVENSHTEIN_DISTANCE
327
+
328
+ for (i = 0, min = a_len; i <= b_len; i++) {
329
+ if (v[p][i] < min) min = v[p][i];
330
+ }
331
+
332
+ result = INT2FIX(min);
333
+
334
+ free(v[0]);
335
+ free(v[1]);
336
+
337
+ return result;
338
+ }
339
+
340
+
341
+ /*
342
+ * Sellers edit distances are computed here:
343
+ */
344
+
345
+ #define COMPUTE_SELLERS_DISTANCE \
346
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
347
+ c = i % 2; /* current row */ \
348
+ p = (i + 1) % 2; /* previous row */ \
349
+ v[c][0] = i * amatch->deletion; /* first column */ \
350
+ for (j = 1; j <= b_len; j++) { \
351
+ /* Bellman's principle of optimality: */ \
352
+ weight = v[p][j - 1] + \
353
+ (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution); \
354
+ if (weight > v[p][j] + amatch->insertion) { \
355
+ weight = v[p][j] + amatch->insertion; \
356
+ } \
357
+ if (weight > v[c][j - 1] + amatch->deletion) { \
358
+ weight = v[c][j - 1] + amatch->deletion; \
359
+ } \
360
+ v[c][j] = weight; \
361
+ } \
362
+ p = c; \
363
+ c = (c + 1) % 2; \
364
+ }
365
+
366
+ static VALUE Sellers_match(Sellers *amatch, VALUE string)
367
+ {
368
+ VALUE result;
369
+ char *a_ptr, *b_ptr;
370
+ int a_len, b_len;
371
+ double *v[2], weight;
372
+ int i, j, c, p;
373
+
374
+ Check_Type(string, T_STRING);
375
+ DONT_OPTIMIZE
376
+
377
+ v[0] = ALLOC_N(double, b_len + 1);
378
+ v[1] = ALLOC_N(double, b_len + 1);
379
+ for (i = 0; i <= b_len; i++) {
380
+ v[0][i] = i * amatch->deletion;
381
+ v[1][i] = i * amatch->deletion;
382
+ }
383
+
384
+ COMPUTE_SELLERS_DISTANCE
385
+
386
+ result = rb_float_new(v[p][b_len]);
387
+ free(v[0]);
388
+ free(v[1]);
389
+ return result;
390
+ }
391
+
392
+ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
393
+ {
394
+ VALUE result;
395
+ char *a_ptr, *b_ptr;
396
+ int a_len, b_len;
397
+ double *v[2], weight, max_weight;
398
+ int i, j, c, p;
399
+
400
+ if (amatch->insertion >= amatch->deletion) {
401
+ if (amatch->substitution >= amatch->insertion) {
402
+ max_weight = amatch->substitution;
403
+ } else {
404
+ max_weight = amatch->insertion;
405
+ }
406
+ } else {
407
+ if (amatch->substitution >= amatch->deletion) {
408
+ max_weight = amatch->substitution;
409
+ } else {
410
+ max_weight = amatch->deletion;
411
+ }
412
+ }
413
+
414
+ Check_Type(string, T_STRING);
415
+ DONT_OPTIMIZE
416
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
417
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
418
+ v[0] = ALLOC_N(double, b_len + 1);
419
+ v[1] = ALLOC_N(double, b_len + 1);
420
+ for (i = 0; i <= b_len; i++) {
421
+ v[0][i] = i * amatch->deletion;
422
+ v[1][i] = i * amatch->deletion;
423
+ }
424
+
425
+ COMPUTE_SELLERS_DISTANCE
426
+
427
+ if (b_len > a_len) {
428
+ result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
429
+ } else {
430
+ result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
431
+ }
432
+ free(v[0]);
433
+ free(v[1]);
434
+ return result;
435
+ }
436
+
437
+ static VALUE Sellers_search(Sellers *amatch, VALUE string)
438
+ {
439
+ VALUE result;
440
+ char *a_ptr, *b_ptr;
441
+ int a_len, b_len;
442
+ double *v[2], weight, min;
443
+ int i, j, c, p;
444
+ /* Returns, as a Float, the smallest value in the last row of the weighted table. */
445
+ Check_Type(string, T_STRING);
446
+ DONT_OPTIMIZE
447
+ /* Both rows start zeroed (not i * deletion as in Sellers_match). */
448
+ v[0] = ALLOC_N(double, b_len + 1);
449
+ v[1] = ALLOC_N(double, b_len + 1);
450
+ MEMZERO(v[0], double, b_len + 1);
451
+ MEMZERO(v[1], double, b_len + 1);
452
+
453
+ COMPUTE_SELLERS_DISTANCE
454
+ /* Start from v[p][0] rather than a_len: a_len is only a valid bound for unit weights. */
455
+ for (i = 1, min = v[p][0]; i <= b_len; i++) {
456
+ if (v[p][i] < min) min = v[p][i];
457
+ }
458
+ result = rb_float_new(min);
459
+ free(v[0]);
460
+ free(v[1]);
461
+
462
+ return result;
463
+ }
464
+
465
+ /*
466
+ * Pair distances are computed here:
467
+ */
468
+
469
+ static VALUE PairDistance_match(
470
+ PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
471
+ {
472
+ double result;
473
+ VALUE tokens;
474
+ PairArray *pair_array;
475
+
476
+ Check_Type(string, T_STRING);
477
+ if (!NIL_P(regexp) || use_regexp) {
478
+ tokens = rb_funcall(
479
+ rb_str_new(amatch->pattern, amatch->pattern_len),
480
+ id_split, 1, regexp
481
+ );
482
+ if (!amatch->pattern_pair_array) {
483
+ amatch->pattern_pair_array = PairArray_new(tokens);
484
+ } else {
485
+ pair_array_reactivate(amatch->pattern_pair_array);
486
+ }
487
+ tokens = rb_funcall(string, id_split, 1, regexp);
488
+ pair_array = PairArray_new(tokens);
489
+ } else {
490
+ VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
491
+ tokens = rb_ary_new4(1, &tmp);
492
+ if (!amatch->pattern_pair_array) {
493
+ amatch->pattern_pair_array = PairArray_new(tokens);
494
+ } else {
495
+ pair_array_reactivate(amatch->pattern_pair_array);
496
+ }
497
+ tokens = rb_ary_new4(1, &string);
498
+ pair_array = PairArray_new(tokens);
499
+ }
500
+ result = pair_array_match(amatch->pattern_pair_array, pair_array);
501
+ pair_array_destroy(pair_array);
502
+ return rb_float_new(result);
503
+ }
504
+
505
+ /*
506
+ * Hamming distances are computed here:
507
+ */
508
+
509
+ #define COMPUTE_HAMMING_DISTANCE \
510
+ for (i = 0, result = b_len - a_len; i < a_len; i++) { \
511
+ if (i >= b_len) { \
512
+ result += a_len - b_len; \
513
+ break; \
514
+ } \
515
+ if (b_ptr[i] != a_ptr[i]) result++; \
516
+ }
517
+
518
+ static VALUE Hamming_match(General *amatch, VALUE string)
519
+ {
520
+ char *a_ptr, *b_ptr;
521
+ int a_len, b_len;
522
+ int i, result;
523
+
524
+ Check_Type(string, T_STRING);
525
+ OPTIMIZE_TIME
526
+ COMPUTE_HAMMING_DISTANCE
527
+ return INT2FIX(result);
528
+ }
529
+
530
+ static VALUE Hamming_similar(General *amatch, VALUE string)
531
+ {
532
+ char *a_ptr, *b_ptr;
533
+ int a_len, b_len;
534
+ int i, result;
535
+
536
+ Check_Type(string, T_STRING);
537
+ OPTIMIZE_TIME
538
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
539
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
540
+ COMPUTE_HAMMING_DISTANCE
541
+ return rb_float_new(1.0 - ((double) result) / b_len);
542
+ }
543
+
544
+ /*
545
+ * Longest Common Subsequence computation
546
+ */
547
+
548
+ #define COMPUTE_LONGEST_SUBSEQUENCE \
549
+ l[0] = ALLOC_N(int, b_len + 1); \
550
+ l[1] = ALLOC_N(int, b_len + 1); \
551
+ for (i = a_len, c = 0, p = 1; i >= 0; i--) { \
552
+ for (j = b_len; j >= 0; j--) { \
553
+ if (i == a_len || j == b_len) { \
554
+ l[c][j] = 0; \
555
+ } else if (a_ptr[i] == b_ptr[j]) { \
556
+ l[c][j] = 1 + l[p][j + 1]; \
557
+ } else { \
558
+ int x = l[p][j], y = l[c][j + 1]; \
559
+ if (x > y) l[c][j] = x; else l[c][j] = y; \
560
+ } \
561
+ } \
562
+ p = c; \
563
+ c = (c + 1) % 2; \
564
+ } \
565
+ result = l[p][0]; \
566
+ free(l[0]); \
567
+ free(l[1]);
568
+
569
+
570
+ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
571
+ {
572
+ char *a_ptr, *b_ptr;
573
+ int a_len, b_len;
574
+ int result, c, p, i, j, *l[2];
575
+
576
+ Check_Type(string, T_STRING);
577
+ OPTIMIZE_TIME
578
+
579
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
580
+ COMPUTE_LONGEST_SUBSEQUENCE
581
+ return INT2FIX(result);
582
+ }
583
+
584
+ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
585
+ {
586
+ char *a_ptr, *b_ptr;
587
+ int a_len, b_len;
588
+ int result, c, p, i, j, *l[2];
589
+
590
+ Check_Type(string, T_STRING);
591
+ OPTIMIZE_TIME
592
+
593
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
594
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
595
+ COMPUTE_LONGEST_SUBSEQUENCE
596
+ return rb_float_new(((double) result) / b_len);
597
+ }
598
+
599
+ /*
600
+ * Longest Common Substring computation
601
+ */
602
+
603
+ #define COMPUTE_LONGEST_SUBSTRING \
604
+ l[0] = ALLOC_N(int, b_len); \
605
+ MEMZERO(l[0], int, b_len); \
606
+ l[1] = ALLOC_N(int, b_len); \
607
+ MEMZERO(l[1], int, b_len); \
608
+ result = 0; \
609
+ for (i = 0, c = 0, p = 1; i < a_len; i++) { \
610
+ for (j = 0; j < b_len; j++) { \
611
+ if (a_ptr[i] == b_ptr[j]) { \
612
+ l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1]; \
613
+ if (l[c][j] > result) result = l[c][j]; \
614
+ } else { \
615
+ l[c][j] = 0; \
616
+ } \
617
+ } \
618
+ p = c; \
619
+ c = (c + 1) % 2; \
620
+ } \
621
+ free(l[0]); \
622
+ free(l[1]);
623
+
624
+ static VALUE LongestSubstring_match(General *amatch, VALUE string)
625
+ {
626
+ char *a_ptr, *b_ptr;
627
+ int a_len, b_len;
628
+ int result, c, p, i, j, *l[2];
629
+
630
+ Check_Type(string, T_STRING);
631
+ OPTIMIZE_TIME
632
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
633
+ COMPUTE_LONGEST_SUBSTRING
634
+ return INT2FIX(result);
635
+ }
636
+
637
+ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
638
+ {
639
+ char *a_ptr, *b_ptr;
640
+ int a_len, b_len;
641
+ int result, c, p, i, j, *l[2];
642
+
643
+ Check_Type(string, T_STRING);
644
+ OPTIMIZE_TIME
645
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
646
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
647
+ COMPUTE_LONGEST_SUBSTRING
648
+ return rb_float_new(((double) result) / b_len);
649
+ }
650
+
651
+ /*
652
+ * Jaro computation
653
+ */
654
+
655
+ #define COMPUTE_JARO \
656
+ l[0] = ALLOC_N(int, a_len); /* l[0][i] != 0: a_ptr[i] has been matched */ \
657
+ MEMZERO(l[0], int, a_len); \
658
+ l[1] = ALLOC_N(int, b_len); /* l[1][j] != 0: b_ptr[j] has been matched */ \
659
+ MEMZERO(l[1], int, b_len); \
660
+ max_dist = ((a_len > b_len ? a_len : b_len) / 2) - 1; /* NOTE(review): -1 if longer string < 2 chars */ \
661
+ m = 0; \
662
+ for (i = 0; i < a_len; i++) { \
663
+ low = (i > max_dist ? i - max_dist : 0); \
664
+ high = (i + max_dist < b_len ? i + max_dist : b_len - 1); /* last valid index of b */ \
665
+ for (j = low; j <= high; j++) { \
666
+ if (!l[1][j] && a_ptr[i] == b_ptr[j]) { \
667
+ l[0][i] = l[1][j] = 1; \
668
+ m++; \
669
+ break; \
670
+ } \
671
+ } \
672
+ } \
673
+ result = 0.0; /* stays 0.0 when nothing matched */ \
674
+ if (m > 0) { \
675
+ k = t = 0; \
676
+ for (i = 0; i < a_len; i++) { \
677
+ if (l[0][i]) { \
678
+ for (j = k; j < b_len; j++) { /* next matched character of b */ \
679
+ if (l[1][j]) { \
680
+ k = j + 1; \
681
+ break; \
682
+ } \
683
+ } \
684
+ if (a_ptr[i] != b_ptr[j]) { \
685
+ t++; \
686
+ } \
687
+ } \
688
+ } \
689
+ t = t / 2; /* out-of-order matches are counted twice above */ \
690
+ result = (((double)m)/a_len + ((double)m)/b_len + ((double)(m-t))/m)/3.0; \
691
+ } \
692
+ xfree(l[0]); /* release the flag arrays on every path */ \
693
+ xfree(l[1]);
694
+
695
+ #define LOWERCASE_STRINGS \
696
+ char *ying = ALLOC_N(char, a_len); \
697
+ MEMCPY(ying, a_ptr, char, a_len); \
698
+ a_ptr = ying; \
699
+ char *yang = ALLOC_N(char, b_len); \
700
+ MEMCPY(yang, b_ptr, char, b_len); \
701
+ b_ptr = yang; \
702
+ for (i = 0; i < a_len; i++) { \
703
+ if (islower(a_ptr[i])) a_ptr[i] = toupper(a_ptr[i]); \
704
+ } \
705
+ for (i = 0; i < b_len; i++) { \
706
+ if (islower(b_ptr[i])) b_ptr[i] = toupper(b_ptr[i]); \
707
+ }
708
+
709
+ #define FREE_STRINGS \
710
+ xfree(a_ptr); \
711
+ xfree(b_ptr);
712
+
713
+ static VALUE Jaro_match(Jaro *amatch, VALUE string)
714
+ {
715
+ char *a_ptr, *b_ptr;
716
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high;
717
+ int *l[2];
718
+ double result;
719
+
720
+ Check_Type(string, T_STRING);
721
+ OPTIMIZE_TIME
722
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
723
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
724
+ if (amatch->ignore_case) {
725
+ LOWERCASE_STRINGS
726
+ }
727
+ COMPUTE_JARO
728
+ if (amatch->ignore_case) {
729
+ FREE_STRINGS
730
+ }
731
+ return rb_float_new(result);
732
+ }
733
+
734
+ /*
735
+ * Jaro-Winkler computation
736
+ */
737
+
738
+ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
739
+ {
740
+ char *a_ptr, *b_ptr;
741
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high, n;
742
+ int *l[2];
743
+ double result;
744
+
745
+ Check_Type(string, T_STRING);
746
+ OPTIMIZE_TIME
747
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
748
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
749
+ if (amatch->ignore_case) {
750
+ LOWERCASE_STRINGS
751
+ }
752
+ COMPUTE_JARO
753
+ n = 0;
754
+ for (i = 0; i < (a_len >= 4 ? 4 : a_len); i++) {
755
+ if (a_ptr[i] == b_ptr[i]) {
756
+ n++;
757
+ } else {
758
+ break;
759
+ }
760
+ }
761
+ result = result + n*amatch->scaling_factor*(1-result);
762
+ if (amatch->ignore_case) {
763
+ FREE_STRINGS
764
+ }
765
+ return rb_float_new(result);
766
+ }
767
+
768
+ /*
769
+ * Ruby API
770
+ */
771
+
772
+ /*
773
+ * Document-class: Amatch::Levenshtein
774
+ *
775
+ * The Levenshtein edit distance is defined as the minimal costs involved to
776
+ * transform one string into another by using three elementary operations:
777
+ * deletion, insertion and substitution of a character. To transform "water"
778
+ * into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
779
+ * -> "n": "winer" and delete "r": "wine". The edit distance between "water"
780
+ * and "wine" is 3, because you have to apply three operations. The edit
781
+ * distance between "wine" and "wine" is 0 of course: no operation is
782
+ * necessary for the transformation -- they're already the same string. It's
783
+ * easy to see that more similar strings have smaller edit distances than
784
+ * strings that differ a lot.
785
+ */
786
+
787
+ DEF_RB_FREE(Levenshtein, General)
788
+
789
+ /*
790
+ * call-seq: new(pattern)
791
+ *
792
+ * Creates a new Amatch::Levenshtein instance from <code>pattern</code>.
793
+ */
794
+ static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern)
795
+ {
796
+ GET_STRUCT(General)
797
+ General_pattern_set(amatch, pattern);
798
+ return self;
799
+ }
800
+
801
+ DEF_CONSTRUCTOR(Levenshtein, General)
802
+
803
+ /*
804
+ * call-seq: match(strings) -> results
805
+ *
806
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
807
+ * against <code>strings</code>. It returns the number of operations, the
808
+ * Levenshtein distance. <code>strings</code> has to be either a String or an Array
809
+ * of Strings. The returned <code>results</code> are either a Fixnum or an Array
810
+ * of Fixnums respectively.
811
+ */
812
+ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
813
+ {
814
+ GET_STRUCT(General)
815
+ return General_iterate_strings(amatch, strings, Levenshtein_match);
816
+ }
817
+
818
+ /*
819
+ * call-seq: similar(strings) -> results
820
+ *
821
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
822
+ * against <code>strings</code>, and computes a Levenshtein distance metric
823
+ * number between 0.0 for very dissimilar strings and 1.0 for an exact match.
824
+ * <code>strings</code> has to be either a String or an Array of Strings. The
825
+ * returned <code>results</code> are either a Float or an Array of Floats
826
+ * respectively.
827
+ */
828
+ static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
829
+ {
830
+ GET_STRUCT(General)
831
+ return General_iterate_strings(amatch, strings, Levenshtein_similar);
832
+ }
833
+
834
+ /*
835
+ * call-seq: levenshtein_similar(strings) -> results
836
+ *
837
+ * If called on a String, this string is used as an Amatch::Levenshtein#pattern
838
+ * to match against <code>strings</code>. It returns a Levenshtein distance
839
+ * metric number between 0.0 for very dissimilar strings and 1.0 for an exact
840
+ * match. <code>strings</code> has to be either a String or an Array of
841
+ * Strings. The returned <code>results</code> are either a Float or an Array of
842
+ * Floats respectively.
843
+ */
844
+ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
845
+ {
846
+ VALUE amatch = rb_Levenshtein_new(rb_cLevenshtein, self); /* the receiving String becomes the pattern of a matcher used only for this call; rb_Levenshtein_new presumably comes from DEF_CONSTRUCTOR -- confirm */
847
+ return rb_Levenshtein_similar(amatch, strings); /* same code path as Amatch::Levenshtein#similar */
848
+ }
849
+
850
+ /*
851
+ * call-seq: search(strings) -> results
852
+ *
853
+ * Searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
854
+ * edit distance (the sum of character operations) as a Fixnum value, by greedy
855
+ * trimming prefixes or postfixes of the match. <code>strings</code> has
856
+ * to be either a String or an Array of Strings. The returned
857
+ * <code>results</code> are either a Fixnum or an Array of Fixnums respectively.
858
+ */
859
+ static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
860
+ {
861
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
862
+ return General_iterate_strings(amatch, strings, Levenshtein_search); /* NOTE(review): the result type comes from Levenshtein_search, defined outside this section -- confirm it yields Fixnums */
863
+ }
864
+
865
+ /*
866
+ * Document-class: Amatch::Sellers
867
+ *
868
+ * The Sellers edit distance is very similar to the Levenshtein edit distance.
869
+ * The difference is, that you can also specify different weights for every
870
+ * operation to prefer special operations over others. This extension of the
871
+ * Levenshtein edit distance is also known as the Needleman-Wunsch
872
+ * distance.
873
+ */
874
+
875
+ DEF_RB_FREE(Sellers, Sellers)
876
+
877
+ /*
878
+ * Document-method: substitution
879
+ *
880
+ * call-seq: substitution -> weight
881
+ *
882
+ * Returns the weight of the substitution operation, that is used to compute
883
+ * the Sellers distance.
884
+ */
885
+ DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
886
+ rb_float_new)
887
+
888
+ /*
889
+ * Document-method: deletion
890
+ *
891
+ * call-seq: deletion -> weight
892
+ *
893
+ * Returns the weight of the deletion operation, that is used to compute
894
+ * the Sellers distance.
895
+ */
896
+ DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
897
+ rb_float_new)
898
+
899
+ /*
900
+ * Document-method: insertion
901
+ *
902
+ * call-seq: insertion -> weight
903
+ *
904
+ * Returns the weight of the insertion operation, that is used to compute
905
+ * the Sellers distance.
906
+ */
907
+ DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
908
+ rb_float_new)
909
+
910
+ /*
911
+ * Document-method: substitution=
912
+ *
913
+ * call-seq: substitution=(weight)
914
+ *
915
+ * Sets the weight of the substitution operation, that is used to compute
916
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
917
+ * should be a Float value >= 0.0.
918
+ */
919
+ DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
920
+ double, CAST2FLOAT, FLOAT2C, >= 0)
921
+
922
+ /*
923
+ * Document-method: deletion=
924
+ *
925
+ * call-seq: deletion=(weight)
926
+ *
927
+ * Sets the weight of the deletion operation, that is used to compute
928
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
929
+ * should be a Float value >= 0.0.
930
+ */
931
+ DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
932
+ double, CAST2FLOAT, FLOAT2C, >= 0)
933
+
934
+ /*
935
+ * Document-method: insertion=
936
+ *
937
+ * call-seq: insertion=(weight)
938
+ *
939
+ * Sets the weight of the insertion operation, that is used to compute
940
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
941
+ * should be a Float value >= 0.0.
942
+ */
943
+ DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
944
+ double, CAST2FLOAT, FLOAT2C, >= 0)
945
+
946
+ /*
947
+ * Resets all weights (substitution, deletion, and insertion) to 1.0.
948
+ */
949
+ static VALUE rb_Sellers_reset_weights(VALUE self)
950
+ {
951
+ GET_STRUCT(Sellers) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
952
+ Sellers_reset_weights(amatch); /* the same helper rb_Sellers_initialize calls for a fresh instance */
953
+ return self; /* the receiver is returned, not the weights */
954
+ }
955
+
956
+ /*
957
+ * call-seq: new(pattern)
958
+ *
959
+ * Creates a new Amatch::Sellers instance from <code>pattern</code>,
960
+ * with all weights initially set to 1.0.
961
+ */
962
+ static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern)
963
+ {
964
+ GET_STRUCT(Sellers) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
965
+ Sellers_pattern_set(amatch, pattern);
966
+ Sellers_reset_weights(amatch); /* a new instance starts from the same weights that #reset_weights restores */
967
+ return self;
968
+ }
969
+
970
+ DEF_CONSTRUCTOR(Sellers, Sellers)
971
+
972
+ /*
973
+ * Document-method: pattern
974
+ *
975
+ * call-seq: pattern -> pattern string
976
+ *
977
+ * Returns the current pattern string of this Amatch::Sellers instance.
978
+ */
979
+
980
+ /*
981
+ * Document-method: pattern=
982
+ *
983
+ * call-seq: pattern=(pattern)
984
+ *
985
+ * Sets the current pattern string of this Amatch::Sellers instance to
986
+ * <code>pattern</code>.
987
+ */
988
+
989
+ /*
990
+ * call-seq: match(strings) -> results
991
+ *
992
+ * Uses this Amatch::Sellers instance to match Sellers#pattern against
993
+ * <code>strings</code>, while taking into account the given weights. It
994
+ * returns the number of weighted character operations, the Sellers distance.
995
+ * <code>strings</code> has to be either a String or an Array of Strings. The
996
+ * returned <code>results</code> are either a Float or an Array of Floats
997
+ * respectively.
998
+ */
999
+ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
1000
+ {
1001
+ GET_STRUCT(Sellers) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1002
+ return Sellers_iterate_strings(amatch, strings, Sellers_match); /* Sellers has its own iterate helper instead of General_iterate_strings; Sellers_match is the per-string callback */
1003
+ }
1004
+
1005
+ /*
1006
+ * call-seq: similar(strings) -> results
1007
+ *
1008
+ * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
1009
+ * against <code>strings</code> (taking into account the given weights), and
1010
+ * compute a Sellers distance metric number between 0.0 for very dissimilar
1011
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1012
+ * String or an Array of Strings. The returned <code>results</code> are either
1013
+ * a Float or an Array of Floats
1014
+ * respectively.
1015
+ */
1016
+ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
1017
+ {
1018
+ GET_STRUCT(Sellers) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1019
+ return Sellers_iterate_strings(amatch, strings, Sellers_similar); /* Sellers_similar is handed over as the per-string callback */
1020
+ }
1021
+
1022
+ /*
1023
+ * call-seq: search(strings) -> results
1024
+ *
1025
+ * searches Sellers#pattern in <code>strings</code> and returns the edit
1026
+ * distance (the sum of weighted character operations) as a Float value, by
1027
+ * greedy trimming prefixes or postfixes of the match. <code>strings</code> has
1028
+ * to be either a String or an Array of Strings. The returned
1029
+ * <code>results</code> are either a Float or an Array of Floats respectively.
1030
+ */
1031
+ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
1032
+ {
1033
+ GET_STRUCT(Sellers) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1034
+ return Sellers_iterate_strings(amatch, strings, Sellers_search); /* Sellers_search is handed over as the per-string callback */
1035
+ }
1036
+
1037
+ /*
1038
+ * Document-class: Amatch::PairDistance
1039
+ *
1040
+ * The pair distance between two strings is based on the number of adjacent
1041
+ * character pairs, that are contained in both strings. The similarity
1042
+ * metric of two strings s1 and s2 is
1043
+ * 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
1044
+ * If it is 1.0 the two strings are an exact match, if less than 1.0 they
1045
+ * are more dissimilar. The advantage of considering adjacent characters, is to
1046
+ * take account not only of the characters, but also of the character ordering
1047
+ * in the original strings.
1048
+ *
1049
+ * This metric is very capable to find similarities in natural languages.
1050
+ * It is explained in more detail in Simon White's article "How to Strike a
1051
+ * Match", located at this url:
1052
+ * http://www.catalysoft.com/articles/StrikeAMatch.html
1053
+ * It is also very similar (a special case) to the method described under
1054
+ * http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
1055
+ * for Approximate String Processing."
1056
+ */
1057
+ DEF_RB_FREE(PairDistance, PairDistance)
1058
+
1059
+ /*
1060
+ * call-seq: new(pattern)
1061
+ *
1062
+ * Creates a new Amatch::PairDistance instance from <code>pattern</code>.
1063
+ */
1064
+ static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
1065
+ {
1066
+ GET_STRUCT(PairDistance) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1067
+ PairDistance_pattern_set(amatch, pattern); /* only the pattern is set here; pattern_pair_array is managed by rb_PairDistance_match */
1068
+ return self;
1069
+ }
1070
+
1071
+ DEF_CONSTRUCTOR(PairDistance, PairDistance)
1072
+
1073
+ /*
1074
+ * call-seq: match(strings, regexp = /\s+/) -> results
1075
+ *
1076
+ * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
1077
+ * <code>strings</code>. It returns the pair distance measure, that is a
1078
+ * returned value of 1.0 is an exact match, partial matches are lower
1079
+ * values, while 0.0 means no match at all.
1080
+ *
1081
+ * <code>strings</code> has to be either a String or an
1082
+ * Array of Strings. The argument <code>regexp</code> is used to split the
1083
+ * pattern and strings into tokens first. It defaults to /\s+/. If the
1084
+ * splitting should be omitted, call the method with nil as <code>regexp</code>
1085
+ * explicitly. A TypeError is raised if an Array element is not a String.
1086
+ *
1087
+ * The returned <code>results</code> are either a Float or an
1088
+ * Array of Floats respectively.
1089
+ */
1090
+ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
1091
+ {
1092
+ VALUE result, strings, regexp = Qnil;
1093
+ int use_regexp;
1094
+ GET_STRUCT(PairDistance) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1095
+
1096
+ rb_scan_args(argc, argv, "11", &strings, &regexp); /* one required argument (strings), one optional (regexp) */
1097
+ use_regexp = NIL_P(regexp) && argc != 2; /* set only when regexp was omitted; an explicitly passed nil leaves it 0 */
1098
+ if (TYPE(strings) == T_STRING) {
1099
+ result = PairDistance_match(amatch, strings, regexp, use_regexp);
1100
+ } else {
1101
+ long i; /* declared ahead of the first statement (C89) and as long, to match RARRAY_LEN */
1102
+ Check_Type(strings, T_ARRAY);
1103
+ result = rb_ary_new2(RARRAY_LEN(strings));
1104
+ for (i = 0; i < RARRAY_LEN(strings); i++) {
1105
+ VALUE string = rb_ary_entry(strings, i);
1106
+ if (TYPE(string) != T_STRING) {
1107
+ rb_raise(rb_eTypeError,
1108
+ "array has to contain only strings (%s given)",
1109
+ NIL_P(string) ?
1110
+ "NilClass" :
1111
+ rb_class2name(CLASS_OF(string)));
1112
+ }
1113
+ rb_ary_push(result,
1114
+ PairDistance_match(amatch, string, regexp, use_regexp));
1115
+ }
1116
+ }
1117
+ pair_array_destroy(amatch->pattern_pair_array); /* NOTE(review): presumably a per-call cache of the pattern's pairs; this cleanup is skipped when rb_raise above fires -- confirm that is harmless */
1118
+ amatch->pattern_pair_array = NULL; /* never leave a pointer to the destroyed array behind */
1119
+ return result;
1120
+ }
1121
+
1122
+ /*
1123
+ * call-seq: pair_distance_similar(strings) -> results
1124
+ *
1125
+ * If called on a String, this string is used as a Amatch::PairDistance#pattern
1126
+ * to match against <code>strings</code> using /\s+/ as the tokenizing regular
1127
+ * expression. It returns a pair distance metric number between 0.0 for very
1128
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1129
+ * either a String or an Array of Strings. The returned <code>results</code>
1130
+ * are either a Float or an Array of Floats respectively.
1131
+ */
1132
+ static VALUE rb_str_pair_distance_similar(VALUE self, VALUE strings)
1133
+ {
1134
+ VALUE amatch = rb_PairDistance_new(rb_cPairDistance, self); /* the receiving String becomes the pattern of a matcher used only for this call */
1135
+ return rb_PairDistance_match(1, &strings, amatch); /* argc is 1, so the regexp argument counts as omitted and the default tokenizing applies */
1136
+ }
1137
+
1138
+ /*
1139
+ * Document-class: Amatch::Hamming
1140
+ *
1141
+ * This class computes the Hamming distance between two strings.
1142
+ *
1143
+ * The Hamming distance between two strings is the number of characters, that
1144
+ * are different. Thus a hamming distance of 0 means an exact
1145
+ * match, a hamming distance of 1 means one character is different, and so on.
1146
+ * If one string is longer than the other string, the missing characters are
1147
+ * counted as different characters.
1148
+ */
1149
+
1150
+ DEF_RB_FREE(Hamming, General)
1151
+
1152
+ /*
1153
+ * call-seq: new(pattern)
1154
+ *
1155
+ * Creates a new Amatch::Hamming instance from <code>pattern</code>.
1156
+ */
1157
+ static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
1158
+ {
1159
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1160
+ General_pattern_set(amatch, pattern); /* Hamming keeps its state in the General struct, see DEF_CONSTRUCTOR(Hamming, General) */
1161
+ return self;
1162
+ }
1163
+
1164
+ DEF_CONSTRUCTOR(Hamming, General)
1165
+
1166
+ /*
1167
+ * call-seq: match(strings) -> results
1168
+ *
1169
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1170
+ * <code>strings</code>, that is compute the hamming distance between
1171
+ * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
1172
+ * be either a String or an Array of Strings. The returned <code>results</code>
1173
+ * are either a Fixnum or an Array of Fixnums respectively.
1174
+ */
1175
+ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1176
+ {
1177
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1178
+ return General_iterate_strings(amatch, strings, Hamming_match); /* Hamming_match is handed over as the per-string callback */
1179
+ }
1180
+
1181
+ /*
1182
+ * call-seq: similar(strings) -> results
1183
+ *
1184
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1185
+ * <code>strings</code>, and compute a Hamming distance metric number between
1186
+ * 0.0 for very dissimilar strings and 1.0 for an exact match.
1187
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1188
+ * returned <code>results</code> are either a Float or an Array of Floats
1189
+ * respectively.
1190
+ */
1191
+ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1192
+ {
1193
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1194
+ return General_iterate_strings(amatch, strings, Hamming_similar); /* Hamming_similar is handed over as the per-string callback */
1195
+ }
1196
+
1197
+ /*
1198
+ * call-seq: hamming_similar(strings) -> results
1199
+ *
1200
+ * If called on a String, this string is used as a Amatch::Hamming#pattern to
1201
+ * match against <code>strings</code>. It returns a Hamming distance metric
1202
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1203
+ * <code>strings</code>
1204
+ * has to be either a String or an Array of Strings. The returned
1205
+ * <code>results</code> are either a Float or an Array of Floats respectively.
1206
+ */
1207
+ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
1208
+ {
1209
+ VALUE amatch = rb_Hamming_new(rb_cHamming, self); /* the receiving String becomes the pattern of a matcher used only for this call */
1210
+ return rb_Hamming_similar(amatch, strings); /* same code path as Amatch::Hamming#similar */
1211
+ }
1212
+
1213
+
1214
+ /*
1215
+ * Document-class: Amatch::LongestSubsequence
1216
+ *
1217
+ * This class computes the length of the longest subsequence common to two
1218
+ * strings. A subsequence doesn't have to be contiguous. The longer the common
1219
+ * subsequence is, the more similar the two strings will be.
1220
+ *
1221
+ * The longest common subsequence between "test" and "test" is of length 4,
1222
+ * because "test" itself is this subsequence. The longest common subsequence
1223
+ * between "test" and "east" is "e", "s", "t" and the length of the
1224
+ * sequence is 3.
1225
+ */
1226
+ DEF_RB_FREE(LongestSubsequence, General)
1227
+
1228
+ /*
1229
+ * call-seq: new(pattern)
1230
+ *
1231
+ * Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
1232
+ */
1233
+ static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
1234
+ {
1235
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1236
+ General_pattern_set(amatch, pattern); /* LongestSubsequence keeps its state in the General struct, see DEF_CONSTRUCTOR(LongestSubsequence, General) */
1237
+ return self;
1238
+ }
1239
+
1240
+ DEF_CONSTRUCTOR(LongestSubsequence, General)
1241
+
1242
+ /*
1243
+ * call-seq: match(strings) -> results
1244
+ *
1245
+ * Uses this Amatch::LongestSubsequence instance to match
1246
+ * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1247
+ * length of the longest common subsequence. <code>strings</code> has to be
1248
+ * either a String or an Array of Strings. The returned <code>results</code>
1249
+ * are either a Fixnum or an Array of Fixnums respectively.
1250
+ */
1251
+ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1252
+ {
1253
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1254
+ return General_iterate_strings(amatch, strings, LongestSubsequence_match); /* LongestSubsequence_match is handed over as the per-string callback */
1255
+ }
1256
+
1257
+ /*
1258
+ * call-seq: similar(strings) -> results
1259
+ *
1260
+ * Uses this Amatch::LongestSubsequence instance to match
1261
+ * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1262
+ * a longest subsequence distance metric number between 0.0 for very dissimilar
1263
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1264
+ * String or an Array of Strings. The returned <code>results</code> are either
1265
+ * a Float or an Array of Floats respectively.
1266
+ */
1267
+ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1268
+ {
1269
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1270
+ return General_iterate_strings(amatch, strings, LongestSubsequence_similar); /* LongestSubsequence_similar is handed over as the per-string callback */
1271
+ }
1272
+
1273
+ /*
1274
+ * call-seq: longest_subsequence_similar(strings) -> results
1275
+ *
1276
+ * If called on a String, this string is used as a
1277
+ * Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
1278
+ * returns a longest subsequence distance metric number between 0.0 for very
1279
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1280
+ * either a String or an Array of Strings. The returned <code>results</code>
1281
+ * are either a Float or an Array of Floats respectively.
1282
+ */
1283
+ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1284
+ {
1285
+ VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self); /* the receiving String becomes the pattern of a matcher used only for this call */
1286
+ return rb_LongestSubsequence_similar(amatch, strings); /* same code path as Amatch::LongestSubsequence#similar */
1287
+ }
1288
+
1289
+ /*
1290
+ * Document-class: Amatch::LongestSubstring
1291
+ *
1292
+ * The longest common substring is the longest substring, that is part of
1293
+ * two strings. A substring is contiguous, while a subsequence need not
1294
+ * be. The longer the common substring is, the more similar the two strings
1295
+ * will be.
1296
+ *
1297
+ * The longest common substring between 'string' and 'string' is 'string'
1298
+ * again, thus the longest common substring length is 6. The longest common
1299
+ * substring between 'string' and 'storing' is 'ring', thus the longest common
1300
+ * substring length is 4.
1301
+ */
1302
+
1303
+ DEF_RB_FREE(LongestSubstring, General)
1304
+
1305
+ /*
1306
+ * call-seq: new(pattern)
1307
+ *
1308
+ * Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
1309
+ */
1310
+ static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
1311
+ {
1312
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1313
+ General_pattern_set(amatch, pattern); /* LongestSubstring keeps its state in the General struct, see DEF_CONSTRUCTOR(LongestSubstring, General) */
1314
+ return self;
1315
+ }
1316
+
1317
+ DEF_CONSTRUCTOR(LongestSubstring, General)
1318
+
1319
+ /*
1320
+ * call-seq: match(strings) -> results
1321
+ *
1322
+ * Uses this Amatch::LongestSubstring instance to match
1323
+ * LongestSubstring#pattern against <code>strings</code>, that is compute the
1324
+ * length of the longest common substring. <code>strings</code> has to be
1325
+ * either a String or an Array of Strings. The returned <code>results</code>
1326
+ * are either a Fixnum or an Array of Fixnums respectively.
1327
+ */
1328
+ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1329
+ {
1330
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1331
+ return General_iterate_strings(amatch, strings, LongestSubstring_match); /* LongestSubstring_match is handed over as the per-string callback */
1332
+ }
1333
+
1334
+ /*
1335
+ * call-seq: similar(strings) -> results
1336
+ *
1337
+ * Uses this Amatch::LongestSubstring instance to match
1338
+ * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1339
+ * longest substring distance metric number between 0.0 for very dissimilar
1340
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1341
+ * String or an Array of Strings. The returned <code>results</code> are either
1342
+ * a Float or an Array of Floats
1343
+ * respectively.
1344
+ */
1345
+ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
1346
+ {
1347
+ GET_STRUCT(General) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1348
+ return General_iterate_strings(amatch, strings, LongestSubstring_similar); /* LongestSubstring_similar is handed over as the per-string callback */
1349
+ }
1350
+
1351
+ /*
1352
+ * call-seq: longest_substring_similar(strings) -> results
1353
+ *
1354
+ * If called on a String, this string is used as a
1355
+ * Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
1356
+ * returns a longest substring distance metric number between 0.0 for very
1357
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1358
+ * either a String or an Array of Strings. The returned <code>results</code>
1359
+ * are either a Float or an Array of Floats respectively.
1360
+ */
1361
+ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1362
+ {
1363
+ VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self); /* the receiving String becomes the pattern of a matcher used only for this call */
1364
+ return rb_LongestSubstring_similar(amatch, strings); /* same code path as Amatch::LongestSubstring#similar */
1365
+ }
1366
+
1367
+ /*
1368
+ * Document-class: Amatch::Jaro
1369
+ *
1370
+ * This class computes the Jaro metric for two strings.
1371
+ * The Jaro metric computes the similarity between 0 (no match)
1372
+ * and 1 (exact match) by looking for matching and transposed characters.
1373
+ */
1374
+ DEF_RB_FREE(Jaro, Jaro)
1375
+
1376
+ /*
1377
+ * Document-method: ignore_case
1378
+ *
1379
+ * call-seq: ignore_case -> true/false
1380
+ *
1381
+ * Returns whether case is ignored when computing matching characters.
1382
+ */
1383
+ DEF_RB_READER(Jaro, rb_Jaro_ignore_case, ignore_case, C2BOOL)
1384
+
1385
+ /*
1386
+ * Document-method: ignore_case=
1387
+ *
1388
+ * call-seq: ignore_case=(true/false)
1389
+ *
1390
+ * Sets whether case is ignored when computing matching characters.
1391
+ */
1392
+ DEF_RB_WRITER(Jaro, rb_Jaro_ignore_case_set, ignore_case,
1393
+ int, CAST2BOOL, BOOL2C, != Qundef)
1394
+
1395
+ /*
1396
+ * call-seq: new(pattern)
1397
+ *
1398
+ * Creates a new Amatch::Jaro instance from <code>pattern</code>.
1399
+ */
1400
+ static VALUE rb_Jaro_initialize(VALUE self, VALUE pattern)
1401
+ {
1402
+ GET_STRUCT(Jaro) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1403
+ Jaro_pattern_set(amatch, pattern);
1404
+ amatch->ignore_case = 1; /* a new instance ignores case until ignore_case= changes it */
1405
+ return self;
1406
+ }
1407
+
1408
+ DEF_CONSTRUCTOR(Jaro, Jaro)
1409
+
1410
+ /*
1411
+ * call-seq: match(strings) -> results
1412
+ *
1413
+ * Uses this Amatch::Jaro instance to match
1414
+ * Jaro#pattern against <code>strings</code>, that is compute the
1415
+ * jaro metric with the strings. <code>strings</code> has to be
1416
+ * either a String or an Array of Strings. The returned <code>results</code>
1417
+ * are either a Float or an Array of Floats respectively.
1418
+ */
1419
+ static VALUE rb_Jaro_match(VALUE self, VALUE strings)
1420
+ {
1421
+ GET_STRUCT(Jaro) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1422
+ return Jaro_iterate_strings(amatch, strings, Jaro_match); /* Jaro has its own iterate helper; Jaro_match is the per-string callback */
1423
+ }
1424
+
1425
+ /*
1426
+ * call-seq: jaro_similar(strings) -> results
1427
+ *
1428
+ * If called on a String, this string is used as a
1429
+ * Amatch::Jaro#pattern to match against <code>strings</code>. It
1430
+ * returns a Jaro metric number between 0.0 for very
1431
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1432
+ * either a String or an Array of Strings. The returned <code>results</code>
1433
+ * are either a Float or an Array of Floats respectively.
1434
+ */
1435
+ static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
1436
+ {
1437
+ VALUE amatch = rb_Jaro_new(rb_cJaro, self); /* the receiving String becomes the pattern of a matcher used only for this call */
1438
+ return rb_Jaro_match(amatch, strings); /* Jaro has no separate similar function; Init_amatch aliases "similar" to "match" */
1439
+ }
1440
+
1441
+ /*
1442
+ * Document-class: Amatch::JaroWinkler
1443
+ *
1444
+ * This class computes the Jaro-Winkler metric for two strings.
1445
+ * The Jaro-Winkler metric computes the similarity between 0 (no match)
1446
+ * and 1 (exact match) by looking for matching and transposed characters.
1447
+ *
1448
+ * It is a variant of the Jaro metric, with additional weighting towards
1449
+ * common prefixes.
1450
+ */
1451
+ DEF_RB_FREE(JaroWinkler, JaroWinkler)
1452
+
1453
+ /*
1454
+ * Document-method: ignore_case
1455
+ *
1456
+ * call-seq: ignore_case -> true/false
1457
+ *
1458
+ * Returns whether case is ignored when computing matching characters.
1459
+ * Default is true.
1460
+ */
1461
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_ignore_case, ignore_case, C2BOOL)
1462
+
1463
+ /*
1464
+ * Document-method: scaling_factor
1465
+ *
1466
+ * call-seq: scaling_factor -> weight
1467
+ *
1468
+ * The scaling factor is how much weight to give common prefixes.
1469
+ * Default is 0.1.
1470
+ */
1471
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_scaling_factor, scaling_factor, rb_float_new)
1472
+
1473
+ /*
1474
+ * Document-method: ignore_case=
1475
+ *
1476
+ * call-seq: ignore_case=(true/false)
1477
+ *
1478
+ * Sets whether case is ignored when computing matching characters.
1479
+ */
1480
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_ignore_case_set, ignore_case,
1481
+ int, CAST2BOOL, BOOL2C, != Qundef)
1482
+
1483
+ /*
1484
+ * Document-method: scaling_factor=
1485
+ *
1486
+ * call-seq: scaling_factor=(weight)
1487
+ *
1488
+ * Sets the weight to give common prefixes.
1489
+ */
1490
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_scaling_factor_set, scaling_factor,
1491
+ double, CAST2FLOAT, FLOAT2C, >= 0)
1492
+
1493
+ /*
1494
+ * call-seq: new(pattern)
1495
+ *
1496
+ * Creates a new Amatch::JaroWinkler instance from <code>pattern</code>.
1497
+ */
1498
+ static VALUE rb_JaroWinkler_initialize(VALUE self, VALUE pattern)
1499
+ {
1500
+ GET_STRUCT(JaroWinkler) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1501
+ JaroWinkler_pattern_set(amatch, pattern);
1502
+ amatch->ignore_case = 1; /* the default documented on the ignore_case reader (true) */
1503
+ amatch->scaling_factor = 0.1; /* the default documented on the scaling_factor reader */
1504
+ return self;
1505
+ }
1506
+
1507
+ DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
1508
+
1509
+ /*
1510
+ * call-seq: match(strings) -> results
1511
+ *
1512
+ * Uses this Amatch::JaroWinkler instance to match
1513
+ * JaroWinkler#pattern against <code>strings</code>, that is compute the
1514
+ * Jaro-Winkler metric with the strings. <code>strings</code> has to be
1515
+ * either a String or an Array of Strings. The returned <code>results</code>
1516
+ * are either a Float or an Array of Floats respectively.
1517
+ */
1518
+ static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
1519
+ {
1520
+ GET_STRUCT(JaroWinkler) /* macro from outside this section; it has to supply `amatch`, which is not declared here */
1521
+ return JaroWinkler_iterate_strings(amatch, strings, JaroWinkler_match); /* JaroWinkler_match is handed over as the per-string callback */
1522
+ }
1523
+
1524
+ /*
1525
+ * call-seq: jarowinkler_similar(strings) -> results
1526
+ *
1527
+ * If called on a String, this string is used as an
1528
+ * Amatch::JaroWinkler#pattern to match against <code>strings</code>. It
1529
+ * returns a Jaro-Winkler metric number between 0.0 for very
1530
+ * dissimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1531
+ * either a String or an Array of Strings. The returned <code>results</code>
1532
+ * are either a Float or an Array of Floats respectively.
1533
+ */
1534
+ static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
1535
+ {
1536
+ VALUE amatch = rb_JaroWinkler_new(rb_cJaroWinkler, self); /* instantiate under the JaroWinkler class, as every other String helper does with its own class (was rb_cJaro) */
1537
+ return rb_JaroWinkler_match(amatch, strings); /* JaroWinkler has no separate similar function; Init_amatch aliases "similar" to "match" */
1538
+ }
1539
+
1540
+ void Init_amatch() /* Init_ entry point of the extension: registers the Amatch module, its matcher classes and the String helpers */
1541
+ {
1542
+ rb_require("amatch/version"); /* loaded before any class is defined */
1543
+ rb_mAmatch = rb_define_module("Amatch");
1544
+
1545
+ /* Amatch::Levenshtein, plus String#levenshtein_similar */
1546
+ rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
1547
+ rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
1548
+ rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
1549
+ rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0);
1550
+ rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
1551
+ rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
1552
+ rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
1553
+ rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1554
+ rb_define_method(rb_cString, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1555
+
1556
+ /* Amatch::Sellers with its weight accessors; no String helper is registered for it */
1557
+ rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1558
+ rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
1559
+ rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
1560
+ rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0);
1561
+ rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
1562
+ rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
1563
+ rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
1564
+ rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
1565
+ rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
1566
+ rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
1567
+ rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
1568
+ rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
1569
+ rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
1570
+ rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
1571
+ rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1);
1572
+
1573
+ /* Amatch::Hamming, plus String#hamming_similar */
1574
+ rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
1575
+ rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
1576
+ rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
1577
+ rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
1578
+ rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
1579
+ rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
1580
+ rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
1581
+ rb_define_method(rb_cString, "hamming_similar", rb_str_hamming_similar, 1);
1582
+
1583
+ /* Amatch::PairDistance, plus String#pair_distance_similar */
1584
+ rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
1585
+ rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
1586
+ rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
1587
+ rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
1588
+ rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
1589
+ rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1); /* -1: the C function takes (argc, argv, self) so the regexp argument can be optional */
1590
+ rb_define_alias(rb_cPairDistance, "similar", "match"); /* there is no separate similar function for this class */
1591
+ rb_define_method(rb_cString, "pair_distance_similar", rb_str_pair_distance_similar, 1);
1592
+
1593
+ /* Amatch::LongestSubsequence, plus String#longest_subsequence_similar */
1594
+ rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
1595
+ rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
1596
+ rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
1597
+ rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
1598
+ rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
1599
+ rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
1600
+ rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
1601
+ rb_define_method(rb_cString, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1602
+
1603
+ /* Amatch::LongestSubstring, plus String#longest_substring_similar */
1604
+ rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
1605
+ rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
1606
+ rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
1607
+ rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
1608
+ rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
1609
+ rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
1610
+ rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1611
+ rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1612
+
1613
+ /* Amatch::Jaro, plus String#jaro_similar */
1614
+ rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
1615
+ rb_define_alloc_func(rb_cJaro, rb_Jaro_s_allocate);
1616
+ rb_define_method(rb_cJaro, "initialize", rb_Jaro_initialize, 1);
1617
+ rb_define_method(rb_cJaro, "pattern", rb_Jaro_pattern, 0);
1618
+ rb_define_method(rb_cJaro, "pattern=", rb_Jaro_pattern_set, 1);
1619
+ rb_define_method(rb_cJaro, "ignore_case", rb_Jaro_ignore_case, 0);
1620
+ rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
1621
+ rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
1622
+ rb_define_alias(rb_cJaro, "similar", "match"); /* there is no separate similar function for this class */
1623
+ rb_define_method(rb_cString, "jaro_similar", rb_str_jaro_similar, 1);
1624
+
1625
+ /* Amatch::JaroWinkler, plus String#jarowinkler_similar */
1626
+ rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
1627
+ rb_define_alloc_func(rb_cJaroWinkler, rb_JaroWinkler_s_allocate);
1628
+ rb_define_method(rb_cJaroWinkler, "initialize", rb_JaroWinkler_initialize, 1);
1629
+ rb_define_method(rb_cJaroWinkler, "pattern", rb_JaroWinkler_pattern, 0);
1630
+ rb_define_method(rb_cJaroWinkler, "pattern=", rb_JaroWinkler_pattern_set, 1);
1631
+ rb_define_method(rb_cJaroWinkler, "ignore_case", rb_JaroWinkler_ignore_case, 0);
1632
+ rb_define_method(rb_cJaroWinkler, "ignore_case=", rb_JaroWinkler_ignore_case_set, 1);
1633
+ rb_define_method(rb_cJaroWinkler, "scaling_factor", rb_JaroWinkler_scaling_factor, 0);
1634
+ rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
1635
+ rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
1636
+ rb_define_alias(rb_cJaroWinkler, "similar", "match"); /* there is no separate similar function for this class */
1637
+ rb_define_method(rb_cString, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
1638
+
1639
+ id_split = rb_intern("split"); /* method IDs interned once at load time; their users are outside this section */
1640
+ id_to_f = rb_intern("to_f");
1641
+ }