mumboe-amatch 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.3
@@ -0,0 +1,83 @@
1
+ #! /usr/bin/env ruby
2
+ # vim: set et sw=2 ts=2:
3
+ #
4
+ ## $Id: agrep.rb,v 1.5 2006/09/26 15:59:48 flori Exp $
5
+ #
6
+
7
+ require 'amatch'
8
+ require 'getoptlong'
9
+
10
# Prints +msg+, a usage banner and one line per entry of +options+, then
# terminates the process with exit status 0.
#
# +options+ is a list of GetoptLong-style triples:
# [long_name, short_name, argument_flag]. Entries whose flag equals
# GetoptLong::REQUIRED_ARGUMENT are shown with an ARGUMENT placeholder.
def usage(msg, options)
  banner = "Usage: #{File.basename($0)} [OPTIONS] PATTERN [FILE ...]"
  puts msg, banner, ""
  options.each do |long_name, short_name, arg_flag|
    placeholder = arg_flag == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : ''
    puts " #{short_name}, #{long_name} #{placeholder}"
  end
  puts "\nReport bugs to <flori@ping.de>."
  exit 0
end
19
+
20
class Amatch::Levenshtein
  # Runs #search on +strings+ and divides each resulting distance by the
  # pattern length, yielding a distance relative to the size of the pattern.
  #
  # +strings+ may be whatever #search accepts. When #search returns an
  # Array (one distance per string), an Array of Floats is returned;
  # otherwise a single Float.
  #
  # The division is a Float division, so an empty pattern produces NaN or
  # Infinity rather than raising ZeroDivisionError.
  def search_relative(strings)
    length = pattern.size
    distances = search(strings)
    if distances.is_a?(Array)
      distances.map { |distance| distance.to_f / length }
    else
      distances.to_f / length
    end
  end
end
25
+
26
# Script defaults; the command line options below may override them.
$distance = 1       # maximum distance for a line to be reported
$mode = :search     # matcher method to call (:search or :search_relative)
$verbose = nil      # truthy => print timing statistics to STDERR at the end

options = [
  [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
  [ '--verbose',  '-v', GetoptLong::NO_ARGUMENT ],
  [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
]

begin
  parser = GetoptLong.new
  parser.set_options(*options)
  parser.each_option do |name, arg|
    case name.sub(/^--/, '')
    when 'distance'
      $distance = arg.to_f
    when 'relative'
      $mode = :search_relative
    when 'verbose'
      $verbose = 1
    when 'help'
      usage('You\'ve asked for it!', options)
    end
  end
rescue
  # Any error raised while parsing the options ends the run with status 1.
  exit 1
end

# The first remaining argument is the pattern; everything after it is
# treated as a file name.
pattern = ARGV.shift or usage('Pattern needed!', options)

matcher = Amatch::Levenshtein.new(pattern)
size = 0
start = Time.new
if ARGV.size > 0
  ARGV.each do |filename|
    begin
      # The stat call lives inside the begin/rescue so that a missing or
      # unreadable file is reported and skipped instead of aborting the
      # whole run with an uncaught exception.
      next unless File.stat(filename).file?
      size += File.size(filename)
      # File.foreach closes the file once iteration finishes.
      File.foreach(filename) do |line|
        if matcher.__send__($mode, line) <= $distance
          puts "#{filename}:#{line}"
        end
      end
    rescue
      STDERR.puts "Failure at #{filename}: #{$!} => Skipping!"
    end
  end
else
  # No files given: filter standard input.
  STDIN.each_line do |line|
    size += line.size
    if matcher.__send__($mode, line) <= $distance
      puts line
    end
  end
end
time = Time.new - start
$verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
  time, size / time / 1024
exit 0
@@ -0,0 +1,2 @@
1
+ amatch.c
2
+ extconf.rb
@@ -0,0 +1,149 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ #### Start of system configuration section. ####
5
+
6
+ srcdir = .
7
+ topdir = /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0
8
+ hdrdir = $(topdir)
9
+ VPATH = $(srcdir):$(topdir):$(hdrdir)
10
+ prefix = $(DESTDIR)/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr
11
+ exec_prefix = $(prefix)
12
+ sitedir = $(DESTDIR)/Library/Ruby/Site
13
+ rubylibdir = $(libdir)/ruby/$(ruby_version)
14
+ docdir = $(datarootdir)/doc/$(PACKAGE)
15
+ dvidir = $(docdir)
16
+ datarootdir = $(prefix)/share
17
+ archdir = $(rubylibdir)/$(arch)
18
+ sbindir = $(exec_prefix)/sbin
19
+ psdir = $(docdir)
20
+ localedir = $(datarootdir)/locale
21
+ htmldir = $(docdir)
22
+ datadir = $(datarootdir)
23
+ includedir = $(prefix)/include
24
+ infodir = $(DESTDIR)/usr/share/info
25
+ sysconfdir = $(prefix)/etc
26
+ mandir = $(DESTDIR)/usr/share/man
27
+ libdir = $(exec_prefix)/lib
28
+ sharedstatedir = $(prefix)/com
29
+ oldincludedir = $(DESTDIR)/usr/include
30
+ pdfdir = $(docdir)
31
+ sitearchdir = $(sitelibdir)/$(sitearch)
32
+ bindir = $(exec_prefix)/bin
33
+ localstatedir = $(prefix)/var
34
+ sitelibdir = $(sitedir)/$(ruby_version)
35
+ libexecdir = $(exec_prefix)/libexec
36
+
37
+ CC = gcc -Wall
38
+ LIBRUBY = $(LIBRUBY_SO)
39
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
40
+ LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
41
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)
42
+
43
+ RUBY_EXTCONF_H =
44
+ CFLAGS = -fno-common -arch ppc -arch i386 -Os -pipe -fno-common
45
+ INCFLAGS = -I. -I$(topdir) -I$(hdrdir) -I$(srcdir)
46
+ CPPFLAGS =
47
+ CXXFLAGS = $(CFLAGS)
48
+ DLDFLAGS = -L. -arch ppc -arch i386
49
+ LDSHARED = cc -arch ppc -arch i386 -pipe -bundle -undefined dynamic_lookup
50
+ AR = ar
51
+ EXEEXT =
52
+
53
+ RUBY_INSTALL_NAME = ruby
54
+ RUBY_SO_NAME = ruby
55
+ arch = universal-darwin9.0
56
+ sitearch = universal-darwin9.0
57
+ ruby_version = 1.8
58
+ ruby = /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/bin/ruby
59
+ RUBY = $(ruby)
60
+ RM = rm -f
61
+ MAKEDIRS = mkdir -p
62
+ INSTALL = /usr/bin/install -c
63
+ INSTALL_PROG = $(INSTALL) -m 0755
64
+ INSTALL_DATA = $(INSTALL) -m 644
65
+ COPY = cp
66
+
67
+ #### End of system configuration section. ####
68
+
69
+ preload =
70
+
71
+ libpath = . $(libdir)
72
+ LIBPATH = -L"." -L"$(libdir)"
73
+ DEFFILE =
74
+
75
+ CLEANFILES = mkmf.log
76
+ DISTCLEANFILES =
77
+
78
+ extout =
79
+ extout_prefix =
80
+ target_prefix =
81
+ LOCAL_LIBS =
82
+ LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lm
83
+ SRCS = amatch.c pair.c
84
+ OBJS = amatch.o pair.o
85
+ TARGET = amatch
86
+ DLLIB = $(TARGET).bundle
87
+ EXTSTATIC =
88
+ STATIC_LIB =
89
+
90
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
91
+ RUBYLIBDIR = /Library/Ruby/Gems/1.8/gems/amatch-0.2.3/ext$(target_prefix)
92
+ RUBYARCHDIR = /Library/Ruby/Gems/1.8/gems/amatch-0.2.3/ext$(target_prefix)
93
+
94
+ TARGET_SO = $(DLLIB)
95
+ CLEANLIBS = $(TARGET).bundle $(TARGET).il? $(TARGET).tds $(TARGET).map
96
+ CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak
97
+
98
+ all: $(DLLIB)
99
+ static: $(STATIC_LIB)
100
+
101
+ clean:
102
+ @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
103
+
104
+ distclean: clean
105
+ @-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
106
+ @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
107
+
108
+ realclean: distclean
109
+ install: install-so install-rb
110
+
111
+ install-so: $(RUBYARCHDIR)
112
+ install-so: $(RUBYARCHDIR)/$(DLLIB)
113
+ $(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
114
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
115
+ install-rb: pre-install-rb install-rb-default
116
+ install-rb-default: pre-install-rb-default
117
+ pre-install-rb: Makefile
118
+ pre-install-rb-default: Makefile
119
+ $(RUBYARCHDIR):
120
+ $(MAKEDIRS) $@
121
+
122
+ site-install: site-install-so site-install-rb
123
+ site-install-so: install-so
124
+ site-install-rb: install-rb
125
+
126
+ .SUFFIXES: .c .m .cc .cxx .cpp .C .o
127
+
128
+ .cc.o:
129
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
130
+
131
+ .cxx.o:
132
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
133
+
134
+ .cpp.o:
135
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
136
+
137
+ .C.o:
138
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
139
+
140
+ .c.o:
141
+ $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) -c $<
142
+
143
+ $(DLLIB): $(OBJS)
144
+ @-$(RM) $@
145
+ $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
146
+
147
+
148
+
149
+ $(OBJS): ruby.h defines.h
Binary file
@@ -0,0 +1,1388 @@
1
+ #include "ruby.h"
2
+ #include "pair.h"
3
+
4
+ /*
5
+ * Document-method: pattern
6
+ *
7
+ * call-seq: pattern -> pattern string
8
+ *
9
+ * Returns the current pattern string of this instance.
10
+ */
11
+
12
+ /*
13
+ * Document-method: pattern=
14
+ *
15
+ * call-seq: pattern=(pattern)
16
+ *
17
+ * Sets the current pattern string of this instance to <code>pattern</code>.
18
+ */
19
+
20
+
21
+ static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
22
+ rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring;
23
+
24
+ static ID id_split, id_to_f;
25
+
26
+ #define GET_STRUCT(klass) \
27
+ klass *amatch; \
28
+ Data_Get_Struct(self, klass, amatch);
29
+
30
+ #define DEF_ALLOCATOR(type) \
31
+ static type *type##_allocate() \
32
+ { \
33
+ type *obj = ALLOC(type); \
34
+ MEMZERO(obj, type, 1); \
35
+ return obj; \
36
+ }
37
+
38
+ #define DEF_CONSTRUCTOR(klass, type) \
39
+ static VALUE rb_##klass##_s_allocate(VALUE klass2) \
40
+ { \
41
+ type *amatch = type##_allocate(); \
42
+ return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch); \
43
+ } \
44
+ VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
45
+ { \
46
+ VALUE obj = rb_##klass##_s_allocate(klass2); \
47
+ rb_##klass##_initialize(obj, pattern); \
48
+ return obj; \
49
+ }
50
+
51
+ #define DEF_RB_FREE(klass, type) \
52
+ static void rb_##klass##_free(type *amatch) \
53
+ { \
54
+ MEMZERO(amatch->pattern, char, amatch->pattern_len); \
55
+ free(amatch->pattern); \
56
+ MEMZERO(amatch, type, 1); \
57
+ free(amatch); \
58
+ }
59
+
60
/*
 * DEF_PATTERN_ACCESSOR(type) generates three functions for a struct `type`
 * that has `pattern` and `pattern_len` members:
 *
 *   type_pattern_set(amatch, pattern)   - copies the bytes of the Ruby String
 *                                         `pattern` into the struct; raises
 *                                         TypeError (via Check_Type) if
 *                                         `pattern` is not a String.
 *   rb_type_pattern(self)               - returns the stored pattern as a new
 *                                         Ruby String.
 *   rb_type_pattern_set(self, pattern)  - Ruby-level setter; returns nil.
 *
 * The new buffer is allocated and filled BEFORE the old one is released, so
 * that if ALLOC_N raises the struct still points at its old, valid buffer
 * and is not freed a second time by the free function.
 */
#define DEF_PATTERN_ACCESSOR(type)                                      \
static void type##_pattern_set(type *amatch, VALUE pattern)             \
{                                                                       \
    char *new_pattern;                                                  \
    int new_len;                                                        \
    Check_Type(pattern, T_STRING);                                      \
    new_len = RSTRING_LEN(pattern);                                     \
    new_pattern = ALLOC_N(char, new_len);                               \
    MEMCPY(new_pattern, RSTRING_PTR(pattern), char, new_len);           \
    free(amatch->pattern);                                              \
    amatch->pattern = new_pattern;                                      \
    amatch->pattern_len = new_len;                                      \
}                                                                       \
static VALUE rb_##type##_pattern(VALUE self)                            \
{                                                                       \
    GET_STRUCT(type)                                                    \
    return rb_str_new(amatch->pattern, amatch->pattern_len);            \
}                                                                       \
static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern)         \
{                                                                       \
    GET_STRUCT(type)                                                    \
    type##_pattern_set(amatch, pattern);                                \
    return Qnil;                                                        \
}
81
+
82
/*
 * DEF_ITERATE_STRINGS(type) generates type_iterate_strings(amatch, strings,
 * match_function):
 *
 *   - if `strings` is a Ruby String, returns match_function(amatch, strings);
 *   - otherwise `strings` must be an Array (TypeError via Check_Type if not)
 *     and the result is a new Array holding match_function's result for each
 *     element, in order. A non-String element raises TypeError.
 *
 * The index is a long to match the type of RARRAY_LEN, and all locals are
 * declared at the top of the function body.
 */
#define DEF_ITERATE_STRINGS(type)                                           \
static VALUE type##_iterate_strings(type *amatch, VALUE strings,            \
    VALUE (*match_function) (type *amatch, VALUE strings))                  \
{                                                                           \
    long i;                                                                 \
    VALUE result;                                                           \
    if (TYPE(strings) == T_STRING) {                                        \
        return match_function(amatch, strings);                             \
    }                                                                       \
    Check_Type(strings, T_ARRAY);                                           \
    result = rb_ary_new2(RARRAY_LEN(strings));                              \
    for (i = 0; i < RARRAY_LEN(strings); i++) {                             \
        VALUE string = rb_ary_entry(strings, i);                            \
        if (TYPE(string) != T_STRING) {                                     \
            rb_raise(rb_eTypeError,                                         \
                "array has to contain only strings (%s given)",             \
                NIL_P(string) ?                                             \
                    "NilClass" :                                            \
                    rb_class2name(CLASS_OF(string)));                       \
        }                                                                   \
        rb_ary_push(result, match_function(amatch, string));                \
    }                                                                       \
    return result;                                                          \
}
106
+
107
+ #define DEF_RB_READER(type, function, name, converter) \
108
+ VALUE function(VALUE self) \
109
+ { \
110
+ GET_STRUCT(type) \
111
+ return converter(amatch->name); \
112
+ }
113
+
114
+ #define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
115
+ VALUE function(VALUE self, VALUE value) \
116
+ { \
117
+ vtype value_ ## vtype; \
118
+ GET_STRUCT(type) \
119
+ caster(value); \
120
+ value_ ## vtype = converter(value); \
121
+ if (!(value_ ## vtype check)) \
122
+ rb_raise(rb_eTypeError, "check of value " #check " failed"); \
123
+ amatch->name = value_ ## vtype; \
124
+ return Qnil; \
125
+ }
126
+
127
+
128
+ #define CAST2FLOAT(obj) \
129
+ if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f)) \
130
+ obj = rb_funcall(obj, id_to_f, 0, 0); \
131
+ else \
132
+ Check_Type(obj, T_FLOAT)
133
+ #define FLOAT2C(obj) RFLOAT_VALUE(obj)
134
+
135
+ #define OPTIMIZE_TIME \
136
+ if (amatch->pattern_len < RSTRING_LEN(string)) { \
137
+ a_ptr = amatch->pattern; \
138
+ a_len = amatch->pattern_len; \
139
+ b_ptr = RSTRING_PTR(string); \
140
+ b_len = RSTRING_LEN(string); \
141
+ } else { \
142
+ a_ptr = RSTRING_PTR(string); \
143
+ a_len = RSTRING_LEN(string); \
144
+ b_ptr = amatch->pattern; \
145
+ b_len = amatch->pattern_len; \
146
+ }
147
+
148
+ #define DONT_OPTIMIZE \
149
+ a_ptr = amatch->pattern; \
150
+ a_len = amatch->pattern_len; \
151
+ b_ptr = RSTRING_PTR(string); \
152
+ b_len = RSTRING_LEN(string); \
153
+
154
+ /*
155
+ * C structures of the Amatch classes
156
+ */
157
+
158
+ typedef struct GeneralStruct {
159
+ char *pattern;
160
+ int pattern_len;
161
+ } General;
162
+
163
+ DEF_ALLOCATOR(General)
164
+ DEF_PATTERN_ACCESSOR(General)
165
+ DEF_ITERATE_STRINGS(General)
166
+
167
+ typedef struct SellersStruct {
168
+ char *pattern;
169
+ int pattern_len;
170
+ double substitution;
171
+ double deletion;
172
+ double insertion;
173
+ } Sellers;
174
+
175
+ DEF_ALLOCATOR(Sellers)
176
+ DEF_PATTERN_ACCESSOR(Sellers)
177
+ DEF_ITERATE_STRINGS(Sellers)
178
+
179
+ static void Sellers_reset_weights(Sellers *self)
180
+ {
181
+ self->substitution = 1.0;
182
+ self->deletion = 1.0;
183
+ self->insertion = 1.0;
184
+ }
185
+
186
+ typedef struct PairDistanceStruct {
187
+ char *pattern;
188
+ int pattern_len;
189
+ PairArray *pattern_pair_array;
190
+ } PairDistance;
191
+
192
+ DEF_ALLOCATOR(PairDistance)
193
+ DEF_PATTERN_ACCESSOR(PairDistance)
194
+
195
+ /*
196
+ * Levenshtein edit distances are computed here:
197
+ */
198
+
199
+ #define COMPUTE_LEVENSHTEIN_DISTANCE \
200
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
201
+ c = i % 2; /* current row */ \
202
+ p = (i + 1) % 2; /* previous row */ \
203
+ v[c][0] = i; /* first column */ \
204
+ for (j = 1; j <= b_len; j++) { \
205
+ /* Bellman's principle of optimality: */ \
206
+ weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
207
+ if (weight > v[p][j] + 1) { \
208
+ weight = v[p][j] + 1; \
209
+ } \
210
+ if (weight > v[c][j - 1] + 1) { \
211
+ weight = v[c][j - 1] + 1; \
212
+ } \
213
+ v[c][j] = weight; \
214
+ } \
215
+ p = c; \
216
+ c = (c + 1) % 2; \
217
+ }
218
+
219
+ static VALUE Levenshtein_match(General *amatch, VALUE string)
220
+ {
221
+ VALUE result;
222
+ char *a_ptr, *b_ptr;
223
+ int a_len, b_len;
224
+ int *v[2], weight;
225
+ int i, j, c, p;
226
+
227
+ Check_Type(string, T_STRING);
228
+ DONT_OPTIMIZE
229
+
230
+ v[0] = ALLOC_N(int, b_len + 1);
231
+ v[1] = ALLOC_N(int, b_len + 1);
232
+ for (i = 0; i <= b_len; i++) {
233
+ v[0][i] = i;
234
+ v[1][i] = i;
235
+ }
236
+
237
+ COMPUTE_LEVENSHTEIN_DISTANCE
238
+
239
+ result = INT2FIX(v[p][b_len]);
240
+
241
+ free(v[0]);
242
+ free(v[1]);
243
+
244
+ return result;
245
+ }
246
+
247
+ static VALUE Levenshtein_similar(General *amatch, VALUE string)
248
+ {
249
+ VALUE result;
250
+ char *a_ptr, *b_ptr;
251
+ int a_len, b_len;
252
+ int *v[2], weight;
253
+ int i, j, c, p;
254
+
255
+ Check_Type(string, T_STRING);
256
+ DONT_OPTIMIZE
257
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
258
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
259
+ v[0] = ALLOC_N(int, b_len + 1);
260
+ v[1] = ALLOC_N(int, b_len + 1);
261
+ for (i = 0; i <= b_len; i++) {
262
+ v[0][i] = i;
263
+ v[1][i] = i;
264
+ }
265
+
266
+ COMPUTE_LEVENSHTEIN_DISTANCE
267
+
268
+ if (b_len > a_len) {
269
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
270
+ } else {
271
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
272
+ }
273
+ free(v[0]);
274
+ free(v[1]);
275
+ return result;
276
+ }
277
+
278
+ static VALUE Levenshtein_search(General *amatch, VALUE string)
279
+ {
280
+ VALUE result;
281
+ char *a_ptr, *b_ptr;
282
+ int a_len, b_len;
283
+ int *v[2], weight, min;
284
+ int i, j, c, p;
285
+
286
+ Check_Type(string, T_STRING);
287
+ DONT_OPTIMIZE
288
+
289
+ v[0] = ALLOC_N(int, b_len + 1);
290
+ v[1] = ALLOC_N(int, b_len + 1);
291
+ MEMZERO(v[0], int, b_len + 1);
292
+ MEMZERO(v[1], int, b_len + 1);
293
+
294
+ COMPUTE_LEVENSHTEIN_DISTANCE
295
+
296
+ for (i = 0, min = a_len; i <= b_len; i++) {
297
+ if (v[p][i] < min) min = v[p][i];
298
+ }
299
+
300
+ result = INT2FIX(min);
301
+
302
+ free(v[0]);
303
+ free(v[1]);
304
+
305
+ return result;
306
+ }
307
+
308
+
309
+ /*
310
+ * Sellers edit distances are computed here:
311
+ */
312
+
313
+ #define COMPUTE_SELLERS_DISTANCE \
314
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
315
+ c = i % 2; /* current row */ \
316
+ p = (i + 1) % 2; /* previous row */ \
317
+ v[c][0] = i * amatch->deletion; /* first column */ \
318
+ for (j = 1; j <= b_len; j++) { \
319
+ /* Bellman's principle of optimality: */ \
320
+ weight = v[p][j - 1] + \
321
+ (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution); \
322
+ if (weight > v[p][j] + amatch->insertion) { \
323
+ weight = v[p][j] + amatch->insertion; \
324
+ } \
325
+ if (weight > v[c][j - 1] + amatch->deletion) { \
326
+ weight = v[c][j - 1] + amatch->deletion; \
327
+ } \
328
+ v[c][j] = weight; \
329
+ } \
330
+ p = c; \
331
+ c = (c + 1) % 2; \
332
+ }
333
+
334
+ static VALUE Sellers_match(Sellers *amatch, VALUE string)
335
+ {
336
+ VALUE result;
337
+ char *a_ptr, *b_ptr;
338
+ int a_len, b_len;
339
+ double *v[2], weight;
340
+ int i, j, c, p;
341
+
342
+ Check_Type(string, T_STRING);
343
+ DONT_OPTIMIZE
344
+
345
+ v[0] = ALLOC_N(double, b_len + 1);
346
+ v[1] = ALLOC_N(double, b_len + 1);
347
+ for (i = 0; i <= b_len; i++) {
348
+ v[0][i] = i * amatch->deletion;
349
+ v[1][i] = i * amatch->deletion;
350
+ }
351
+
352
+ COMPUTE_SELLERS_DISTANCE
353
+
354
+ result = rb_float_new(v[p][b_len]);
355
+ free(v[0]);
356
+ free(v[1]);
357
+ return result;
358
+ }
359
+
360
+ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
361
+ {
362
+ VALUE result;
363
+ char *a_ptr, *b_ptr;
364
+ int a_len, b_len;
365
+ double *v[2], weight, max_weight;
366
+ int i, j, c, p;
367
+
368
+ if (amatch->insertion >= amatch->deletion) {
369
+ if (amatch->substitution >= amatch->insertion) {
370
+ max_weight = amatch->substitution;
371
+ } else {
372
+ max_weight = amatch->insertion;
373
+ }
374
+ } else {
375
+ if (amatch->substitution >= amatch->deletion) {
376
+ max_weight = amatch->substitution;
377
+ } else {
378
+ max_weight = amatch->deletion;
379
+ }
380
+ }
381
+
382
+ Check_Type(string, T_STRING);
383
+ DONT_OPTIMIZE
384
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
385
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
386
+ v[0] = ALLOC_N(double, b_len + 1);
387
+ v[1] = ALLOC_N(double, b_len + 1);
388
+ for (i = 0; i <= b_len; i++) {
389
+ v[0][i] = i * amatch->deletion;
390
+ v[1][i] = i * amatch->deletion;
391
+ }
392
+
393
+ COMPUTE_SELLERS_DISTANCE
394
+
395
+ if (b_len > a_len) {
396
+ result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
397
+ } else {
398
+ result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
399
+ }
400
+ free(v[0]);
401
+ free(v[1]);
402
+ return result;
403
+ }
404
+
405
+ static VALUE Sellers_search(Sellers *amatch, VALUE string)
406
+ {
407
+ VALUE result;
408
+ char *a_ptr, *b_ptr;
409
+ int a_len, b_len;
410
+ double *v[2], weight, min;
411
+ int i, j, c, p;
412
+
413
+ Check_Type(string, T_STRING);
414
+ DONT_OPTIMIZE
415
+
416
+ v[0] = ALLOC_N(double, b_len + 1);
417
+ v[1] = ALLOC_N(double, b_len + 1);
418
+ MEMZERO(v[0], double, b_len + 1);
419
+ MEMZERO(v[1], double, b_len + 1);
420
+
421
+ COMPUTE_SELLERS_DISTANCE
422
+
423
+ for (i = 0, min = a_len; i <= b_len; i++) {
424
+ if (v[p][i] < min) min = v[p][i];
425
+ }
426
+ result = rb_float_new(min);
427
+ free(v[0]);
428
+ free(v[1]);
429
+
430
+ return result;
431
+ }
432
+
433
+ /*
434
+ * Pair distances are computed here:
435
+ */
436
+
437
+ static VALUE PairDistance_match(
438
+ PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
439
+ {
440
+ double result;
441
+ VALUE tokens;
442
+ PairArray *pair_array;
443
+
444
+ Check_Type(string, T_STRING);
445
+ if (!NIL_P(regexp) || use_regexp) {
446
+ tokens = rb_funcall(
447
+ rb_str_new(amatch->pattern, amatch->pattern_len),
448
+ id_split, 1, regexp
449
+ );
450
+ if (!amatch->pattern_pair_array) {
451
+ amatch->pattern_pair_array = PairArray_new(tokens);
452
+ } else {
453
+ pair_array_reactivate(amatch->pattern_pair_array);
454
+ }
455
+ tokens = rb_funcall(string, id_split, 1, regexp);
456
+ pair_array = PairArray_new(tokens);
457
+ } else {
458
+ VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
459
+ tokens = rb_ary_new4(1, &tmp);
460
+ if (!amatch->pattern_pair_array) {
461
+ amatch->pattern_pair_array = PairArray_new(tokens);
462
+ } else {
463
+ pair_array_reactivate(amatch->pattern_pair_array);
464
+ }
465
+ tokens = rb_ary_new4(1, &string);
466
+ pair_array = PairArray_new(tokens);
467
+ }
468
+ result = pair_array_match(amatch->pattern_pair_array, pair_array);
469
+ pair_array_destroy(pair_array);
470
+ return rb_float_new(result);
471
+ }
472
+
473
+ /*
474
+ * Hamming distances are computed here:
475
+ */
476
+
477
+ #define COMPUTE_HAMMING_DISTANCE \
478
+ for (i = 0, result = b_len - a_len; i < a_len; i++) { \
479
+ if (i >= b_len) { \
480
+ result += a_len - b_len; \
481
+ break; \
482
+ } \
483
+ if (b_ptr[i] != a_ptr[i]) result++; \
484
+ }
485
+
486
+ static VALUE Hamming_match(General *amatch, VALUE string)
487
+ {
488
+ char *a_ptr, *b_ptr;
489
+ int a_len, b_len;
490
+ int i, result;
491
+
492
+ Check_Type(string, T_STRING);
493
+ OPTIMIZE_TIME
494
+ COMPUTE_HAMMING_DISTANCE
495
+ return INT2FIX(result);
496
+ }
497
+
498
+ static VALUE Hamming_similar(General *amatch, VALUE string)
499
+ {
500
+ char *a_ptr, *b_ptr;
501
+ int a_len, b_len;
502
+ int i, result;
503
+
504
+ Check_Type(string, T_STRING);
505
+ OPTIMIZE_TIME
506
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
507
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
508
+ COMPUTE_HAMMING_DISTANCE
509
+ return rb_float_new(1.0 - ((double) result) / b_len);
510
+ }
511
+
512
+ /*
513
+ * Longest Common Subsequence computation
514
+ */
515
+
516
+ #define COMPUTE_LONGEST_SUBSEQUENCE \
517
+ l[0] = ALLOC_N(int, b_len + 1); \
518
+ l[1] = ALLOC_N(int, b_len + 1); \
519
+ for (i = a_len, c = 0, p = 1; i >= 0; i--) { \
520
+ for (j = b_len; j >= 0; j--) { \
521
+ if (i == a_len || j == b_len) { \
522
+ l[c][j] = 0; \
523
+ } else if (a_ptr[i] == b_ptr[j]) { \
524
+ l[c][j] = 1 + l[p][j + 1]; \
525
+ } else { \
526
+ int x = l[p][j], y = l[c][j + 1]; \
527
+ if (x > y) l[c][j] = x; else l[c][j] = y; \
528
+ } \
529
+ } \
530
+ p = c; \
531
+ c = (c + 1) % 2; \
532
+ } \
533
+ result = l[p][0]; \
534
+ free(l[0]); \
535
+ free(l[1]);
536
+
537
+
538
+ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
539
+ {
540
+ char *a_ptr, *b_ptr;
541
+ int a_len, b_len;
542
+ int result, c, p, i, j, *l[2];
543
+
544
+ Check_Type(string, T_STRING);
545
+ OPTIMIZE_TIME
546
+
547
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
548
+ COMPUTE_LONGEST_SUBSEQUENCE
549
+ return INT2FIX(result);
550
+ }
551
+
552
+ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
553
+ {
554
+ char *a_ptr, *b_ptr;
555
+ int a_len, b_len;
556
+ int result, c, p, i, j, *l[2];
557
+
558
+ Check_Type(string, T_STRING);
559
+ OPTIMIZE_TIME
560
+
561
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
562
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
563
+ COMPUTE_LONGEST_SUBSEQUENCE
564
+ return rb_float_new(((double) result) / b_len);
565
+ }
566
+
567
+ /*
568
+ * Longest Common Substring computation
569
+ */
570
+
571
+ #define COMPUTE_LONGEST_SUBSTRING \
572
+ l[0] = ALLOC_N(int, b_len); \
573
+ MEMZERO(l[0], int, b_len); \
574
+ l[1] = ALLOC_N(int, b_len); \
575
+ MEMZERO(l[1], int, b_len); \
576
+ result = 0; \
577
+ for (i = 0, c = 0, p = 1; i < a_len; i++) { \
578
+ for (j = 0; j < b_len; j++) { \
579
+ if (a_ptr[i] == b_ptr[j]) { \
580
+ l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1]; \
581
+ if (l[c][j] > result) result = l[c][j]; \
582
+ } else { \
583
+ l[c][j] = 0; \
584
+ } \
585
+ } \
586
+ p = c; \
587
+ c = (c + 1) % 2; \
588
+ } \
589
+ free(l[0]); \
590
+ free(l[1]);
591
+
592
+ static VALUE LongestSubstring_match(General *amatch, VALUE string)
593
+ {
594
+ char *a_ptr, *b_ptr;
595
+ int a_len, b_len;
596
+ int result, c, p, i, j, *l[2];
597
+
598
+ Check_Type(string, T_STRING);
599
+ OPTIMIZE_TIME
600
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
601
+ COMPUTE_LONGEST_SUBSTRING
602
+ return INT2FIX(result);
603
+ }
604
+
605
+ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
606
+ {
607
+ char *a_ptr, *b_ptr;
608
+ int a_len, b_len;
609
+ int result, c, p, i, j, *l[2];
610
+
611
+ Check_Type(string, T_STRING);
612
+ OPTIMIZE_TIME
613
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
614
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
615
+ COMPUTE_LONGEST_SUBSTRING
616
+ return rb_float_new(((double) result) / b_len);
617
+ }
618
+
619
+ /*
620
+ * Ruby API
621
+ */
622
+
623
+ /*
624
+ * Document-class: Amatch::Levenshtein
625
+ *
626
+ * The Levenshtein edit distance is defined as the minimal costs involved to
627
+ * transform one string into another by using three elementary operations:
628
+ * deletion, insertion and substitution of a character. To transform "water"
629
+ * into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
630
+ * -> "n": "winer" and delete "r": "wine". The edit distance between "water"
631
+ * and "wine" is 3, because you have to apply three operations. The edit
632
+ * distance between "wine" and "wine" is 0 of course: no operation is
633
+ * necessary for the transformation -- they're already the same string. It's
634
+ * easy to see that more similar strings have smaller edit distances than
635
+ * strings that differ a lot.
636
+ */
637
+
638
+ DEF_RB_FREE(Levenshtein, General)
639
+
640
+ /*
641
+ * call-seq: new(pattern)
642
+ *
643
+ * Creates a new Amatch::Levenshtein instance from <code>pattern</code>.
644
+ */
645
+ static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern)
646
+ {
647
+ GET_STRUCT(General)
648
+ General_pattern_set(amatch, pattern);
649
+ return self;
650
+ }
651
+
652
+ DEF_CONSTRUCTOR(Levenshtein, General)
653
+
654
+ /*
655
+ * call-seq: match(strings) -> results
656
+ *
657
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
658
+ * against <code>strings</code>. It returns the number of operations, the
659
+ * Levenshtein distance. <code>strings</code> has to be either a String or an
660
+ * Array of Strings. The returned <code>results</code> are either a Fixnum or
661
+ * an Array of Fixnums respectively.
662
+ */
663
+ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
664
+ {
665
+ GET_STRUCT(General)
666
+ return General_iterate_strings(amatch, strings, Levenshtein_match);
667
+ }
668
+
669
+ /*
670
+ * call-seq: similar(strings) -> results
671
+ *
672
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
673
+ * against <code>strings</code>, and compute a Levenshtein distance metric
674
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
675
+ * <code>strings</code> has to be either a String or an Array of Strings. The
676
+ * returned <code>results</code> are either a Float or an Array of Floats
677
+ * respectively.
678
+ */
679
+ static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
680
+ {
681
+ GET_STRUCT(General)
682
+ return General_iterate_strings(amatch, strings, Levenshtein_similar);
683
+ }
684
+
685
+ /*
686
+ * call-seq: levenshtein_similar(strings) -> results
687
+ *
688
+ * If called on a String, this string is used as a Amatch::Levenshtein#pattern
689
+ * to match against <code>strings</code>. It returns a Levenshtein distance
690
+ * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
691
+ * match. <code>strings</code> has to be either a String or an Array of
692
+ * Strings. The returned <code>results</code> are either a Float or an Array of
693
+ * Floats respectively.
694
+ */
695
+ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
696
+ {
697
+ VALUE amatch = rb_Levenshtein_new(rb_cLevenshtein, self);
698
+ return rb_Levenshtein_similar(amatch, strings);
699
+ }
700
+
701
+ /*
702
+ * call-seq: search(strings) -> results
703
+ *
704
+ * searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
705
+ * edit distance (the sum of character operations) as a Fixnum value, by greedy
706
+ * trimming prefixes or postfixes of the match. <code>strings</code> has
707
+ * to be either a String or an Array of Strings. The returned
708
+ * <code>results</code> are either a Fixnum or an Array of Fixnums respectively.
709
+ */
710
+ static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
711
+ {
712
+ GET_STRUCT(General)
713
+ return General_iterate_strings(amatch, strings, Levenshtein_search);
714
+ }
715
+
716
+ /*
717
+ * Document-class: Amatch::Sellers
718
+ *
719
+ * The Sellers edit distance is very similar to the Levenshtein edit distance.
720
+ * The difference is, that you can also specify different weights for every
721
+ * operation to prefer special operations over others. This extension of the
722
+ * Levenshtein edit distance is also known under the name Needleman-Wunsch
723
+ * distance.
724
+ */
725
+
726
+ DEF_RB_FREE(Sellers, Sellers)
727
+
728
+ /*
729
+ * Document-method: substitution
730
+ *
731
+ * call-seq: substitution -> weight
732
+ *
733
+ * Returns the weight of the substitution operation, that is used to compute
734
+ * the Sellers distance.
735
+ */
736
+ DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
737
+ rb_float_new)
738
+
739
+ /*
740
+ * Document-method: deletion
741
+ *
742
+ * call-seq: deletion -> weight
743
+ *
744
+ * Returns the weight of the deletion operation, that is used to compute
745
+ * the Sellers distance.
746
+ */
747
+ DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
748
+ rb_float_new)
749
+
750
+ /*
751
+ * Document-method: insertion
752
+ *
753
+ * call-seq: insertion -> weight
754
+ *
755
+ * Returns the weight of the insertion operation, that is used to compute
756
+ * the Sellers distance.
757
+ */
758
+ DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
759
+ rb_float_new)
760
+
761
+ /*
762
+ * Document-method: substitution=
763
+ *
764
+ * call-seq: substitution=(weight)
765
+ *
766
+ * Sets the weight of the substitution operation, that is used to compute
767
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
768
+ * should be a Float value >= 0.0.
769
+ */
770
+ DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
771
+ double, CAST2FLOAT, FLOAT2C, >= 0)
772
+
773
+ /*
774
+ * Document-method: deletion=
775
+ *
776
+ * call-seq: deletion=(weight)
777
+ *
778
+ * Sets the weight of the deletion operation, that is used to compute
779
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
780
+ * should be a Float value >= 0.0.
781
+ */
782
+ DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
783
+ double, CAST2FLOAT, FLOAT2C, >= 0)
784
+
785
+ /*
786
+ * Document-method: insertion=
787
+ *
788
+ * call-seq: insertion=(weight)
789
+ *
790
+ * Sets the weight of the insertion operation, that is used to compute
791
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
792
+ * should be a Float value >= 0.0.
793
+ */
794
+ DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
795
+ double, CAST2FLOAT, FLOAT2C, >= 0)
796
+
797
+ /*
798
+ * Resets all weights (substitution, deletion, and insertion) to 1.0.
799
+ */
800
+ static VALUE rb_Sellers_reset_weights(VALUE self)
801
+ {
802
+ GET_STRUCT(Sellers)
803
+ Sellers_reset_weights(amatch);
804
+ return self;
805
+ }
806
+
807
+ /*
808
+ * call-seq: new(pattern)
809
+ *
810
+ * Creates a new Amatch::Sellers instance from <code>pattern</code>,
811
+ * with all weights initially set to 1.0.
812
+ */
813
+ static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern)
814
+ {
815
+ GET_STRUCT(Sellers)
816
+ Sellers_pattern_set(amatch, pattern);
817
+ Sellers_reset_weights(amatch);
818
+ return self;
819
+ }
820
+
821
+ DEF_CONSTRUCTOR(Sellers, Sellers)
822
+
823
+ /*
824
+ * Document-method: pattern
825
+ *
826
+ * call-seq: pattern -> pattern string
827
+ *
828
+ * Returns the current pattern string of this Amatch::Sellers instance.
829
+ */
830
+
831
+ /*
832
+ * Document-method: pattern=
833
+ *
834
+ * call-seq: pattern=(pattern)
835
+ *
836
+ * Sets the current pattern string of this Amatch::Sellers instance to
837
+ * <code>pattern</code>.
838
+ */
839
+
840
+ /*
841
+ * call-seq: match(strings) -> results
842
+ *
843
+ * Uses this Amatch::Sellers instance to match Sellers#pattern against
844
+ * <code>strings</code>, while taking into account the given weights. It
845
+ * returns the number of weighted character operations, the Sellers distance.
846
+ * <code>strings</code> has to be either a String or an Array of Strings. The
847
+ * returned <code>results</code> are either a Float or an Array of Floats
848
+ * respectively.
+ *
+ * The per-string computation is done by Sellers_match, which is handed to
+ * Sellers_iterate_strings together with <code>strings</code>.
849
+ */
850
+ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
851
+ {
852
+ GET_STRUCT(Sellers)
853
+ return Sellers_iterate_strings(amatch, strings, Sellers_match);
854
+ }
855
+
856
+ /*
857
+ * call-seq: similar(strings) -> results
858
+ *
859
+ * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
860
+ * against <code>strings</code> (taking into account the given weights), and
861
+ * compute a Sellers distance metric number between 0.0 for very unsimilar
862
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
863
+ * String or an Array of Strings. The returned <code>results</code> are either
864
+ * a Float or an Array of Floats
865
+ * respectively.
866
+ */
867
+ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
868
+ {
869
+ GET_STRUCT(Sellers)
870
+ return Sellers_iterate_strings(amatch, strings, Sellers_similar);
871
+ }
872
+
873
+ /*
874
+ * call-seq: search(strings) -> results
875
+ *
876
+ * Searches Sellers#pattern in <code>strings</code> and returns the edit
877
+ * distance (the sum of weighted character operations) as a Float value, by
878
+ * greedily trimming prefixes or postfixes of the match. <code>strings</code> has
879
+ * to be either a String or an Array of Strings. The returned
880
+ * <code>results</code> are either a Float or an Array of Floats respectively.
881
+ */
882
+ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
883
+ {
884
+ GET_STRUCT(Sellers)
885
+ return Sellers_iterate_strings(amatch, strings, Sellers_search);
886
+ }
887
+
888
+ /*
889
+ * Document-class: Amatch::PairDistance
890
+ *
891
+ * The pair distance between two strings is based on the number of adjacent
892
+ * character pairs, that are contained in both strings. The similarity
893
+ * metric of two strings s1 and s2 is
894
+ * 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
895
+ * If it is 1.0 the two strings are an exact match, if less than 1.0 they
896
+ * are more dissimilar. The advantage of considering adjacent characters, is to
897
+ * take account not only of the characters, but also of the character ordering
898
+ * in the original strings.
899
+ *
900
+ * This metric is well suited to finding similarities in natural languages.
901
+ * It is explained in more detail in Simon White's article "How to Strike a
902
+ * Match", located at this url:
903
+ * http://www.catalysoft.com/articles/StrikeAMatch.html
904
+ * It is also very similar (a special case) to the method described under
905
+ * http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
906
+ * for Approximate String Processing."
907
+ */
908
+ DEF_RB_FREE(PairDistance, PairDistance)
909
+
910
+ /*
911
+ * call-seq: new(pattern)
912
+ *
913
+ * Creates a new Amatch::PairDistance instance from <code>pattern</code>.
+ * The pattern is stored with PairDistance_pattern_set; self is returned.
914
+ */
915
+ static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
916
+ {
917
+ GET_STRUCT(PairDistance)
918
+ PairDistance_pattern_set(amatch, pattern);
919
+ return self;
920
+ }
921
+
922
+ DEF_CONSTRUCTOR(PairDistance, PairDistance)
923
+
924
+ /*
925
+ * call-seq: match(strings, regexp = /\s+/) -> results
926
+ *
927
+ * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
928
+ * <code>strings</code>. It returns the pair distance measure, that is a
929
+ * returned value of 1.0 is an exact match, partial matches are lower
930
+ * values, while 0.0 means no match at all.
931
+ *
932
+ * <code>strings</code> has to be either a String or an
933
+ * Array of Strings. The argument <code>regexp</code> is used to split the
934
+ * pattern and strings into tokens first. It defaults to /\s+/. If the
935
+ * splitting should be omitted, call the method with nil as <code>regexp</code>
936
+ * explicitly.
937
+ *
938
+ * The returned <code>results</code> are either a Float or an
939
+ * Array of Floats respectively.
+ *
+ * Raises a TypeError if <code>strings</code> is an Array containing an
+ * element that is not a String. After all strings have been processed, the
+ * pattern's pair array is destroyed and the pointer is reset to NULL.
940
+ */
941
+ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
942
+ {
943
+ VALUE result, strings, regexp = Qnil;
944
+ int use_regexp;
945
+ GET_STRUCT(PairDistance)
946
+
947
+ rb_scan_args(argc, argv, "11", &strings, &regexp);
948
+ use_regexp = NIL_P(regexp) && argc != 2; /* true only if regexp was omitted, not if nil was passed explicitly */
949
+ if (TYPE(strings) == T_STRING) {
950
+ result = PairDistance_match(amatch, strings, regexp, use_regexp);
951
+ } else {
952
+ Check_Type(strings, T_ARRAY);
953
+ int i;
954
+ result = rb_ary_new2(RARRAY_LEN(strings));
955
+ for (i = 0; i < RARRAY_LEN(strings); i++) {
956
+ VALUE string = rb_ary_entry(strings, i);
957
+ if (TYPE(string) != T_STRING) {
958
+ rb_raise(rb_eTypeError,
959
+ "array has to contain only strings (%s given)",
960
+ NIL_P(string) ?
961
+ "NilClass" :
962
+ rb_class2name(CLASS_OF(string)));
963
+ }
964
+ rb_ary_push(result,
965
+ PairDistance_match(amatch, string, regexp, use_regexp));
966
+ }
967
+ }
968
+ pair_array_destroy(amatch->pattern_pair_array);
969
+ amatch->pattern_pair_array = NULL;
970
+ return result;
971
+ }
972
+
973
+ /*
974
+ * call-seq: pair_distance_similar(strings) -> results
975
+ *
976
+ * If called on a String, this string is used as an Amatch::PairDistance#pattern
977
+ * to match against <code>strings</code> using /\s+/ as the tokenizing regular
978
+ * expression. It returns a pair distance metric number between 0.0 for very
979
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
980
+ * either a String or an Array of Strings. The returned <code>results</code>
981
+ * are either a Float or an Array of Floats respectively.
+ *
+ * A temporary Amatch::PairDistance object is created for the receiver and
+ * its match method is called with <code>strings</code> as the only argument,
+ * so the default tokenizing applies.
982
+ */
983
+ static VALUE rb_str_pair_distance_similar(VALUE self, VALUE strings)
984
+ {
985
+ VALUE amatch = rb_PairDistance_new(rb_cPairDistance, self);
986
+ return rb_PairDistance_match(1, &strings, amatch);
987
+ }
988
+
989
+ /*
990
+ * Document-class: Amatch::Hamming
991
+ *
992
+ * This class computes the Hamming distance between two strings.
993
+ *
994
+ * The Hamming distance between two strings is the number of characters, that
995
+ * are different. Thus a hamming distance of 0 means an exact
996
+ * match, a hamming distance of 1 means one character is different, and so on.
997
+ * If one string is longer than the other string, the missing characters are
998
+ * counted as different characters.
999
+ */
1000
+
1001
+ DEF_RB_FREE(Hamming, General)
1002
+
1003
+ /*
1004
+ * call-seq: new(pattern)
1005
+ *
1006
+ * Creates a new Amatch::Hamming instance from <code>pattern</code>.
+ * Hamming has no struct of its own: the pattern is stored in the shared
+ * General struct via General_pattern_set. Returns self.
1007
+ */
1008
+ static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
1009
+ {
1010
+ GET_STRUCT(General)
1011
+ General_pattern_set(amatch, pattern);
1012
+ return self;
1013
+ }
1014
+
1015
+ DEF_CONSTRUCTOR(Hamming, General)
1016
+
1017
+ /*
1018
+ * call-seq: match(strings) -> results
1019
+ *
1020
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1021
+ * <code>strings</code>, that is compute the hamming distance between
1022
+ * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
1023
+ * be either a String or an Array of Strings. The returned <code>results</code>
1024
+ * are either a Fixnum or an Array of Fixnums respectively.
+ *
+ * The per-string computation is done by Hamming_match, which is handed to
+ * General_iterate_strings together with <code>strings</code>.
1025
+ */
1026
+ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1027
+ {
1028
+ GET_STRUCT(General)
1029
+ return General_iterate_strings(amatch, strings, Hamming_match);
1030
+ }
1031
+
1032
+ /*
1033
+ * call-seq: similar(strings) -> results
1034
+ *
1035
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1036
+ * <code>strings</code>, and compute a Hamming distance metric number between
1037
+ * 0.0 for very unsimilar strings and 1.0 for an exact match.
1038
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1039
+ * returned <code>results</code> are either a Float or an Array of Floats
1040
+ * respectively.
1041
+ */
1042
+ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1043
+ {
1044
+ GET_STRUCT(General)
1045
+ return General_iterate_strings(amatch, strings, Hamming_similar);
1046
+ }
1047
+
1048
+ /*
1049
+ * call-seq: hamming_similar(strings) -> results
1050
+ *
1051
+ * If called on a String, this string is used as an Amatch::Hamming#pattern to
1052
+ * match against <code>strings</code>. It returns a Hamming distance metric
1053
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1054
+ * <code>strings</code>
1055
+ * has to be either a String or an Array of Strings. The returned
1056
+ * <code>results</code> are either a Float or an Array of Floats respectively.
+ *
+ * A temporary Amatch::Hamming object is created for the receiver and its
+ * similar method supplies the result.
1057
+ */
1058
+ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
1059
+ {
1060
+ VALUE amatch = rb_Hamming_new(rb_cHamming, self);
1061
+ return rb_Hamming_similar(amatch, strings);
1062
+ }
1063
+
1064
+
1065
+ /*
1066
+ * Document-class: Amatch::LongestSubsequence
1067
+ *
1068
+ * This class computes the length of the longest subsequence common to two
1069
+ * strings. A subsequence doesn't have to be contiguous. The longer the common
1070
+ * subsequence is, the more similar the two strings will be.
1071
+ *
1072
+ * The longest common subsequence between "test" and "test" is of length 4,
1073
+ * because "test" itself is this subsequence. The longest common subsequence
1074
+ * between "test" and "east" is "e", "s", "t" and the length of the
1075
+ * sequence is 3.
1076
+ */
1077
+ DEF_RB_FREE(LongestSubsequence, General)
1078
+
1079
+ /*
1080
+ * call-seq: new(pattern)
1081
+ *
1082
+ * Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
+ * LongestSubsequence has no struct of its own: the pattern is stored in the
+ * shared General struct via General_pattern_set. Returns self.
1083
+ */
1084
+ static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
1085
+ {
1086
+ GET_STRUCT(General)
1087
+ General_pattern_set(amatch, pattern);
1088
+ return self;
1089
+ }
1090
+
1091
+ DEF_CONSTRUCTOR(LongestSubsequence, General)
1092
+
1093
+ /*
1094
+ * call-seq: match(strings) -> results
1095
+ *
1096
+ * Uses this Amatch::LongestSubsequence instance to match
1097
+ * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1098
+ * length of the longest common subsequence. <code>strings</code> has to be
1099
+ * either a String or an Array of Strings. The returned <code>results</code>
1100
+ * are either a Fixnum or an Array of Fixnums respectively.
+ *
+ * The per-string computation is done by LongestSubsequence_match, which is
+ * handed to General_iterate_strings together with <code>strings</code>.
1101
+ */
1102
+ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1103
+ {
1104
+ GET_STRUCT(General)
1105
+ return General_iterate_strings(amatch, strings, LongestSubsequence_match);
1106
+ }
1107
+
1108
+ /*
1109
+ * call-seq: similar(strings) -> results
1110
+ *
1111
+ * Uses this Amatch::LongestSubsequence instance to match
1112
+ * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1113
+ * a longest subsequence distance metric number between 0.0 for very unsimilar
1114
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1115
+ * String or an Array of Strings. The returned <code>results</code> are either
1116
+ * a Float or an Array of Floats respectively.
1117
+ */
1118
+ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1119
+ {
1120
+ GET_STRUCT(General)
1121
+ return General_iterate_strings(amatch, strings, LongestSubsequence_similar);
1122
+ }
1123
+
1124
+ /*
1125
+ * call-seq: longest_subsequence_similar(strings) -> results
1126
+ *
1127
+ * If called on a String, this string is used as an
1128
+ * Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
1129
+ * returns a longest subsequence distance metric number between 0.0 for very
1130
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1131
+ * either a String or an Array of Strings. The returned <code>results</code>
1132
+ * are either a Float or an Array of Floats respectively.
+ *
+ * A temporary Amatch::LongestSubsequence object is created for the receiver
+ * and its similar method supplies the result.
1133
+ */
1134
+ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1135
+ {
1136
+ VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self);
1137
+ return rb_LongestSubsequence_similar(amatch, strings);
1138
+ }
1139
+
1140
+ /*
1141
+ * Document-class: Amatch::LongestSubstring
1142
+ *
1143
+ * The longest common substring is the longest substring, that is part of
1144
+ * two strings. A substring is contiguous, while a subsequence need not
1145
+ * be. The longer the common substring is, the more similar the two strings
1146
+ * will be.
1147
+ *
1148
+ * The longest common substring between 'string' and 'string' is 'string'
1149
+ * again, thus the longest common substring length is 6. The longest common
1150
+ * substring between 'string' and 'storing' is 'ring', thus the longest common
1151
+ * substring length is 4.
1152
+ */
1153
+
1154
+ DEF_RB_FREE(LongestSubstring, General)
1155
+
1156
+ /*
1157
+ * call-seq: new(pattern)
1158
+ *
1159
+ * Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
+ * LongestSubstring has no struct of its own: the pattern is stored in the
+ * shared General struct via General_pattern_set. Returns self.
1160
+ */
1161
+ static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
1162
+ {
1163
+ GET_STRUCT(General)
1164
+ General_pattern_set(amatch, pattern);
1165
+ return self;
1166
+ }
1167
+
1168
+ DEF_CONSTRUCTOR(LongestSubstring, General)
1169
+
1170
+ /*
1171
+ * call-seq: match(strings) -> results
1172
+ *
1173
+ * Uses this Amatch::LongestSubstring instance to match
1174
+ * LongestSubstring#pattern against <code>strings</code>, that is compute the
1175
+ * length of the longest common substring. <code>strings</code> has to be
1176
+ * either a String or an Array of Strings. The returned <code>results</code>
1177
+ * are either a Fixnum or an Array of Fixnums respectively.
+ *
+ * The per-string computation is done by LongestSubstring_match, which is
+ * handed to General_iterate_strings together with <code>strings</code>.
1178
+ */
1179
+ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1180
+ {
1181
+ GET_STRUCT(General)
1182
+ return General_iterate_strings(amatch, strings, LongestSubstring_match);
1183
+ }
1184
+
1185
+ /*
1186
+ * call-seq: similar(strings) -> results
1187
+ *
1188
+ * Uses this Amatch::LongestSubstring instance to match
1189
+ * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1190
+ * longest substring distance metric number between 0.0 for very unsimilar
1191
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1192
+ * String or an Array of Strings. The returned <code>results</code> are either
1193
+ * a Float or an Array of Floats
1194
+ * respectively.
1195
+ */
1196
+ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
1197
+ {
1198
+ GET_STRUCT(General)
1199
+ return General_iterate_strings(amatch, strings, LongestSubstring_similar);
1200
+ }
1201
+
1202
+ /*
1203
+ * call-seq: longest_substring_similar(strings) -> results
1204
+ *
1205
+ * If called on a String, this string is used as an
1206
+ * Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
1207
+ * returns a longest substring distance metric number between 0.0 for very
1208
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1209
+ * either a String or an Array of Strings. The returned <code>results</code>
1210
+ * are either a Float or an Array of Floats respectively.
+ *
+ * A temporary Amatch::LongestSubstring object is created for the receiver
+ * and its similar method supplies the result.
1211
+ */
1212
+ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1213
+ {
1214
+ VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
1215
+ return rb_LongestSubstring_similar(amatch, strings);
1216
+ }
1217
+
1218
+ /*
1219
+ * = amatch - Approximate Matching Extension for Ruby
1220
+ *
1221
+ * == Description
1222
+ *
1223
+ * This is a collection of classes that can be used for Approximate
1224
+ * matching, searching, and comparing of Strings. They implement algorithms
1225
+ * that compute the Levenshtein edit distance, Sellers edit distance, the
1226
+ * Hamming distance, the longest common subsequence length, the longest common
1227
+ * substring length, and the pair distance metric.
1228
+ *
1229
+ * == Author
1230
+ *
1231
+ * Florian Frank mailto:flori@ping.de
1232
+ *
1233
+ * == License
1234
+ *
1235
+ * This is free software; you can redistribute it and/or modify it under
1236
+ * the terms of the GNU General Public License Version 2 as published by
1237
+ * the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
1238
+ *
1239
+ * == Download
1240
+ *
1241
+ * The latest version of <b>amatch</b> can be found at
1242
+ *
1243
+ * * http://rubyforge.org/frs/?group_id=390
1244
+ *
1245
+ * Online Documentation should be located at
1246
+ *
1247
+ * * http://amatch.rubyforge.org
1248
+ *
1249
+ * == Examples
1250
+ * require 'amatch'
1251
+ * # => true
1252
+ * include Amatch
1253
+ * # => Object
1254
+ *
1255
+ * m = Sellers.new("pattern")
1256
+ * # => #<Amatch::Sellers:0x40366324>
1257
+ * m.match("pattren")
1258
+ * # => 2.0
1259
+ * m.substitution = m.insertion = 3
1260
+ * # => 3
1261
+ * m.match("pattren")
1262
+ * # => 4.0
1263
+ * m.reset_weights
1264
+ * # => #<Amatch::Sellers:0x40366324>
1265
+ * m.match(["pattren","parent"])
1266
+ * # => [2.0, 4.0]
1267
+ * m.search("abcpattrendef")
1268
+ * # => 2.0
1269
+ *
1270
+ * m = Levenshtein.new("pattern")
1271
+ * # => #<Amatch::Levenshtein:0x4035919c>
1272
+ * m.match("pattren")
1273
+ * # => 2
1274
+ * m.search("abcpattrendef")
1275
+ * # => 2
1276
+ * "pattern language".levenshtein_similar("language of patterns")
1277
+ * # => 0.2
1278
+ *
1279
+ * m = Hamming.new("pattern")
1280
+ * # => #<Amatch::Hamming:0x40350858>
1281
+ * m.match("pattren")
1282
+ * # => 2
1283
+ * "pattern language".hamming_similar("language of patterns")
1284
+ * # => 0.1
1285
+ *
1286
+ * m = PairDistance.new("pattern")
1287
+ * # => #<Amatch::PairDistance:0x40349be8>
1288
+ * m.match("pattr en")
1289
+ * # => 0.545454545454545
1290
+ * m.match("pattr en", nil)
1291
+ * # => 0.461538461538462
1292
+ * m.match("pattr en", /t+/)
1293
+ * # => 0.285714285714286
1294
+ * "pattern language".pair_distance_similar("language of patterns")
1295
+ * # => 0.928571428571429
1296
+ *
1297
+ * m = LongestSubsequence.new("pattern")
1298
+ * # => #<Amatch::LongestSubsequence:0x4033e900>
1299
+ * m.match("pattren")
1300
+ * # => 6
1301
+ * "pattern language".longest_subsequence_similar("language of patterns")
1302
+ * # => 0.4
1303
+ *
1304
+ * m = LongestSubstring.new("pattern")
1305
+ * # => #<Amatch::LongestSubstring:0x403378d0>
1306
+ * m.match("pattren")
1307
+ * # => 4
1308
+ * "pattern language".longest_substring_similar("language of patterns")
1309
+ * # => 0.4
1310
+ *
1311
+ */
1312
+
1313
+ void Init_amatch() /* extension entry point: defines module Amatch, its classes and the String similarity helpers */
1314
+ {
1315
+ rb_mAmatch = rb_define_module("Amatch");
1316
+
1317
+ /* Levenshtein: match, search and similar; pattern accessors are the shared rb_General_* functions */
1318
+ rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
1319
+ rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
1320
+ rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
1321
+ rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0);
1322
+ rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
1323
+ rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
1324
+ rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
1325
+ rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1326
+ rb_define_method(rb_cString, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1327
+
1328
+ /* Sellers: own pattern accessors plus weight readers/writers and reset_weights; no String helper is registered */
1329
+ rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1330
+ rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
1331
+ rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
1332
+ rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0);
1333
+ rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
1334
+ rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
1335
+ rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
1336
+ rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
1337
+ rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
1338
+ rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
1339
+ rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
1340
+ rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
1341
+ rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
1342
+ rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
1343
+ rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1);
1344
+
1345
+ /* Hamming: match and similar only (no search); shared rb_General_* pattern accessors */
1346
+ rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
1347
+ rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
1348
+ rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
1349
+ rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
1350
+ rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
1351
+ rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
1352
+ rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
1353
+ rb_define_method(rb_cString, "hamming_similar", rb_str_hamming_similar, 1);
1354
+
1355
+ /* Pair Distance Metric: match takes a variable argument list (arity -1); similar is an alias of match */
1356
+ rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
1357
+ rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
1358
+ rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
1359
+ rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
1360
+ rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
1361
+ rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
1362
+ rb_define_alias(rb_cPairDistance, "similar", "match");
1363
+ rb_define_method(rb_cString, "pair_distance_similar", rb_str_pair_distance_similar, 1);
1364
+
1365
+ /* Longest Common Subsequence: match and similar; shared rb_General_* pattern accessors */
1366
+ rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
1367
+ rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
1368
+ rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
1369
+ rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
1370
+ rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
1371
+ rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
1372
+ rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
1373
+ rb_define_method(rb_cString, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1374
+
1375
+ /* Longest Common Substring: match and similar; shared rb_General_* pattern accessors */
1376
+ rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
1377
+ rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
1378
+ rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
1379
+ rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
1380
+ rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
1381
+ rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
1382
+ rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1383
+ rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1384
+
1385
+ id_split = rb_intern("split"); /* method IDs are interned once here; presumably used by the matching helpers outside this view */
1386
+ id_to_f = rb_intern("to_f");
1387
+ }
1388
+ /* vim: set et cin sw=4 ts=4: */