mumboe-amatch 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.2.3
@@ -0,0 +1,83 @@
1
+ #! /usr/bin/env ruby
2
+ # vim: set et sw=2 ts=2:
3
+ #
4
+ ## $Id: agrep.rb,v 1.5 2006/09/26 15:59:48 flori Exp $
5
+ #
6
+
7
+ require 'amatch'
8
+ require 'getoptlong'
9
+
10
+ def usage(msg, options)
11
+ puts msg, "Usage: #{File.basename($0)} [OPTIONS] PATTERN [FILE ...]", ""
12
+ options.each do |o|
13
+ puts " " + o[1] + ", " + o[0] + " " +
14
+ (o[2] == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : '')
15
+ end
16
+ puts "\nReport bugs to <flori@ping.de>."
17
+ exit 0
18
+ end
19
+
20
+ class Amatch::Levenshtein
21
+ def search_relative(strings)
22
+ search(strings).to_f / pattern.size
23
+ end
24
+ end
25
+
26
+ $distance = 1
27
+ $mode = :search
28
+ begin
29
+ parser = GetoptLong.new
30
+ options = [
31
+ [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
32
+ [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
33
+ [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
34
+ [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
35
+ ]
36
+ parser.set_options(*options)
37
+ parser.each_option do |name, arg|
38
+ name = name.sub(/^--/, '')
39
+ case name
40
+ when 'distance'
41
+ $distance = arg.to_f
42
+ when 'relative'
43
+ $mode = :search_relative
44
+ when 'verbose'
45
+ $verbose = 1
46
+ when 'help'
47
+ usage('You\'ve asked for it!', options)
48
+ end
49
+ end
50
+ rescue
51
+ exit 1
52
+ end
53
+ pattern = ARGV.shift or usage('Pattern needed!', options)
54
+
55
+ matcher = Amatch::Levenshtein.new(pattern)
56
+ size = 0
57
+ start = Time.new
58
+ if ARGV.size > 0 then
59
+ ARGV.each do |filename|
60
+ File.stat(filename).file? or next
61
+ size += File.size(filename)
62
+ begin
63
+ File.open(filename, 'r').each_line do |line|
64
+ if matcher.__send__($mode, line) <= $distance
65
+ puts "#{filename}:#{line}"
66
+ end
67
+ end
68
+ rescue
69
+ STDERR.puts "Failure at #{filename}: #{$!} => Skipping!"
70
+ end
71
+ end
72
+ else
73
+ STDIN.each_line do |line|
74
+ size += line.size
75
+ if matcher.__send__($mode, line) <= $distance
76
+ puts line
77
+ end
78
+ end
79
+ end
80
+ time = Time.new - start
81
+ $verbose and STDERR.printf "%.3f secs running, scanned %.3f KB/s.\n",
82
+ time, size / time / 1024
83
+ exit 0
@@ -0,0 +1,2 @@
1
+ amatch.c
2
+ extconf.rb
@@ -0,0 +1,149 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ #### Start of system configuration section. ####
5
+
6
+ srcdir = .
7
+ topdir = /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0
8
+ hdrdir = $(topdir)
9
+ VPATH = $(srcdir):$(topdir):$(hdrdir)
10
+ prefix = $(DESTDIR)/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr
11
+ exec_prefix = $(prefix)
12
+ sitedir = $(DESTDIR)/Library/Ruby/Site
13
+ rubylibdir = $(libdir)/ruby/$(ruby_version)
14
+ docdir = $(datarootdir)/doc/$(PACKAGE)
15
+ dvidir = $(docdir)
16
+ datarootdir = $(prefix)/share
17
+ archdir = $(rubylibdir)/$(arch)
18
+ sbindir = $(exec_prefix)/sbin
19
+ psdir = $(docdir)
20
+ localedir = $(datarootdir)/locale
21
+ htmldir = $(docdir)
22
+ datadir = $(datarootdir)
23
+ includedir = $(prefix)/include
24
+ infodir = $(DESTDIR)/usr/share/info
25
+ sysconfdir = $(prefix)/etc
26
+ mandir = $(DESTDIR)/usr/share/man
27
+ libdir = $(exec_prefix)/lib
28
+ sharedstatedir = $(prefix)/com
29
+ oldincludedir = $(DESTDIR)/usr/include
30
+ pdfdir = $(docdir)
31
+ sitearchdir = $(sitelibdir)/$(sitearch)
32
+ bindir = $(exec_prefix)/bin
33
+ localstatedir = $(prefix)/var
34
+ sitelibdir = $(sitedir)/$(ruby_version)
35
+ libexecdir = $(exec_prefix)/libexec
36
+
37
+ CC = gcc -Wall
38
+ LIBRUBY = $(LIBRUBY_SO)
39
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
40
+ LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
41
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)
42
+
43
+ RUBY_EXTCONF_H =
44
+ CFLAGS = -fno-common -arch ppc -arch i386 -Os -pipe -fno-common
45
+ INCFLAGS = -I. -I$(topdir) -I$(hdrdir) -I$(srcdir)
46
+ CPPFLAGS =
47
+ CXXFLAGS = $(CFLAGS)
48
+ DLDFLAGS = -L. -arch ppc -arch i386
49
+ LDSHARED = cc -arch ppc -arch i386 -pipe -bundle -undefined dynamic_lookup
50
+ AR = ar
51
+ EXEEXT =
52
+
53
+ RUBY_INSTALL_NAME = ruby
54
+ RUBY_SO_NAME = ruby
55
+ arch = universal-darwin9.0
56
+ sitearch = universal-darwin9.0
57
+ ruby_version = 1.8
58
+ ruby = /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/bin/ruby
59
+ RUBY = $(ruby)
60
+ RM = rm -f
61
+ MAKEDIRS = mkdir -p
62
+ INSTALL = /usr/bin/install -c
63
+ INSTALL_PROG = $(INSTALL) -m 0755
64
+ INSTALL_DATA = $(INSTALL) -m 644
65
+ COPY = cp
66
+
67
+ #### End of system configuration section. ####
68
+
69
+ preload =
70
+
71
+ libpath = . $(libdir)
72
+ LIBPATH = -L"." -L"$(libdir)"
73
+ DEFFILE =
74
+
75
+ CLEANFILES = mkmf.log
76
+ DISTCLEANFILES =
77
+
78
+ extout =
79
+ extout_prefix =
80
+ target_prefix =
81
+ LOCAL_LIBS =
82
+ LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lm
83
+ SRCS = amatch.c pair.c
84
+ OBJS = amatch.o pair.o
85
+ TARGET = amatch
86
+ DLLIB = $(TARGET).bundle
87
+ EXTSTATIC =
88
+ STATIC_LIB =
89
+
90
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
91
+ RUBYLIBDIR = /Library/Ruby/Gems/1.8/gems/amatch-0.2.3/ext$(target_prefix)
92
+ RUBYARCHDIR = /Library/Ruby/Gems/1.8/gems/amatch-0.2.3/ext$(target_prefix)
93
+
94
+ TARGET_SO = $(DLLIB)
95
+ CLEANLIBS = $(TARGET).bundle $(TARGET).il? $(TARGET).tds $(TARGET).map
96
+ CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak
97
+
98
+ all: $(DLLIB)
99
+ static: $(STATIC_LIB)
100
+
101
+ clean:
102
+ @-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)
103
+
104
+ distclean: clean
105
+ @-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
106
+ @-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
107
+
108
+ realclean: distclean
109
+ install: install-so install-rb
110
+
111
+ install-so: $(RUBYARCHDIR)
112
+ install-so: $(RUBYARCHDIR)/$(DLLIB)
113
+ $(RUBYARCHDIR)/$(DLLIB): $(DLLIB)
114
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
115
+ install-rb: pre-install-rb install-rb-default
116
+ install-rb-default: pre-install-rb-default
117
+ pre-install-rb: Makefile
118
+ pre-install-rb-default: Makefile
119
+ $(RUBYARCHDIR):
120
+ $(MAKEDIRS) $@
121
+
122
+ site-install: site-install-so site-install-rb
123
+ site-install-so: install-so
124
+ site-install-rb: install-rb
125
+
126
+ .SUFFIXES: .c .m .cc .cxx .cpp .C .o
127
+
128
+ .cc.o:
129
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
130
+
131
+ .cxx.o:
132
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
133
+
134
+ .cpp.o:
135
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
136
+
137
+ .C.o:
138
+ $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<
139
+
140
+ .c.o:
141
+ $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) -c $<
142
+
143
+ $(DLLIB): $(OBJS)
144
+ @-$(RM) $@
145
+ $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
146
+
147
+
148
+
149
+ $(OBJS): ruby.h defines.h
Binary file
@@ -0,0 +1,1388 @@
1
+ #include "ruby.h"
2
+ #include "pair.h"
3
+
4
+ /*
5
+ * Document-method: pattern
6
+ *
7
+ * call-seq: pattern -> pattern string
8
+ *
9
+ * Returns the current pattern string of this instance.
10
+ */
11
+
12
+ /*
13
+ * Document-method: pattern=
14
+ *
15
+ * call-seq: pattern=(pattern)
16
+ *
17
+ * Sets the current pattern string of this instance to <code>pattern</code>.
18
+ */
19
+
20
+
21
+ static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
22
+ rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring;
23
+
24
+ static ID id_split, id_to_f;
25
+ /* Declares a local `klass *amatch` and fills it from the wrapped Ruby object `self` via Data_Get_Struct; a variable named `self` must be in scope. */
26
+ #define GET_STRUCT(klass) \
27
+ klass *amatch; \
28
+ Data_Get_Struct(self, klass, amatch);
29
+ /* Defines `<type>_allocate()`, which ALLOCs one `type` and zero-fills it with MEMZERO before returning it. */
30
+ #define DEF_ALLOCATOR(type) \
31
+ static type *type##_allocate() \
32
+ { \
33
+ type *obj = ALLOC(type); \
34
+ MEMZERO(obj, type, 1); \
35
+ return obj; \
36
+ }
37
+ /* Defines `rb_<klass>_s_allocate` (wraps a fresh zeroed `type` with `rb_<klass>_free` as the free function and NULL in the mark slot) and `rb_<klass>_new`, which allocates and then calls `rb_<klass>_initialize`. */
38
+ #define DEF_CONSTRUCTOR(klass, type) \
39
+ static VALUE rb_##klass##_s_allocate(VALUE klass2) \
40
+ { \
41
+ type *amatch = type##_allocate(); \
42
+ return Data_Wrap_Struct(klass2, NULL, rb_##klass##_free, amatch); \
43
+ } \
44
+ VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
45
+ { \
46
+ VALUE obj = rb_##klass##_s_allocate(klass2); \
47
+ rb_##klass##_initialize(obj, pattern); \
48
+ return obj; \
49
+ }
50
+ /* Defines `rb_<klass>_free`: zeroes and frees the pattern buffer, then zeroes and frees the struct itself. */
51
+ #define DEF_RB_FREE(klass, type) \
52
+ static void rb_##klass##_free(type *amatch) \
53
+ { \
54
+ MEMZERO(amatch->pattern, char, amatch->pattern_len); \
55
+ free(amatch->pattern); \
56
+ MEMZERO(amatch, type, 1); \
57
+ free(amatch); \
58
+ }
59
+ /* Defines `<type>_pattern_set` (type-checks the String, frees the old buffer, stores a copy of the new bytes and their length) plus the Ruby-level wrappers `rb_<type>_pattern` (returns a new String) and `rb_<type>_pattern_set` (returns nil). */
60
+ #define DEF_PATTERN_ACCESSOR(type) \
61
+ static void type##_pattern_set(type *amatch, VALUE pattern) \
62
+ { \
63
+ Check_Type(pattern, T_STRING); \
64
+ free(amatch->pattern); \
65
+ amatch->pattern_len = RSTRING_LEN(pattern); \
66
+ amatch->pattern = ALLOC_N(char, amatch->pattern_len); /* exactly pattern_len bytes, no terminating NUL */ \
67
+ MEMCPY(amatch->pattern, RSTRING_PTR(pattern), char, \
68
+ RSTRING_LEN(pattern)); \
69
+ } \
70
+ static VALUE rb_##type##_pattern(VALUE self) \
71
+ { \
72
+ GET_STRUCT(type) \
73
+ return rb_str_new(amatch->pattern, amatch->pattern_len); \
74
+ } \
75
+ static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern) \
76
+ { \
77
+ GET_STRUCT(type) \
78
+ type##_pattern_set(amatch, pattern); \
79
+ return Qnil; \
80
+ }
81
+ /* Defines `<type>_iterate_strings`: applies match_function to a single String, or to every element of an Array (raising TypeError on the first non-String element) and returns the per-element results as a new Array. */
82
+ #define DEF_ITERATE_STRINGS(type) \
83
+ static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
84
+ VALUE (*match_function) (type *amatch, VALUE strings)) \
85
+ { \
86
+ if (TYPE(strings) == T_STRING) { \
87
+ return match_function(amatch, strings); \
88
+ } else { \
89
+ Check_Type(strings, T_ARRAY); \
90
+ int i; \
91
+ VALUE result = rb_ary_new2(RARRAY_LEN(strings)); \
92
+ for (i = 0; i < RARRAY_LEN(strings); i++) { \
93
+ VALUE string = rb_ary_entry(strings, i); \
94
+ if (TYPE(string) != T_STRING) { \
95
+ rb_raise(rb_eTypeError, \
96
+ "array has to contain only strings (%s given)", \
97
+ NIL_P(string) ? \
98
+ "NilClass" : \
99
+ rb_class2name(CLASS_OF(string))); \
100
+ } \
101
+ rb_ary_push(result, match_function(amatch, string)); \
102
+ } \
103
+ return result; \
104
+ } \
105
+ }
106
+ /* Defines a Ruby method `function` that returns member `name` of the wrapped `type` struct, passed through `converter`. */
107
+ #define DEF_RB_READER(type, function, name, converter) \
108
+ VALUE function(VALUE self) \
109
+ { \
110
+ GET_STRUCT(type) \
111
+ return converter(amatch->name); \
112
+ }
113
+ /* Defines a Ruby setter `function`: coerces `value` with `caster`, converts it with `converter`, raises TypeError unless `<converted value> check` holds, then stores it in member `name` and returns nil. */
114
+ #define DEF_RB_WRITER(type, function, name, vtype, caster, converter, check)\
115
+ VALUE function(VALUE self, VALUE value) \
116
+ { \
117
+ vtype value_ ## vtype; \
118
+ GET_STRUCT(type) \
119
+ caster(value); \
120
+ value_ ## vtype = converter(value); \
121
+ if (!(value_ ## vtype check)) \
122
+ rb_raise(rb_eTypeError, "check of value " #check " failed"); \
123
+ amatch->name = value_ ## vtype; \
124
+ return Qnil; \
125
+ }
126
+
127
+ /* CAST2FLOAT: a non-Float `obj` that responds to to_f is replaced by the result of to_f; anything else is type-checked as Float. FLOAT2C reads the C double out of a Float. */
128
+ #define CAST2FLOAT(obj) \
129
+ if (TYPE(obj) != T_FLOAT && rb_respond_to(obj, id_to_f)) \
130
+ obj = rb_funcall(obj, id_to_f, 0, 0); \
131
+ else \
132
+ Check_Type(obj, T_FLOAT)
133
+ #define FLOAT2C(obj) RFLOAT_VALUE(obj)
134
+ /* Binds a_ptr/a_len to the shorter of pattern and `string` and b_ptr/b_len to the other (on equal lengths the pattern becomes b); uses the locals amatch, string, a_ptr, a_len, b_ptr and b_len. */
135
+ #define OPTIMIZE_TIME \
136
+ if (amatch->pattern_len < RSTRING_LEN(string)) { \
137
+ a_ptr = amatch->pattern; \
138
+ a_len = amatch->pattern_len; \
139
+ b_ptr = RSTRING_PTR(string); \
140
+ b_len = RSTRING_LEN(string); \
141
+ } else { \
142
+ a_ptr = RSTRING_PTR(string); \
143
+ a_len = RSTRING_LEN(string); \
144
+ b_ptr = amatch->pattern; \
145
+ b_len = amatch->pattern_len; \
146
+ }
147
+ /* Always binds a_ptr/a_len to the pattern and b_ptr/b_len to `string`, without swapping by length. */
148
+ #define DONT_OPTIMIZE \
149
+ a_ptr = amatch->pattern; \
150
+ a_len = amatch->pattern_len; \
151
+ b_ptr = RSTRING_PTR(string); \
152
+ b_len = RSTRING_LEN(string); \
153
+
154
+ /*
155
+ * C structures of the Amatch classes
156
+ */
157
+ /* Pattern-only matcher state; taken by the Levenshtein, Hamming, LongestSubsequence and LongestSubstring match functions in this file. */
158
+ typedef struct GeneralStruct {
159
+ char *pattern; /* copy of the pattern bytes, owned by this struct */
160
+ int pattern_len; /* number of bytes in pattern */
161
+ } General;
162
+
163
+ DEF_ALLOCATOR(General)
164
+ DEF_PATTERN_ACCESSOR(General)
165
+ DEF_ITERATE_STRINGS(General)
166
+ /* Matcher state for the weighted (Sellers) edit distance: the pattern plus one cost per edit operation. */
167
+ typedef struct SellersStruct {
168
+ char *pattern; /* copy of the pattern bytes, owned by this struct */
169
+ int pattern_len; /* number of bytes in pattern */
170
+ double substitution; /* cost added when two compared characters differ */
171
+ double deletion; /* cost used for the table borders and for horizontal moves */
172
+ double insertion; /* cost used for vertical moves inside the table */
173
+ } Sellers;
174
+
175
+ DEF_ALLOCATOR(Sellers)
176
+ DEF_PATTERN_ACCESSOR(Sellers)
177
+ DEF_ITERATE_STRINGS(Sellers)
178
+
179
+ /* Puts all three operation weights of `self` back to 1.0. */
180
+ static void Sellers_reset_weights(Sellers *self)
181
+ {
182
+     const double default_weight = 1.0;
183
+     self->substitution = self->deletion = self->insertion = default_weight;
184
+ }
185
+ /* Matcher state for the pair distance: the pattern plus a PairArray built from it on first use (see PairDistance_match). */
186
+ typedef struct PairDistanceStruct {
187
+ char *pattern; /* copy of the pattern bytes, owned by this struct */
188
+ int pattern_len; /* number of bytes in pattern */
189
+ PairArray *pattern_pair_array; /* NULL until the first match call creates it */
190
+ } PairDistance;
191
+
192
+ DEF_ALLOCATOR(PairDistance)
193
+ DEF_PATTERN_ACCESSOR(PairDistance)
194
+
195
+ /*
196
+ * Levenshtein edit distances are computed here:
+ *
+ * The macro walks the a string row by row over two rolling rows v[0]/v[1]
+ * (each b_len + 1 entries wide, allocated and initialised by the caller).
+ * When it finishes, p indexes the row holding the final results. It uses
+ * the caller's locals i, j, c, p, weight, v, a_ptr, a_len, b_ptr and b_len.
197
+ */
198
+
199
+ #define COMPUTE_LEVENSHTEIN_DISTANCE \
200
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
201
+ c = i % 2; /* current row */ \
202
+ p = (i + 1) % 2; /* previous row */ \
203
+ v[c][0] = i; /* first column */ \
204
+ for (j = 1; j <= b_len; j++) { \
205
+ /* Bellman's principle of optimality: */ \
206
+ weight = v[p][j - 1] + (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : 1); \
207
+ if (weight > v[p][j] + 1) { \
208
+ weight = v[p][j] + 1; \
209
+ } \
210
+ if (weight > v[c][j - 1] + 1) { \
211
+ weight = v[c][j - 1] + 1; \
212
+ } \
213
+ v[c][j] = weight; \
214
+ } \
215
+ p = c; \
216
+ c = (c + 1) % 2; \
217
+ }
218
+ /* Returns the Levenshtein edit distance between the stored pattern and `string` as a Fixnum. */
219
+ static VALUE Levenshtein_match(General *amatch, VALUE string)
220
+ {
221
+ VALUE result;
222
+ char *a_ptr, *b_ptr;
223
+ int a_len, b_len;
224
+ int *v[2], weight;
225
+ int i, j, c, p;
226
+ /* the locals above are used by name inside DONT_OPTIMIZE and COMPUTE_LEVENSHTEIN_DISTANCE */
227
+ Check_Type(string, T_STRING);
228
+ DONT_OPTIMIZE
229
+ /* both rolling rows start as 0..b_len */
230
+ v[0] = ALLOC_N(int, b_len + 1);
231
+ v[1] = ALLOC_N(int, b_len + 1);
232
+ for (i = 0; i <= b_len; i++) {
233
+ v[0][i] = i;
234
+ v[1][i] = i;
235
+ }
236
+
237
+ COMPUTE_LEVENSHTEIN_DISTANCE
238
+ /* last cell of the final row: whole pattern against whole string */
239
+ result = INT2FIX(v[p][b_len]);
240
+
241
+ free(v[0]);
242
+ free(v[1]);
243
+
244
+ return result;
245
+ }
246
+ /* Returns a Float: 1.0 minus the Levenshtein distance divided by the longer of the two lengths; 1.0 when both are empty, 0.0 when only one is. */
247
+ static VALUE Levenshtein_similar(General *amatch, VALUE string)
248
+ {
249
+ VALUE result;
250
+ char *a_ptr, *b_ptr;
251
+ int a_len, b_len;
252
+ int *v[2], weight;
253
+ int i, j, c, p;
254
+ /* the locals above are used by name inside DONT_OPTIMIZE and COMPUTE_LEVENSHTEIN_DISTANCE */
255
+ Check_Type(string, T_STRING);
256
+ DONT_OPTIMIZE
257
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
258
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
259
+ v[0] = ALLOC_N(int, b_len + 1);
260
+ v[1] = ALLOC_N(int, b_len + 1);
261
+ for (i = 0; i <= b_len; i++) {
262
+ v[0][i] = i;
263
+ v[1][i] = i;
264
+ }
265
+
266
+ COMPUTE_LEVENSHTEIN_DISTANCE
267
+ /* normalise by the longer length */
268
+ if (b_len > a_len) {
269
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / b_len);
270
+ } else {
271
+ result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
272
+ }
273
+ free(v[0]);
274
+ free(v[1]);
275
+ return result;
276
+ }
277
+ /* Returns, as a Fixnum, the smallest value in the final row of the edit table computed with a zeroed first row, capped at the pattern length. */
278
+ static VALUE Levenshtein_search(General *amatch, VALUE string)
279
+ {
280
+ VALUE result;
281
+ char *a_ptr, *b_ptr;
282
+ int a_len, b_len;
283
+ int *v[2], weight, min;
284
+ int i, j, c, p;
285
+ /* the locals above are used by name inside DONT_OPTIMIZE and COMPUTE_LEVENSHTEIN_DISTANCE */
286
+ Check_Type(string, T_STRING);
287
+ DONT_OPTIMIZE
288
+ /* unlike Levenshtein_match, the rows start as all zeros */
289
+ v[0] = ALLOC_N(int, b_len + 1);
290
+ v[1] = ALLOC_N(int, b_len + 1);
291
+ MEMZERO(v[0], int, b_len + 1);
292
+ MEMZERO(v[1], int, b_len + 1);
293
+
294
+ COMPUTE_LEVENSHTEIN_DISTANCE
295
+ /* minimum over the final row */
296
+ for (i = 0, min = a_len; i <= b_len; i++) {
297
+ if (v[p][i] < min) min = v[p][i];
298
+ }
299
+
300
+ result = INT2FIX(min);
301
+
302
+ free(v[0]);
303
+ free(v[1]);
304
+
305
+ return result;
306
+ }
307
+
308
+
309
+ /*
310
+ * Sellers edit distances are computed here:
+ *
+ * Same rolling-row scheme as COMPUTE_LEVENSHTEIN_DISTANCE, but every step
+ * adds the matching weight from the Sellers struct instead of 1. When it
+ * finishes, p indexes the row holding the final results.
+ * NOTE(review): the first column is charged with amatch->deletion while
+ * vertical moves inside the table charge amatch->insertion -- confirm that
+ * this asymmetry is intended.
311
+ */
312
+
313
+ #define COMPUTE_SELLERS_DISTANCE \
314
+ for (i = 1, c = 0, p = 1; i <= a_len; i++) { \
315
+ c = i % 2; /* current row */ \
316
+ p = (i + 1) % 2; /* previous row */ \
317
+ v[c][0] = i * amatch->deletion; /* first column */ \
318
+ for (j = 1; j <= b_len; j++) { \
319
+ /* Bellman's principle of optimality: */ \
320
+ weight = v[p][j - 1] + \
321
+ (a_ptr[i - 1] == b_ptr[j - 1] ? 0 : amatch->substitution); \
322
+ if (weight > v[p][j] + amatch->insertion) { \
323
+ weight = v[p][j] + amatch->insertion; \
324
+ } \
325
+ if (weight > v[c][j - 1] + amatch->deletion) { \
326
+ weight = v[c][j - 1] + amatch->deletion; \
327
+ } \
328
+ v[c][j] = weight; \
329
+ } \
330
+ p = c; \
331
+ c = (c + 1) % 2; \
332
+ }
333
+ /* Returns the weighted (Sellers) edit distance between the stored pattern and `string` as a Float. */
334
+ static VALUE Sellers_match(Sellers *amatch, VALUE string)
335
+ {
336
+ VALUE result;
337
+ char *a_ptr, *b_ptr;
338
+ int a_len, b_len;
339
+ double *v[2], weight;
340
+ int i, j, c, p;
341
+ /* the locals above are used by name inside DONT_OPTIMIZE and COMPUTE_SELLERS_DISTANCE */
342
+ Check_Type(string, T_STRING);
343
+ DONT_OPTIMIZE
344
+ /* both rolling rows start as i * deletion */
345
+ v[0] = ALLOC_N(double, b_len + 1);
346
+ v[1] = ALLOC_N(double, b_len + 1);
347
+ for (i = 0; i <= b_len; i++) {
348
+ v[0][i] = i * amatch->deletion;
349
+ v[1][i] = i * amatch->deletion;
350
+ }
351
+
352
+ COMPUTE_SELLERS_DISTANCE
353
+ /* last cell of the final row: whole pattern against whole string */
354
+ result = rb_float_new(v[p][b_len]);
355
+ free(v[0]);
356
+ free(v[1]);
357
+ return result;
358
+ }
359
+ /* Returns a Float: 1.0 minus the Sellers distance divided by (longer length * largest weight); 1.0 when both strings are empty, 0.0 when only one is. */
360
+ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
361
+ {
362
+ VALUE result;
363
+ char *a_ptr, *b_ptr;
364
+ int a_len, b_len;
365
+ double *v[2], weight, max_weight;
366
+ int i, j, c, p;
367
+ /* max_weight becomes the largest of the three operation weights */
368
+ if (amatch->insertion >= amatch->deletion) {
369
+ if (amatch->substitution >= amatch->insertion) {
370
+ max_weight = amatch->substitution;
371
+ } else {
372
+ max_weight = amatch->insertion;
373
+ }
374
+ } else {
375
+ if (amatch->substitution >= amatch->deletion) {
376
+ max_weight = amatch->substitution;
377
+ } else {
378
+ max_weight = amatch->deletion;
379
+ }
380
+ }
381
+
382
+ Check_Type(string, T_STRING);
383
+ DONT_OPTIMIZE
384
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
385
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
386
+ v[0] = ALLOC_N(double, b_len + 1);
387
+ v[1] = ALLOC_N(double, b_len + 1);
388
+ for (i = 0; i <= b_len; i++) {
389
+ v[0][i] = i * amatch->deletion;
390
+ v[1][i] = i * amatch->deletion;
391
+ }
392
+
393
+ COMPUTE_SELLERS_DISTANCE
394
+ /* if all three weights are 0, max_weight is 0 and the divisions below are 0.0 / 0.0 */
395
+ if (b_len > a_len) {
396
+ result = rb_float_new(1.0 - v[p][b_len] / (b_len * max_weight));
397
+ } else {
398
+ result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
399
+ }
400
+ free(v[0]);
401
+ free(v[1]);
402
+ return result;
403
+ }
404
+ /* Returns, as a Float, the smallest value in the final row of the weighted edit table computed with a zeroed first row (cheapest approximate occurrence of the pattern in `string`). */
405
+ static VALUE Sellers_search(Sellers *amatch, VALUE string)
406
+ {
407
+     VALUE result;
408
+     char *a_ptr, *b_ptr;
409
+     int a_len, b_len;
410
+     double *v[2], weight, min;
411
+     int i, j, c, p;
412
+     /* the locals above are used by name inside DONT_OPTIMIZE and COMPUTE_SELLERS_DISTANCE */
413
+     Check_Type(string, T_STRING);
414
+     DONT_OPTIMIZE
415
+     /* zeroed rows: an occurrence may start at any position of string for free */
416
+     v[0] = ALLOC_N(double, b_len + 1);
417
+     v[1] = ALLOC_N(double, b_len + 1);
418
+     MEMZERO(v[0], double, b_len + 1);
419
+     MEMZERO(v[1], double, b_len + 1);
420
+
421
+     COMPUTE_SELLERS_DISTANCE
422
+     /* seed min from the row itself: a_len is only an upper bound for unit weights and under-reports when weights exceed 1.0 */
423
+     for (i = 1, min = v[p][0]; i <= b_len; i++) {
424
+         if (v[p][i] < min) min = v[p][i];
425
+     }
426
+     result = rb_float_new(min);
427
+     free(v[0]);
428
+     free(v[1]);
429
+
430
+     return result;
431
+ }
432
+
433
+ /*
434
+ * Pair distances are computed here:
+ *
+ * PairDistance_match splits pattern and string with String#split(regexp)
+ * when regexp is non-nil or use_regexp is set; otherwise each is used as a
+ * single token. The pattern's PairArray is created on the first call and
+ * only reactivated on later calls; the string's PairArray is built per call
+ * and destroyed before returning. The result of pair_array_match is
+ * returned as a Float.
435
+ */
436
+
437
+ static VALUE PairDistance_match(
438
+ PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
439
+ {
440
+ double result;
441
+ VALUE tokens;
442
+ PairArray *pair_array;
443
+
444
+ Check_Type(string, T_STRING);
445
+ if (!NIL_P(regexp) || use_regexp) {
446
+ tokens = rb_funcall(
447
+ rb_str_new(amatch->pattern, amatch->pattern_len),
448
+ id_split, 1, regexp
449
+ );
450
+ if (!amatch->pattern_pair_array) {
451
+ amatch->pattern_pair_array = PairArray_new(tokens);
452
+ } else { /* NOTE(review): an existing array is reused even if regexp differs from the earlier call -- confirm */
453
+ pair_array_reactivate(amatch->pattern_pair_array);
454
+ }
455
+ tokens = rb_funcall(string, id_split, 1, regexp);
456
+ pair_array = PairArray_new(tokens);
457
+ } else {
458
+ VALUE tmp = rb_str_new(amatch->pattern, amatch->pattern_len);
459
+ tokens = rb_ary_new4(1, &tmp);
460
+ if (!amatch->pattern_pair_array) {
461
+ amatch->pattern_pair_array = PairArray_new(tokens);
462
+ } else {
463
+ pair_array_reactivate(amatch->pattern_pair_array);
464
+ }
465
+ tokens = rb_ary_new4(1, &string);
466
+ pair_array = PairArray_new(tokens);
467
+ }
468
+ result = pair_array_match(amatch->pattern_pair_array, pair_array);
469
+ pair_array_destroy(pair_array);
470
+ return rb_float_new(result);
471
+ }
472
+
473
+ /*
474
+ * Hamming distances are computed here:
+ *
+ * result starts as the length difference b_len - a_len and gains one for
+ * every position below a_len where the two strings differ. The
+ * i >= b_len branch is only reachable when a_len > b_len; both callers in
+ * this file use OPTIMIZE_TIME, which makes a the shorter string.
475
+ */
476
+
477
+ #define COMPUTE_HAMMING_DISTANCE \
478
+ for (i = 0, result = b_len - a_len; i < a_len; i++) { \
479
+ if (i >= b_len) { \
480
+ result += a_len - b_len; \
481
+ break; \
482
+ } \
483
+ if (b_ptr[i] != a_ptr[i]) result++; \
484
+ }
485
+ /* Returns the Hamming distance between the stored pattern and `string` as a Fixnum; the length difference counts towards the distance. */
486
+ static VALUE Hamming_match(General *amatch, VALUE string)
487
+ {
488
+ char *a_ptr, *b_ptr;
489
+ int a_len, b_len;
490
+ int i, result;
491
+ /* the locals above are used by name inside OPTIMIZE_TIME and COMPUTE_HAMMING_DISTANCE */
492
+ Check_Type(string, T_STRING);
493
+ OPTIMIZE_TIME
494
+ COMPUTE_HAMMING_DISTANCE
495
+ return INT2FIX(result);
496
+ }
497
+ /* Returns a Float: 1.0 minus the Hamming distance divided by b_len (the longer length after OPTIMIZE_TIME); 1.0 when both strings are empty, 0.0 when only one is. */
498
+ static VALUE Hamming_similar(General *amatch, VALUE string)
499
+ {
500
+ char *a_ptr, *b_ptr;
501
+ int a_len, b_len;
502
+ int i, result;
503
+ /* the locals above are used by name inside OPTIMIZE_TIME and COMPUTE_HAMMING_DISTANCE */
504
+ Check_Type(string, T_STRING);
505
+ OPTIMIZE_TIME
506
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
507
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
508
+ COMPUTE_HAMMING_DISTANCE
509
+ return rb_float_new(1.0 - ((double) result) / b_len);
510
+ }
511
+
512
+ /*
513
+ * Longest Common Subsequence computation
+ *
+ * The macro allocates two rolling rows of b_len + 1 ints, fills them from
+ * the ends of both strings towards their starts (the i == a_len pass and
+ * the j == b_len column supply the zero border), leaves the length of the
+ * longest common subsequence in result and frees the rows. It uses the
+ * caller's locals l, i, j, c, p, result, a_ptr, a_len, b_ptr and b_len.
514
+ */
515
+
516
+ #define COMPUTE_LONGEST_SUBSEQUENCE \
517
+ l[0] = ALLOC_N(int, b_len + 1); \
518
+ l[1] = ALLOC_N(int, b_len + 1); \
519
+ for (i = a_len, c = 0, p = 1; i >= 0; i--) { \
520
+ for (j = b_len; j >= 0; j--) { \
521
+ if (i == a_len || j == b_len) { \
522
+ l[c][j] = 0; \
523
+ } else if (a_ptr[i] == b_ptr[j]) { \
524
+ l[c][j] = 1 + l[p][j + 1]; \
525
+ } else { \
526
+ int x = l[p][j], y = l[c][j + 1]; \
527
+ if (x > y) l[c][j] = x; else l[c][j] = y; \
528
+ } \
529
+ } \
530
+ p = c; \
531
+ c = (c + 1) % 2; \
532
+ } \
533
+ result = l[p][0]; \
534
+ free(l[0]); \
535
+ free(l[1]);
536
+
537
+ /* Returns the length of the longest common subsequence of the stored pattern and `string` as a Fixnum (0 if either is empty). */
538
+ static VALUE LongestSubsequence_match(General *amatch, VALUE string)
539
+ {
540
+ char *a_ptr, *b_ptr;
541
+ int a_len, b_len;
542
+ int result, c, p, i, j, *l[2];
543
+ /* the locals above are used by name inside OPTIMIZE_TIME and COMPUTE_LONGEST_SUBSEQUENCE */
544
+ Check_Type(string, T_STRING);
545
+ OPTIMIZE_TIME
546
+
547
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
548
+ COMPUTE_LONGEST_SUBSEQUENCE
549
+ return INT2FIX(result);
550
+ }
551
+ /* Returns a Float: the longest-common-subsequence length divided by b_len (the longer length after OPTIMIZE_TIME); 1.0 when both strings are empty, 0.0 when only one is. */
552
+ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
553
+ {
554
+ char *a_ptr, *b_ptr;
555
+ int a_len, b_len;
556
+ int result, c, p, i, j, *l[2];
557
+ /* the locals above are used by name inside OPTIMIZE_TIME and COMPUTE_LONGEST_SUBSEQUENCE */
558
+ Check_Type(string, T_STRING);
559
+ OPTIMIZE_TIME
560
+
561
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
562
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
563
+ COMPUTE_LONGEST_SUBSEQUENCE
564
+ return rb_float_new(((double) result) / b_len);
565
+ }
566
+
567
+ /*
568
+ * Longest Common Substring computation
+ *
+ * The macro allocates two zeroed rolling rows of b_len ints. l[c][j] holds
+ * the length of the common run that ends at a_ptr[i] and b_ptr[j]; the
+ * largest value seen is left in result and the rows are freed. It uses the
+ * caller's locals l, i, j, c, p, result, a_ptr, a_len, b_ptr and b_len.
569
+ */
570
+
571
+ #define COMPUTE_LONGEST_SUBSTRING \
572
+ l[0] = ALLOC_N(int, b_len); \
573
+ MEMZERO(l[0], int, b_len); \
574
+ l[1] = ALLOC_N(int, b_len); \
575
+ MEMZERO(l[1], int, b_len); \
576
+ result = 0; \
577
+ for (i = 0, c = 0, p = 1; i < a_len; i++) { \
578
+ for (j = 0; j < b_len; j++) { \
579
+ if (a_ptr[i] == b_ptr[j]) { \
580
+ l[c][j] = j == 0 ? 1 : 1 + l[p][j - 1]; \
581
+ if (l[c][j] > result) result = l[c][j]; \
582
+ } else { \
583
+ l[c][j] = 0; \
584
+ } \
585
+ } \
586
+ p = c; \
587
+ c = (c + 1) % 2; \
588
+ } \
589
+ free(l[0]); \
590
+ free(l[1]);
591
+ /* Returns the length of the longest common substring of the stored pattern and `string` as a Fixnum (0 if either is empty). */
592
+ static VALUE LongestSubstring_match(General *amatch, VALUE string)
593
+ {
594
+ char *a_ptr, *b_ptr;
595
+ int a_len, b_len;
596
+ int result, c, p, i, j, *l[2];
597
+ /* the locals above are used by name inside OPTIMIZE_TIME and COMPUTE_LONGEST_SUBSTRING */
598
+ Check_Type(string, T_STRING);
599
+ OPTIMIZE_TIME
600
+ if (a_len == 0 || b_len == 0) return INT2FIX(0);
601
+ COMPUTE_LONGEST_SUBSTRING
602
+ return INT2FIX(result);
603
+ }
604
+ /* Returns a Float: the longest-common-substring length divided by b_len (the longer length after OPTIMIZE_TIME); 1.0 when both strings are empty, 0.0 when only one is. */
605
+ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
606
+ {
607
+ char *a_ptr, *b_ptr;
608
+ int a_len, b_len;
609
+ int result, c, p, i, j, *l[2];
610
+ /* the locals above are used by name inside OPTIMIZE_TIME and COMPUTE_LONGEST_SUBSTRING */
611
+ Check_Type(string, T_STRING);
612
+ OPTIMIZE_TIME
613
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
614
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
615
+ COMPUTE_LONGEST_SUBSTRING
616
+ return rb_float_new(((double) result) / b_len);
617
+ }
618
+
619
+ /*
620
+ * Ruby API
621
+ */
622
+
623
+ /*
624
+ * Document-class: Amatch::Levenshtein
625
+ *
626
+ * The Levenshtein edit distance is defined as the minimal costs involved to
627
+ * transform one string into another by using three elementary operations:
628
+ * deletion, insertion and substitution of a character. To transform "water"
629
+ * into "wine", for instance, you have to substitute "a" -> "i": "witer", "t"
630
+ * -> "n": "winer" and delete "r": "wine". The edit distance between "water"
631
+ * and "wine" is 3, because you have to apply three operations. The edit
632
+ * distance between "wine" and "wine" is 0 of course: no operation is
633
+ * necessary for the transformation -- they're already the same string. It's
634
+ * easy to see that more similar strings have smaller edit distances than
635
+ * strings that differ a lot.
636
+ */
637
+
638
+ DEF_RB_FREE(Levenshtein, General)
639
+
640
+ /*
641
+  * call-seq: new(pattern)
642
+  *
643
+  * Builds an Amatch::Levenshtein matcher whose pattern is <code>pattern</code>.
644
+  */
645
+ static VALUE rb_Levenshtein_initialize(VALUE self, VALUE pattern) {
646
+     General *general;
647
+     Data_Get_Struct(self, General, general);
648
+     General_pattern_set(general, pattern);
649
+     return self;
650
+ }
651
+
652
+ DEF_CONSTRUCTOR(Levenshtein, General)
653
+
654
+ /*
655
+ * call-seq: match(strings) -> results
656
+ *
657
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
658
+ * against <code>strings</code>. It returns the number of operations, the
659
+ * Levenshtein distance. <code>strings</code> has to be either a String or an
660
+ * Array of Strings. The returned <code>results</code> are either a Fixnum or
661
+ * an Array of Fixnums respectively.
662
+ */
663
+ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
664
+ {
665
+ GET_STRUCT(General)
666
+ return General_iterate_strings(amatch, strings, Levenshtein_match);
667
+ }
668
+
669
+ /*
670
+ * call-seq: similar(strings) -> results
671
+ *
672
+ * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
673
+ * against <code>strings</code>, and compute a Levenshtein distance metric
674
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
675
+ * <code>strings</code> has to be either a String or an Array of Strings. The
676
+ * returned <code>results</code> are either a Float or an Array of Floats
677
+ * respectively.
678
+ */
679
+ static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
680
+ {
681
+ GET_STRUCT(General)
682
+ return General_iterate_strings(amatch, strings, Levenshtein_similar);
683
+ }
684
+
685
+ /*
686
+  * call-seq: levenshtein_similar(strings) -> results
687
+  *
688
+  * Treats the receiving String as an Amatch::Levenshtein#pattern and matches
689
+  * it against <code>strings</code>. The result is a Levenshtein distance
690
+  * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
691
+  * match. <code>strings</code> has to be either a String or an Array of
692
+  * Strings. The returned <code>results</code> are either a Float or an Array of
693
+  * Floats respectively.
694
+  */
695
+ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings) {
696
+     VALUE matcher;
697
+     matcher = rb_Levenshtein_new(rb_cLevenshtein, self);
698
+     return rb_Levenshtein_similar(matcher, strings);
699
+ }
700
+
701
+ /*
702
+ * call-seq: search(strings) -> results
703
+ *
704
+ * searches Amatch::Levenshtein#pattern in <code>strings</code> and returns the
705
+ * edit distance (the sum of character operations) as a Fixnum value, by greedy
706
+ * trimming prefixes or postfixes of the match. <code>strings</code> has
707
+ * to be either a String or an Array of Strings. The returned
708
+ * <code>results</code> are either a Fixnum or an Array of Fixnums respectively.
709
+ */
710
+ static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
711
+ {
712
+ GET_STRUCT(General)
713
+ return General_iterate_strings(amatch, strings, Levenshtein_search);
714
+ }
715
+
716
+ /*
717
+ * Document-class: Amatch::Sellers
718
+ *
719
+ * The Sellers edit distance is very similar to the Levenshtein edit distance.
720
+ * The difference is, that you can also specify different weights for every
721
+ * operation to prefer special operations over others. This extension of the
722
+ * Levenshtein edit distance is also known under the name Needleman-Wunsch
723
+ * distance.
724
+ */
725
+
726
+ DEF_RB_FREE(Sellers, Sellers)
727
+
728
+ /*
729
+ * Document-method: substitution
730
+ *
731
+ * call-seq: substitution -> weight
732
+ *
733
+ * Returns the weight of the substitution operation, that is used to compute
734
+ * the Sellers distance.
735
+ */
736
+ DEF_RB_READER(Sellers, rb_Sellers_substitution, substitution,
737
+ rb_float_new)
738
+
739
+ /*
740
+ * Document-method: deletion
741
+ *
742
+ * call-seq: deletion -> weight
743
+ *
744
+ * Returns the weight of the deletion operation, that is used to compute
745
+ * the Sellers distance.
746
+ */
747
+ DEF_RB_READER(Sellers, rb_Sellers_deletion, deletion,
748
+ rb_float_new)
749
+
750
+ /*
751
+ * Document-method: insertion
752
+ *
753
+ * call-seq: insertion -> weight
754
+ *
755
+ * Returns the weight of the insertion operation, that is used to compute
756
+ * the Sellers distance.
757
+ */
758
+ DEF_RB_READER(Sellers, rb_Sellers_insertion, insertion,
759
+ rb_float_new)
760
+
761
+ /*
762
+ * Document-method: substitution=
763
+ *
764
+ * call-seq: substitution=(weight)
765
+ *
766
+ * Sets the weight of the substitution operation, that is used to compute
767
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
768
+ * should be a Float value >= 0.0.
769
+ */
770
+ DEF_RB_WRITER(Sellers, rb_Sellers_substitution_set, substitution,
771
+ double, CAST2FLOAT, FLOAT2C, >= 0)
772
+
773
+ /*
774
+ * Document-method: deletion=
775
+ *
776
+ * call-seq: deletion=(weight)
777
+ *
778
+ * Sets the weight of the deletion operation, that is used to compute
779
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
780
+ * should be a Float value >= 0.0.
781
+ */
782
+ DEF_RB_WRITER(Sellers, rb_Sellers_deletion_set, deletion,
783
+ double, CAST2FLOAT, FLOAT2C, >= 0)
784
+
785
+ /*
786
+ * Document-method: insertion=
787
+ *
788
+ * call-seq: insertion=(weight)
789
+ *
790
+ * Sets the weight of the insertion operation, that is used to compute
791
+ * the Sellers distance, to <code>weight</code>. The <code>weight</code>
792
+ * should be a Float value >= 0.0.
793
+ */
794
+ DEF_RB_WRITER(Sellers, rb_Sellers_insertion_set, insertion,
795
+ double, CAST2FLOAT, FLOAT2C, >= 0)
796
+
797
+ /*
798
+  * Sets the substitution, deletion and insertion weights back to 1.0.
799
+  */
800
+ static VALUE rb_Sellers_reset_weights(VALUE self) {
801
+     Sellers *sellers;
802
+     Data_Get_Struct(self, Sellers, sellers);
803
+     Sellers_reset_weights(sellers);
804
+     return self;
805
+ }
806
+
807
+ /*
808
+  * call-seq: new(pattern)
809
+  *
810
+  * Builds an Amatch::Sellers matcher whose pattern is <code>pattern</code>;
811
+  * every weight starts out as 1.0.
812
+  */
813
+ static VALUE rb_Sellers_initialize(VALUE self, VALUE pattern) {
814
+     Sellers *sellers;
815
+     Data_Get_Struct(self, Sellers, sellers);
816
+     Sellers_pattern_set(sellers, pattern);
817
+     Sellers_reset_weights(sellers);
818
+     return self;
819
+ }
820
+
821
+ DEF_CONSTRUCTOR(Sellers, Sellers)
822
+
823
+ /*
824
+ * Document-method: pattern
825
+ *
826
+ * call-seq: pattern -> pattern string
827
+ *
828
+ * Returns the current pattern string of this Amatch::Sellers instance.
829
+ */
830
+
831
+ /*
832
+ * Document-method: pattern=
833
+ *
834
+ * call-seq: pattern=(pattern)
835
+ *
836
+ * Sets the current pattern string of this Amatch::Sellers instance to
837
+ * <code>pattern</code>.
838
+ */
839
+
840
+ /*
841
+ * call-seq: match(strings) -> results
842
+ *
843
+ * Uses this Amatch::Sellers instance to match Sellers#pattern against
844
+ * <code>strings</code>, while taking into account the given weights. It
845
+ * returns the number of weighted character operations, the Sellers distance.
846
+ * <code>strings</code> has to be either a String or an Array of Strings. The
847
+ * returned <code>results</code> are either a Float or an Array of Floats
848
+ * respectively.
849
+ */
850
+ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
851
+ {
852
+ GET_STRUCT(Sellers) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
853
+ return Sellers_iterate_strings(amatch, strings, Sellers_match); /* Sellers_match is handed over as a function pointer; presumably applied per string by the iterator (not shown here) */
854
+ }
855
+
856
+ /*
857
+ * call-seq: similar(strings) -> results
858
+ *
859
+ * Uses this Amatch::Sellers instance to match Amatch::Sellers#pattern
860
+ * against <code>strings</code> (taking into account the given weights), and
861
+ * compute a Sellers distance metric number between 0.0 for very unsimilar
862
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
863
+ * String or an Array of Strings. The returned <code>results</code> are either
864
+ * a Float or an Array of Floats
865
+ * respectively.
866
+ */
867
+ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
868
+ {
869
+ GET_STRUCT(Sellers) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
870
+ return Sellers_iterate_strings(amatch, strings, Sellers_similar); /* same iterator as rb_Sellers_match, with Sellers_similar as the function passed in */
871
+ }
872
+
873
+ /*
874
+ * call-seq: search(strings) -> results
875
+ *
876
+ * searches Sellers#pattern in <code>strings</code> and returns the edit
877
+ * distance (the sum of weighted character operations) as a Float value, by
878
+ * greedily trimming prefixes or suffixes of the match. <code>strings</code> has
879
+ * to be either a String or an Array of Strings. The returned
880
+ * <code>results</code> are either a Float or an Array of Floats respectively.
881
+ */
882
+ static VALUE rb_Sellers_search(VALUE self, VALUE strings)
883
+ {
884
+ GET_STRUCT(Sellers) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
885
+ return Sellers_iterate_strings(amatch, strings, Sellers_search); /* same iterator as rb_Sellers_match, with Sellers_search as the function passed in */
886
+ }
887
+
888
+ /*
889
+ * Document-class: Amatch::PairDistance
890
+ *
891
+ * The pair distance between two strings is based on the number of adjacent
892
+ * character pairs, that are contained in both strings. The similarity
893
+ * metric of two strings s1 and s2 is
894
+ * 2*|intersection(pairs(s1), pairs(s2))| / (|pairs(s1)| + |pairs(s2)|)
895
+ * If it is 1.0 the two strings are an exact match, if less than 1.0 they
896
+ * are more dissimilar. The advantage of considering adjacent characters, is to
897
+ * take account not only of the characters, but also of the character ordering
898
+ * in the original strings.
899
+ *
900
+ * This metric is very capable of finding similarities in natural languages.
901
+ * It is explained in more detail in Simon White's article "How to Strike a
902
+ * Match", located at this url:
903
+ * http://www.catalysoft.com/articles/StrikeAMatch.html
904
+ * It is also very similar (a special case) to the method described under
905
+ * http://citeseer.lcs.mit.edu/gravano01using.html in "Using q-grams in a DBMS
906
+ * for Approximate String Processing."
907
+ */
908
+ DEF_RB_FREE(PairDistance, PairDistance)
909
+
910
+ /*
911
+ * call-seq: new(pattern)
912
+ *
913
+ * Creates a new Amatch::PairDistance instance from <code>pattern</code>.
914
+ */
915
+ static VALUE rb_PairDistance_initialize(VALUE self, VALUE pattern)
916
+ {
917
+ GET_STRUCT(PairDistance) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
918
+ PairDistance_pattern_set(amatch, pattern); /* only the pattern is stored here; no weights are involved, unlike rb_Sellers_initialize */
919
+ return self;
920
+ }
921
+
922
+ DEF_CONSTRUCTOR(PairDistance, PairDistance)
923
+
924
+ /*
925
+ * call-seq: match(strings, regexp = /\s+/) -> results
926
+ *
927
+ * Uses this Amatch::PairDistance instance to match PairDistance#pattern against
928
+ * <code>strings</code>. It returns the pair distance measure, that is a
929
+ * returned value of 1.0 is an exact match, partial matches are lower
930
+ * values, while 0.0 means no match at all.
931
+ *
932
+ * <code>strings</code> has to be either a String or an
933
+ * Array of Strings. The argument <code>regexp</code> is used to split the
934
+ * pattern and strings into tokens first. It defaults to /\s+/. If the
935
+ * splitting should be omitted, call the method with nil as <code>regexp</code>
936
+ * explicitly.
937
+ *
938
+ * The returned <code>results</code> are either a Float or an
939
+ * Array of Floats respectively.
940
+ */
941
+ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
942
+ {
943
+ VALUE result, strings, regexp = Qnil;
944
+ int use_regexp;
945
+ GET_STRUCT(PairDistance) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
946
+
947
+ rb_scan_args(argc, argv, "11", &strings, &regexp); /* "11": one mandatory argument (strings), one optional (regexp) */
948
+ use_regexp = NIL_P(regexp) && argc != 2; /* set only when the regexp argument was left out entirely; an explicitly passed nil (argc == 2) leaves it 0 */
949
+ if (TYPE(strings) == T_STRING) {
950
+ result = PairDistance_match(amatch, strings, regexp, use_regexp);
951
+ } else {
952
+ Check_Type(strings, T_ARRAY); /* anything that is neither String nor Array raises TypeError here */
953
+ int i; /* declared after a statement, so this needs a compiler accepting C99-style mixed declarations */
954
+ result = rb_ary_new2(RARRAY_LEN(strings));
955
+ for (i = 0; i < RARRAY_LEN(strings); i++) {
956
+ VALUE string = rb_ary_entry(strings, i);
957
+ if (TYPE(string) != T_STRING) {
958
+ rb_raise(rb_eTypeError, /* rb_raise does not return, so the pair_array_destroy below is skipped on this path -- NOTE(review): confirm a leftover pattern_pair_array is handled by the next call */
959
+ "array has to contain only strings (%s given)",
960
+ NIL_P(string) ?
961
+ "NilClass" :
962
+ rb_class2name(CLASS_OF(string)));
963
+ }
964
+ rb_ary_push(result,
965
+ PairDistance_match(amatch, string, regexp, use_regexp));
966
+ }
967
+ }
968
+ pair_array_destroy(amatch->pattern_pair_array); /* the pattern's pair array is dropped after every call; presumably PairDistance_match (not shown) rebuilds it on demand */
969
+ amatch->pattern_pair_array = NULL; /* cleared so no dangling pointer to the destroyed array is kept */
970
+ return result;
971
+ }
972
+
973
+ /*
974
+ * call-seq: pair_distance_similar(strings) -> results
975
+ *
976
+ * If called on a String, this string is used as an Amatch::PairDistance#pattern
977
+ * to match against <code>strings</code> using /\s+/ as the tokenizing regular
978
+ * expression. It returns a pair distance metric number between 0.0 for very
979
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
980
+ * either a String or an Array of Strings. The returned <code>results</code>
981
+ * are either a Float or an Array of Floats respectively.
982
+ */
983
+ static VALUE rb_str_pair_distance_similar(VALUE self, VALUE strings)
984
+ {
985
+ VALUE amatch = rb_PairDistance_new(rb_cPairDistance, self); /* self is the receiving String; it is passed on as the pattern of a temporary matcher */
986
+ return rb_PairDistance_match(1, &strings, amatch); /* argc == 1: no regexp argument is supplied, so rb_PairDistance_match sets use_regexp */
987
+ }
988
+
989
+ /*
990
+ * Document-class: Amatch::Hamming
991
+ *
992
+ * This class computes the Hamming distance between two strings.
993
+ *
994
+ * The Hamming distance between two strings is the number of characters, that
995
+ * are different. Thus a hamming distance of 0 means an exact
996
+ * match, a hamming distance of 1 means one character is different, and so on.
997
+ * If one string is longer than the other string, the missing characters are
998
+ * counted as different characters.
999
+ */
1000
+
1001
+ DEF_RB_FREE(Hamming, General)
1002
+
1003
+ /*
1004
+ * call-seq: new(pattern)
1005
+ *
1006
+ * Creates a new Amatch::Hamming instance from <code>pattern</code>.
1007
+ */
1008
+ static VALUE rb_Hamming_initialize(VALUE self, VALUE pattern)
1009
+ {
1010
+ GET_STRUCT(General) /* Hamming is paired with the General struct (cf. DEF_RB_FREE(Hamming, General)); the macro evidently brings `amatch` into scope */
1011
+ General_pattern_set(amatch, pattern);
1012
+ return self;
1013
+ }
1014
+
1015
+ DEF_CONSTRUCTOR(Hamming, General)
1016
+
1017
+ /*
1018
+ * call-seq: match(strings) -> results
1019
+ *
1020
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1021
+ * <code>strings</code>, that is compute the hamming distance between
1022
+ * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
1023
+ * be either a String or an Array of Strings. The returned <code>results</code>
1024
+ * are either a Fixnum or an Array of Fixnums respectively.
1025
+ */
1026
+ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1027
+ {
1028
+ GET_STRUCT(General) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
1029
+ return General_iterate_strings(amatch, strings, Hamming_match); /* Hamming_match is handed over as a function pointer; presumably applied per string by the iterator (not shown here) */
1030
+ }
1031
+
1032
+ /*
1033
+ * call-seq: similar(strings) -> results
1034
+ *
1035
+ * Uses this Amatch::Hamming instance to match Amatch::Hamming#pattern against
1036
+ * <code>strings</code>, and compute a Hamming distance metric number between
1037
+ * 0.0 for very unsimilar strings and 1.0 for an exact match.
1038
+ * <code>strings</code> has to be either a String or an Array of Strings. The
1039
+ * returned <code>results</code> are either a Float or an Array of Floats
1040
+ * respectively.
1041
+ */
1042
+ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1043
+ {
1044
+ GET_STRUCT(General) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
1045
+ return General_iterate_strings(amatch, strings, Hamming_similar); /* same iterator as rb_Hamming_match, with Hamming_similar as the function passed in */
1046
+ }
1047
+
1048
+ /*
1049
+ * call-seq: hamming_similar(strings) -> results
1050
+ *
1051
+ * If called on a String, this string is used as an Amatch::Hamming#pattern to
1052
+ * match against <code>strings</code>. It returns a Hamming distance metric
1053
+ * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1054
+ * <code>strings</code>
1055
+ * has to be either a String or an Array of Strings. The returned
1056
+ * <code>results</code> are either a Float or an Array of Floats respectively.
1057
+ */
1058
+ static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
1059
+ {
1060
+ VALUE amatch = rb_Hamming_new(rb_cHamming, self); /* self is the receiving String; it is passed on as the pattern of a temporary matcher */
1061
+ return rb_Hamming_similar(amatch, strings); /* reuses the instance method instead of calling the iterator directly */
1062
+ }
1063
+
1064
+
1065
+ /*
1066
+ * Document-class: Amatch::LongestSubsequence
1067
+ *
1068
+ * This class computes the length of the longest subsequence common to two
1069
+ * strings. A subsequence doesn't have to be contiguous. The longer the common
1070
+ * subsequence is, the more similar the two strings will be.
1071
+ *
1072
+ * The longest common subsequence between "test" and "test" is of length 4,
1073
+ * because "test" itself is this subsequence. The longest common subsequence
1074
+ * between "test" and "east" is "e", "s", "t" and the length of the
1075
+ * sequence is 3.
1076
+ */
1077
+ DEF_RB_FREE(LongestSubsequence, General)
1078
+
1079
+ /*
1080
+ * call-seq: new(pattern)
1081
+ *
1082
+ * Creates a new Amatch::LongestSubsequence instance from <code>pattern</code>.
1083
+ */
1084
+ static VALUE rb_LongestSubsequence_initialize(VALUE self, VALUE pattern)
1085
+ {
1086
+ GET_STRUCT(General) /* LongestSubsequence is paired with the General struct (cf. DEF_RB_FREE(LongestSubsequence, General)); the macro evidently brings `amatch` into scope */
1087
+ General_pattern_set(amatch, pattern);
1088
+ return self;
1089
+ }
1090
+
1091
+ DEF_CONSTRUCTOR(LongestSubsequence, General)
1092
+
1093
+ /*
1094
+ * call-seq: match(strings) -> results
1095
+ *
1096
+ * Uses this Amatch::LongestSubsequence instance to match
1097
+ * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1098
+ * length of the longest common subsequence. <code>strings</code> has to be
1099
+ * either a String or an Array of Strings. The returned <code>results</code>
1100
+ * are either a Fixnum or an Array of Fixnums respectively.
1101
+ */
1102
+ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1103
+ {
1104
+ GET_STRUCT(General) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
1105
+ return General_iterate_strings(amatch, strings, LongestSubsequence_match); /* LongestSubsequence_match is handed over as a function pointer; presumably applied per string by the iterator (not shown here) */
1106
+ }
1107
+
1108
+ /*
1109
+ * call-seq: similar(strings) -> results
1110
+ *
1111
+ * Uses this Amatch::LongestSubsequence instance to match
1112
+ * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1113
+ * a longest subsequence distance metric number between 0.0 for very unsimilar
1114
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1115
+ * String or an Array of Strings. The returned <code>results</code> are either
1116
+ * a Float or an Array of Floats respectively.
1117
+ */
1118
+ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1119
+ {
1120
+ GET_STRUCT(General) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
1121
+ return General_iterate_strings(amatch, strings, LongestSubsequence_similar); /* same iterator as rb_LongestSubsequence_match, with LongestSubsequence_similar as the function passed in */
1122
+ }
1123
+
1124
+ /*
1125
+ * call-seq: longest_subsequence_similar(strings) -> results
1126
+ *
1127
+ * If called on a String, this string is used as a
1128
+ * Amatch::LongestSubsequence#pattern to match against <code>strings</code>. It
1129
+ * returns a longest subsequence distance metric number between 0.0 for very
1130
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1131
+ * either a String or an Array of Strings. The returned <code>results</code>
1132
+ * are either a Float or an Array of Floats respectively.
1133
+ */
1134
+ static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1135
+ {
1136
+ VALUE amatch = rb_LongestSubsequence_new(rb_cLongestSubsequence, self); /* self is the receiving String; it is passed on as the pattern of a temporary matcher */
1137
+ return rb_LongestSubsequence_similar(amatch, strings); /* reuses the instance method instead of calling the iterator directly */
1138
+ }
1139
+
1140
+ /*
1141
+ * Document-class: Amatch::LongestSubstring
1142
+ *
1143
+ * The longest common substring is the longest substring, that is part of
1144
+ * two strings. A substring is contiguous, while a subsequence need not
1145
+ * be. The longer the common substring is, the more similar the two strings
1146
+ * will be.
1147
+ *
1148
+ * The longest common substring between 'string' and 'string' is 'string'
1149
+ * again, thus the longest common substring length is 6. The longest common
1150
+ * substring between 'string' and 'storing' is 'ring', thus the longest common
1151
+ * substring length is 4.
1152
+ */
1153
+
1154
+ DEF_RB_FREE(LongestSubstring, General)
1155
+
1156
+ /*
1157
+ * call-seq: new(pattern)
1158
+ *
1159
+ * Creates a new Amatch::LongestSubstring instance from <code>pattern</code>.
1160
+ */
1161
+ static VALUE rb_LongestSubstring_initialize(VALUE self, VALUE pattern)
1162
+ {
1163
+ GET_STRUCT(General) /* LongestSubstring is paired with the General struct (cf. DEF_RB_FREE(LongestSubstring, General)); the macro evidently brings `amatch` into scope */
1164
+ General_pattern_set(amatch, pattern);
1165
+ return self;
1166
+ }
1167
+
1168
+ DEF_CONSTRUCTOR(LongestSubstring, General)
1169
+
1170
+ /*
1171
+ * call-seq: match(strings) -> results
1172
+ *
1173
+ * Uses this Amatch::LongestSubstring instance to match
1174
+ * LongestSubstring#pattern against <code>strings</code>, that is compute the
1175
+ * length of the longest common substring. <code>strings</code> has to be
1176
+ * either a String or an Array of Strings. The returned <code>results</code>
1177
+ * are either a Fixnum or an Array of Fixnums respectively.
1178
+ */
1179
+ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1180
+ {
1181
+ GET_STRUCT(General) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
1182
+ return General_iterate_strings(amatch, strings, LongestSubstring_match); /* LongestSubstring_match is handed over as a function pointer; presumably applied per string by the iterator (not shown here) */
1183
+ }
1184
+
1185
+ /*
1186
+ * call-seq: similar(strings) -> results
1187
+ *
1188
+ * Uses this Amatch::LongestSubstring instance to match
1189
+ * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1190
+ * longest substring distance metric number between 0.0 for very unsimilar
1191
+ * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1192
+ * String or an Array of Strings. The returned <code>results</code> are either
1193
+ * a Float or an Array of Floats
1194
+ * respectively.
1195
+ */
1196
+ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
1197
+ {
1198
+ GET_STRUCT(General) /* macro defined outside this chunk; it evidently brings `amatch` into scope */
1199
+ return General_iterate_strings(amatch, strings, LongestSubstring_similar); /* same iterator as rb_LongestSubstring_match, with LongestSubstring_similar as the function passed in */
1200
+ }
1201
+
1202
+ /*
1203
+ * call-seq: longest_substring_similar(strings) -> results
1204
+ *
1205
+ * If called on a String, this string is used as a
1206
+ * Amatch::LongestSubstring#pattern to match against <code>strings</code>. It
1207
+ * returns a longest substring distance metric number between 0.0 for very
1208
+ * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1209
+ * either a String or an Array of Strings. The returned <code>results</code>
1210
+ * are either a Float or an Array of Floats respectively.
1211
+ */
1212
+ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1213
+ {
1214
+ VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self); /* self is the receiving String; it is passed on as the pattern of a temporary matcher */
1215
+ return rb_LongestSubstring_similar(amatch, strings); /* reuses the instance method instead of calling the iterator directly */
1216
+ }
1217
+
1218
+ /*
1219
+ * = amatch - Approximate Matching Extension for Ruby
1220
+ *
1221
+ * == Description
1222
+ *
1223
+ * This is a collection of classes that can be used for Approximate
1224
+ * matching, searching, and comparing of Strings. They implement algorithms
1225
+ * that compute the Levenshtein edit distance, Sellers edit distance, the
1226
+ * Hamming distance, the longest common subsequence length, the longest common
1227
+ * substring length, and the pair distance metric.
1228
+ *
1229
+ * == Author
1230
+ *
1231
+ * Florian Frank mailto:flori@ping.de
1232
+ *
1233
+ * == License
1234
+ *
1235
+ * This is free software; you can redistribute it and/or modify it under
1236
+ * the terms of the GNU General Public License Version 2 as published by
1237
+ * the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
1238
+ *
1239
+ * == Download
1240
+ *
1241
+ * The latest version of <b>amatch</b> can be found at
1242
+ *
1243
+ * * http://rubyforge.org/frs/?group_id=390
1244
+ *
1245
+ * Online Documentation should be located at
1246
+ *
1247
+ * * http://amatch.rubyforge.org
1248
+ *
1249
+ * == Examples
1250
+ * require 'amatch'
1251
+ * # => true
1252
+ * include Amatch
1253
+ * # => Object
1254
+ *
1255
+ * m = Sellers.new("pattern")
1256
+ * # => #<Amatch::Sellers:0x40366324>
1257
+ * m.match("pattren")
1258
+ * # => 2.0
1259
+ * m.substitution = m.insertion = 3
1260
+ * # => 3
1261
+ * m.match("pattren")
1262
+ * # => 4.0
1263
+ * m.reset_weights
1264
+ * # => #<Amatch::Sellers:0x40366324>
1265
+ * m.match(["pattren","parent"])
1266
+ * # => [2.0, 4.0]
1267
+ * m.search("abcpattrendef")
1268
+ * # => 2.0
1269
+ *
1270
+ * m = Levenshtein.new("pattern")
1271
+ * # => #<Amatch::Levenshtein:0x4035919c>
1272
+ * m.match("pattren")
1273
+ * # => 2
1274
+ * m.search("abcpattrendef")
1275
+ * # => 2
1276
+ * "pattern language".levenshtein_similar("language of patterns")
1277
+ * # => 0.2
1278
+ *
1279
+ * m = Hamming.new("pattern")
1280
+ * # => #<Amatch::Hamming:0x40350858>
1281
+ * m.match("pattren")
1282
+ * # => 2
1283
+ * "pattern language".hamming_similar("language of patterns")
1284
+ * # => 0.1
1285
+ *
1286
+ * m = PairDistance.new("pattern")
1287
+ * # => #<Amatch::PairDistance:0x40349be8>
1288
+ * m.match("pattr en")
1289
+ * # => 0.545454545454545
1290
+ * m.match("pattr en", nil)
1291
+ * # => 0.461538461538462
1292
+ * m.match("pattr en", /t+/)
1293
+ * # => 0.285714285714286
1294
+ * "pattern language".pair_distance_similar("language of patterns")
1295
+ * # => 0.928571428571429
1296
+ *
1297
+ * m = LongestSubsequence.new("pattern")
1298
+ * # => #<Amatch::LongestSubsequence:0x4033e900>
1299
+ * m.match("pattren")
1300
+ * # => 6
1301
+ * "pattern language".longest_subsequence_similar("language of patterns")
1302
+ * # => 0.4
1303
+ *
1304
+ * m = LongestSubstring.new("pattern")
1305
+ * # => #<Amatch::LongestSubstring:0x403378d0>
1306
+ * m.match("pattren")
1307
+ * # => 4
1308
+ * "pattern language".longest_substring_similar("language of patterns")
1309
+ * # => 0.4
1310
+ *
1311
+ */
1312
+
1313
+ void Init_amatch() /* extension entry point: Ruby calls Init_<name> when the compiled library is loaded */
1314
+ {
1315
+ rb_mAmatch = rb_define_module("Amatch"); /* every class below is defined under this module */
1316
+
1317
+ /* Levenshtein */
1318
+ rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
1319
+ rb_define_alloc_func(rb_cLevenshtein, rb_Levenshtein_s_allocate);
1320
+ rb_define_method(rb_cLevenshtein, "initialize", rb_Levenshtein_initialize, 1);
1321
+ rb_define_method(rb_cLevenshtein, "pattern", rb_General_pattern, 0); /* pattern accessors are the shared General ones */
1322
+ rb_define_method(rb_cLevenshtein, "pattern=", rb_General_pattern_set, 1);
1323
+ rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
1324
+ rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
1325
+ rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1326
+ rb_define_method(rb_cString, "levenshtein_similar", rb_str_levenshtein_similar, 1); /* convenience method added to String itself */
1327
+
1328
+ /* Sellers */
1329
+ rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
1330
+ rb_define_alloc_func(rb_cSellers, rb_Sellers_s_allocate);
1331
+ rb_define_method(rb_cSellers, "initialize", rb_Sellers_initialize, 1);
1332
+ rb_define_method(rb_cSellers, "pattern", rb_Sellers_pattern, 0); /* Sellers registers its own pattern accessors instead of the General ones */
1333
+ rb_define_method(rb_cSellers, "pattern=", rb_Sellers_pattern_set, 1);
1334
+ rb_define_method(rb_cSellers, "substitution", rb_Sellers_substitution, 0);
1335
+ rb_define_method(rb_cSellers, "substitution=", rb_Sellers_substitution_set, 1);
1336
+ rb_define_method(rb_cSellers, "deletion", rb_Sellers_deletion, 0);
1337
+ rb_define_method(rb_cSellers, "deletion=", rb_Sellers_deletion_set, 1);
1338
+ rb_define_method(rb_cSellers, "insertion", rb_Sellers_insertion, 0);
1339
+ rb_define_method(rb_cSellers, "insertion=", rb_Sellers_insertion_set, 1);
1340
+ rb_define_method(rb_cSellers, "reset_weights", rb_Sellers_reset_weights, 0);
1341
+ rb_define_method(rb_cSellers, "match", rb_Sellers_match, 1);
1342
+ rb_define_method(rb_cSellers, "search", rb_Sellers_search, 1);
1343
+ rb_define_method(rb_cSellers, "similar", rb_Sellers_similar, 1); /* note: no String convenience method is registered for Sellers */
1344
+
1345
+ /* Hamming */
1346
+ rb_cHamming = rb_define_class_under(rb_mAmatch, "Hamming", rb_cObject);
1347
+ rb_define_alloc_func(rb_cHamming, rb_Hamming_s_allocate);
1348
+ rb_define_method(rb_cHamming, "initialize", rb_Hamming_initialize, 1);
1349
+ rb_define_method(rb_cHamming, "pattern", rb_General_pattern, 0);
1350
+ rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
1351
+ rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
1352
+ rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
1353
+ rb_define_method(rb_cString, "hamming_similar", rb_str_hamming_similar, 1);
1354
+
1355
+ /* Pair Distance Metric */
1356
+ rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
1357
+ rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
1358
+ rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
1359
+ rb_define_method(rb_cPairDistance, "pattern", rb_PairDistance_pattern, 0);
1360
+ rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
1361
+ rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1); /* arity -1: the C function receives (argc, argv, self) because regexp is optional */
1362
+ rb_define_alias(rb_cPairDistance, "similar", "match"); /* for PairDistance, similar is just another name for match */
1363
+ rb_define_method(rb_cString, "pair_distance_similar", rb_str_pair_distance_similar, 1);
1364
+
1365
+ /* Longest Common Subsequence */
1366
+ rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
1367
+ rb_define_alloc_func(rb_cLongestSubsequence, rb_LongestSubsequence_s_allocate);
1368
+ rb_define_method(rb_cLongestSubsequence, "initialize", rb_LongestSubsequence_initialize, 1);
1369
+ rb_define_method(rb_cLongestSubsequence, "pattern", rb_General_pattern, 0);
1370
+ rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
1371
+ rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
1372
+ rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
1373
+ rb_define_method(rb_cString, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1374
+
1375
+ /* Longest Common Substring */
1376
+ rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
1377
+ rb_define_alloc_func(rb_cLongestSubstring, rb_LongestSubstring_s_allocate);
1378
+ rb_define_method(rb_cLongestSubstring, "initialize", rb_LongestSubstring_initialize, 1);
1379
+ rb_define_method(rb_cLongestSubstring, "pattern", rb_General_pattern, 0);
1380
+ rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
1381
+ rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
1382
+ rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1383
+ rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1384
+
1385
+ id_split = rb_intern("split"); /* method IDs looked up once at load time; their users are outside this chunk */
1386
+ id_to_f = rb_intern("to_f");
1387
+ }
1388
+ /* vim: set et cin sw=4 ts=4: */