amatch 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.3
@@ -0,0 +1,117 @@
1
+ AMatch
2
+
3
+ Approximate Matching/Searching/Comparing
4
+
5
+ SYNOPSIS
6
+
7
+ require 'amatch'
8
+
9
+ m = Amatch.new("pattern")
10
+
11
+ p m.match("pattren")
12
+ p m.match(["pattren","parent"])
13
+ p m.matchr("pattren")
14
+ p m.compare("pattren")
15
+ p m.comparer("pattren")
16
+ p m.compare("pattn")
17
+ p m.comparer("pattn")
18
+ p m.search("abcpattrendef")
19
+ p m.searchr("abcpattrendef")
20
+
21
+ DESCRIPTION
22
+
23
+ This class enables your programs to do approximate matching, searching and
24
+ comparing of strings. It uses an algorithm that calculates the Levenshtein
25
+ distance between those strings to implement those features.
26
+
27
+ The Levenshtein edit distance is defined as the minimal costs involved to
28
+ transform one string into another by using three elementary operations:
29
+ deletion, insertion and substitution of a character. To transform "water" into
30
+ "wine", for instance, you have to substitute ?a -> ?i: "witer", ?t -> ?n:
31
+ "winer" and delete ?r: "wine". The edit distance between "water" and "wine" is
32
+ 3, because you have to apply three operations. The edit distance between
33
+ "wine" and "wine" is 0, of course: no operation is necessary for the
34
+ transformation -- they're already the same string. It's easy to see that more
35
+ similar strings have smaller edit distances than strings that differ a lot.
36
+
37
+ You can also use different weights for every operation to prefer special
38
+ operations over others. There are three different kinds of match methods
39
+ defined in this class: "match" computes the Levenshtein distance between a
40
+ pattern and some strings, "search" searches in some text for a special pattern
41
+ returning a minimal distance, "compare" calculates a value that can be used to
42
+ define a partial order between strings in relation to a given pattern. It's
43
+ also possible to compute a relative distance. This floating point value is
44
+ computed as absolute distance / length of search pattern.
45
+
46
+ CONSTRUCTOR
47
+
48
+ - Amatch#new(pattern)
49
+
50
+ constructs an Amatch object and initializes it with 'pattern'. If no 'pattern'
51
+ is given it has to be set with Amatch#pattern= before matching.
52
+
53
+ METHODS
54
+
55
+ - Amatch#pattern pattern string to match against
56
+
57
+ - Amatch#subw weight of one substitution (type Fixnum)
58
+
59
+ - Amatch#delw weight of one deletion (type Fixnum)
60
+
61
+ - Amatch#insw weight of one insertion (type Fixnum)
62
+
63
+ - Amatch#resetw resets all weights to their default values (=1).
64
+
65
+ The following methods require the parameter 'strings'. This parameter can be
66
+ of type String or Array of Strings. The method executes the matching operation
67
+ and returns a number if a string was given. If an array of strings was given
68
+ it returns an array of numbers.
69
+
70
+ - Amatch#match(strings)
71
+
72
+ calculates the absolute edit distance(s) between 'pattern' and 'strings' =
73
+ the Levenshtein distance in char operations. See also Amatch#pattern.
74
+
75
+ - Amatch#matchr(strings)
76
+
77
+ calculates the relative edit distance as float. This value is defined as the
78
+ edit distance divided by the length of 'pattern'. See also Amatch#pattern.
79
+
80
+ - Amatch#search(strings)
81
+
82
+ searches 'pattern' in strings and returns the edit distance by greedy
83
+ trimming prefixes or postfixes of the match.
84
+
85
+ - Amatch#searchr(strings)
86
+
87
+ does the same as Amatch#search but divides the edit distance by the length
88
+ of 'pattern' and returns the value as float.
89
+
90
+ - Amatch#compare(strings)
91
+
92
+ calculates the same absolute value as Amatch#match. The sign of the result
93
+ value is negative if the strings are shorter than 'pattern' or positive
94
+ else.
95
+
96
+ - Amatch#comparer(strings)
97
+
98
+ calculates the same absolute value as Amatch#matchr. The sign of the
99
+ result value is negative if the strings are shorter than 'pattern' or
100
+ positive else.
101
+
102
+ EXAMPLES
103
+
104
+ An agrep utility will be installed that demonstrates the usage of this
105
+ library.
106
+
107
+ AUTHOR
108
+
109
+ Florian Frank <flori@ping.de>
110
+
111
+ COPYRIGHT
112
+
113
+ Copyright (c) 2002 Florian Frank <flori@ping.de>
114
+
115
+ This is free software; you can redistribute it and/or modify it under the
116
+ terms of the GNU General Public License Version 2 as published by the Free
117
+ Software Foundation: http://www.gnu.org/copyleft/gpl.html
@@ -0,0 +1,74 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ ## $Id: agrep.rb,v 1.1.1.1 2004/09/27 19:23:42 flori Exp $
4
+ #
5
+
6
+ require 'amatch'
7
+ require 'getoptlong'
8
+
9
# Prints +msg+ followed by a usage summary that lists every entry of
# +options+, then terminates the process.
#
# msg::     text printed before the usage line.
# options:: array of [long_name, short_name, argument_flag] triples, in the
#           form handed to GetoptLong#set_options.
# status::  exit status of the process. Defaults to 0 so existing callers keep
#           their behaviour; pass a non-zero value when reporting an error.
#
# This method never returns: it always ends with Kernel#exit.
def usage(msg, options, status = 0)
    print msg, "\nUsage: #{File.basename($0)} pattern [FILE ...]\n\n"
    options.each { |long, short, arg_flag|
        # Only options that require an argument get the ARGUMENT marker.
        suffix = arg_flag == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : ''
        print " #{short}, #{long} #{suffix}\n"
    }
    print "\nReport bugs to <flori@ping.de>.\n"
    exit status
end
18
+
19
# Maximum distance (absolute, or relative when --relative is given) a line may
# have to the pattern and still be printed.
$distance = 1
options = [
    [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
    [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
    [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
]
begin
    parser = GetoptLong.new
    parser.set_options(*options)
    parser.each_option { |name, arg|
        case name.sub(/^--/, '')
        when 'distance'
            $distance = arg.to_f
        when 'relative'
            $relative = 1
        when 'verbose'
            $verbose = 1
        when 'help'
            usage('You\'ve asked for it!', options)
        end
    }
rescue
    # Invalid command line: give up with a failure status.
    exit 1
end
$pattern = ARGV.shift or usage('Pattern needed!', options)

matcher = Amatch.new($pattern)

# Decides whether +line+ is close enough to the pattern to be printed.
close_enough = lambda { |line|
    ($relative ? matcher.searchr(line) : matcher.search(line)) <= $distance
}

size = 0
start = Time.new
if ARGV.size > 0 then
    ARGV.each { |filename|
        begin
            # File.stat is inside the begin block so that a missing or
            # unreadable file is reported and skipped instead of aborting
            # the whole run.
            File.stat(filename).file? or next
            size += File.size(filename)
            # The block form closes the file as soon as it has been scanned.
            File.open(filename, 'r') { |file|
                file.each_line { |line|
                    print "#{filename}:#{line}" if close_enough.call(line)
                }
            }
        rescue
            $stderr.print "Failure at #{filename}: #{$!} => Skipping!\n"
        end
    }
else
    $stdin.each_line { |line|
        size += line.size
        print line if close_enough.call(line)
    }
end
time = Time.new - start
$verbose and $stderr.printf "%.3f secs running, scanned %.3f KB/s.\n",
    time, size / time / 1024
exit 0
@@ -0,0 +1,2 @@
1
+ amatch.c
2
+ extconf.rb
@@ -0,0 +1,317 @@
1
+ #include "ruby.h"
2
+
3
+ static VALUE cAmatch;
4
+
5
+ /*
6
+ * Vector stuff
7
+ */
8
+
9
/*
 * A heap-allocated array of ints together with its length. vector_new
 * allocates len + 1 ints for ptr, so the indices 0..len are all valid.
 */
typedef struct vector {
    int *ptr;   /* cell storage */
    int len;    /* index of the last cell */
} vector;
13
+
14
+ static vector *
15
+ vector_new(len)
16
+ int len;
17
+ {
18
+ vector *v;
19
+ v = ALLOC(vector);
20
+ if (v == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector");
21
+ v->ptr = ALLOC_N(int, len + 1);
22
+ if (v->ptr == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector data");
23
+ v->len = len;
24
+ return v;
25
+ }
26
+
27
+ static void
28
+ vector_print(v)
29
+ vector *v;
30
+ {
31
+ int i;
32
+ for(i = 0; i < v->len; i++) printf("%d", v->ptr[i]);
33
+ puts("");
34
+ }
35
+
36
+ static void
37
+ vector_destroy(v)
38
+ vector *v;
39
+ {
40
+ xfree(v->ptr);
41
+ xfree(v);
42
+ }
43
+
44
+ static int
45
+ vector_minimum(v)
46
+ vector *v;
47
+ {
48
+ int i;
49
+ int min;
50
+
51
+ if (v->len == 0) return -1;
52
+ min = v->ptr[0];
53
+ for (i = 1; i <= v->len; i++) {
54
+ if (min > v->ptr[i]) min = v->ptr[i];
55
+ }
56
+ return min;
57
+ }
58
+
59
+ static int
60
+ vector_last(v)
61
+ vector *v;
62
+ {
63
+ return v->ptr[v->len];
64
+ }
65
+
66
+ /*
67
+ * Edit distances are calculated here
68
+ */
69
+
70
/* Operation selectors passed as the 'mode' argument of calculate_distance. */
enum {
    MATCH = 1,  /* absolute edit distance */
    MATCHR,     /* edit distance divided by the pattern length */
    SEARCH,     /* minimum over the last row of the distance table */
    SEARCHR,    /* SEARCH divided by the pattern length */
    COMPARE,    /* MATCH, negated if the string is shorter than the pattern */
    COMPARER    /* MATCHR, negated if the string is shorter than the pattern */
};
71
+
72
+ static int weight2int(weight, name)
73
+ VALUE weight;
74
+ char *name;
75
+ {
76
+ if (TYPE(weight) != T_FIXNUM) {
77
+ rb_raise(rb_eTypeError,
78
+ "value of weight %s has to be of type Fixnum (%s given)",
79
+ "subw", NIL_P(weight) ? "NilClass" : rb_class2name(CLASS_OF(weight)));
80
+ }
81
+ return FIX2INT(weight);
82
+ }
83
+
84
+ static VALUE
85
+ calculate_distance (self, string, mode)
86
+ VALUE self;
87
+ VALUE string;
88
+ char mode;
89
+ {
90
+ VALUE pattern, tmp;
91
+ static VALUE result;
92
+ int pattern_len, string_len;
93
+ char *pattern_ptr, *string_ptr;
94
+ vector *v[2];
95
+ int weight, sw, dw, iw, i, j, tmpi;
96
+ int c = 0, p = 1;
97
+
98
+ Check_Type(string, T_STRING);
99
+ string_ptr = RSTRING(string)->ptr;
100
+ string_len = RSTRING(string)->len;
101
+
102
+ pattern = rb_iv_get(self, "@pattern");
103
+ Check_Type(pattern, T_STRING);
104
+ pattern_ptr = RSTRING(pattern)->ptr;
105
+ pattern_len = RSTRING(pattern)->len;
106
+
107
+ sw = weight2int(rb_iv_get(self, "@subw"), "subw");
108
+ dw = weight2int(rb_iv_get(self, "@delw"), "delw");
109
+ iw = weight2int(rb_iv_get(self, "@insw"), "insw");
110
+
111
+ v[0] = vector_new(string_len);
112
+ switch (mode) {
113
+ case MATCH:
114
+ case MATCHR:
115
+ case COMPARE:
116
+ case COMPARER:
117
+ for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = i * iw;
118
+ break;
119
+ case SEARCH:
120
+ case SEARCHR:
121
+ for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = 0;
122
+ break;
123
+ default:
124
+ rb_raise(rb_eFatal, "unknown mode in calculate_distance");
125
+ }
126
+
127
+ v[1] = vector_new(string_len);
128
+ for (i = 1; i <= pattern_len; i++) {
129
+ c = i % 2; /* current row */
130
+ p = (i - 1) % 2; /* previous row */
131
+ v[c]->ptr[0] = i * dw; /* first column */
132
+ for (j = 1; j <= string_len; j++) {
133
+ /* Bellman's principle of optimality: */
134
+ weight = v[p]->ptr[j - 1] +
135
+ (pattern_ptr[i - 1] == string_ptr[j - 1] ? 0 : sw);
136
+ if (weight > v[p]->ptr[j] + 1) weight = v[p]->ptr[j] + dw;
137
+ if (weight > v[c]->ptr[j - 1] + 1) weight = v[c]->ptr[j - 1] + iw;
138
+ v[c]->ptr[j] = weight;
139
+ }
140
+ }
141
+ switch (mode) {
142
+ case MATCH:
143
+ result = INT2FIX(vector_last(v[c]));
144
+ break;
145
+ case MATCHR:
146
+ result = rb_float_new((double) vector_last(v[c]) / pattern_len);
147
+ break;
148
+ case SEARCH:
149
+ tmpi = vector_minimum(v[c]);
150
+ result = tmpi < 0 ? INT2FIX(pattern_len) : INT2FIX(tmpi);
151
+ break;
152
+ case SEARCHR:
153
+ tmpi = vector_minimum(v[c]);
154
+ result = rb_float_new( tmpi < 0 ? 1.0 : (double) tmpi / pattern_len);
155
+ break;
156
+ case COMPARE:
157
+ result = INT2FIX((string_len < pattern_len ? -1 : 1) *
158
+ vector_last(v[c]));
159
+ break;
160
+ case COMPARER:
161
+ result = rb_float_new((double)
162
+ (string_len < pattern_len ? -1 : 1) *
163
+ vector_last(v[c]) / pattern_len);
164
+ break;
165
+ default:
166
+ rb_raise(rb_eFatal, "unknown mode in calculate_distance");
167
+ }
168
+ vector_destroy(v[0]);
169
+ vector_destroy(v[1]);
170
+ return result;
171
+ }
172
+
173
+ static VALUE
174
+ handle_strings(self, strings, mode)
175
+ VALUE self;
176
+ VALUE strings;
177
+ char mode;
178
+ {
179
+ if (TYPE(strings) == T_ARRAY) {
180
+ int i;
181
+ VALUE result = rb_ary_new2(RARRAY(strings)->len);
182
+ for (i = 0; i < RARRAY(strings)->len; i++) {
183
+ VALUE string = rb_ary_entry(strings, i);
184
+ if (TYPE(string) != T_STRING) {
185
+ rb_raise(rb_eTypeError,
186
+ "array has to contain only strings (%s given)",
187
+ NIL_P(string) ? "NilClass" :
188
+ rb_class2name(CLASS_OF(string)));
189
+ }
190
+ rb_ary_push(result, calculate_distance(self, string, mode));
191
+ }
192
+ return result;
193
+ } else if (TYPE(strings) == T_STRING) {
194
+ return calculate_distance(self, strings, mode);
195
+ } else {
196
+ rb_raise(rb_eTypeError,
197
+ "value of strings needs to be string or array (%s given)",
198
+ NIL_P(strings) ? "NilClass" : rb_class2name(CLASS_OF(strings)));
199
+ }
200
+ }
201
+
202
+ /*
203
+ * Ruby API
204
+ */
205
+
206
+ static VALUE
207
+ rb_amatch_resetw(self)
208
+ VALUE self;
209
+ {
210
+ rb_iv_set(self, "@subw", INT2FIX(1));
211
+ rb_iv_set(self, "@delw", INT2FIX(1));
212
+ rb_iv_set(self, "@insw", INT2FIX(1));
213
+
214
+ return Qtrue;
215
+ }
216
+
217
+ static VALUE
218
+ rb_amatch_initialize(argc, argv, self)
219
+ int argc;
220
+ VALUE* argv;
221
+ VALUE self;
222
+ {
223
+ VALUE pattern;
224
+
225
+ rb_scan_args(argc, argv, "01", &pattern);
226
+ Check_Type(pattern, T_STRING);
227
+ rb_iv_set(self, "@pattern", pattern);
228
+
229
+ rb_amatch_resetw(self);
230
+
231
+ return self;
232
+ }
233
+
234
+ static VALUE
235
+ rb_amatch_pattern_is(self, pattern)
236
+ VALUE self;
237
+ VALUE pattern;
238
+ {
239
+ Check_Type(pattern, T_STRING);
240
+ rb_iv_set(self, "@pattern", pattern);
241
+
242
+ return pattern;
243
+ }
244
+
245
+
246
+ static VALUE
247
+ rb_amatch_match(self, strings)
248
+ VALUE self;
249
+ VALUE strings;
250
+ {
251
+ return handle_strings(self, strings, MATCH);
252
+ }
253
+
254
+ static VALUE
255
+ rb_amatch_matchr(self, strings)
256
+ VALUE self;
257
+ VALUE strings;
258
+ {
259
+ return handle_strings(self, strings, MATCHR);
260
+ }
261
+
262
+ static VALUE
263
+ rb_amatch_compare(self, strings)
264
+ VALUE self;
265
+ VALUE strings;
266
+ {
267
+ return handle_strings(self, strings, COMPARE);
268
+ }
269
+
270
+ static VALUE
271
+ rb_amatch_comparer(self, strings)
272
+ VALUE self;
273
+ VALUE strings;
274
+ {
275
+ return handle_strings(self, strings, COMPARER);
276
+ }
277
+
278
+
279
+ static VALUE
280
+ rb_amatch_search(self, strings)
281
+ VALUE self;
282
+ VALUE strings;
283
+ {
284
+ return handle_strings(self, strings, SEARCH);
285
+ }
286
+
287
+ static VALUE
288
+ rb_amatch_searchr(self, strings)
289
+ VALUE self;
290
+ VALUE strings;
291
+ {
292
+ return handle_strings(self, strings, SEARCHR);
293
+ }
294
+
295
+ void
296
+ Init_amatch()
297
+ {
298
+ cAmatch = rb_define_class("Amatch", rb_cObject);
299
+ rb_define_method(cAmatch, "initialize", rb_amatch_initialize, -1);
300
+
301
+ rb_define_attr(cAmatch, "debug", 1, 1);
302
+ rb_define_attr(cAmatch, "subw", 1, 1);
303
+ rb_define_attr(cAmatch, "delw", 1, 1);
304
+ rb_define_attr(cAmatch, "insw", 1, 1);
305
+ rb_define_method(cAmatch, "resetw", rb_amatch_resetw, 0);
306
+
307
+ rb_define_method(cAmatch, "pattern=", rb_amatch_pattern_is, 1);
308
+ rb_define_attr(cAmatch, "pattern", 1, 0);
309
+
310
+ rb_define_method(cAmatch, "match", rb_amatch_match, 1);
311
+ rb_define_method(cAmatch, "matchr", rb_amatch_matchr, 1);
312
+ rb_define_method(cAmatch, "compare", rb_amatch_compare, 1);
313
+ rb_define_method(cAmatch, "comparer", rb_amatch_comparer, 1);
314
+ rb_define_method(cAmatch, "search", rb_amatch_search, 1);
315
+ rb_define_method(cAmatch, "searchr", rb_amatch_searchr, 1);
316
+ }
317
+ /* vim: set cin sw=4 ts=4: */