amatch 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.3
@@ -0,0 +1,117 @@
1
+ AMatch
2
+
3
+ Approximate Matching/Searching/Comparing
4
+
5
+ SYNOPSIS
6
+
7
+ require 'amatch'
8
+
9
+ m = Amatch.new("pattern")
10
+
11
+ p m.match("pattren")
12
+ p m.match(["pattren","parent"])
13
+ p m.matchr("pattren")
14
+ p m.compare("pattren")
15
+ p m.comparer("pattren")
16
+ p m.compare("pattn")
17
+ p m.comparer("pattn")
18
+ p m.search("abcpattrendef")
19
+ p m.searchr("abcpattrendef")
20
+
21
+ DESCRIPTION
22
+
23
+ This class enables your programs to do approximate matching, searching and
24
+ comparing of strings. It uses an algorithm that calculates the Levenshtein
25
+ distance between those strings to implement those features.
26
+
27
+ The Levenshtein edit distance is defined as the minimal cost required to
28
+ transform one string into another by using three elementary operations:
29
+ deletion, insertion and substitution of a character. To transform "water" into
30
+ "wine", for instance, you have to substitute ?a -> ?i: "witer", ?t -> ?n:
31
+ "winer" and delete ?r: "wine". The edit distance between "water" and "wine" is
32
+ 3, because you have to apply three operations. The edit distance between
33
+ "wine" and "wine" is 0, of course: no operation is necessary for the
34
+ transformation -- they're already the same string. It's easy to see that more
35
+ similar strings have smaller edit distances than strings that differ a lot.
36
+
37
+ You can also use different weights for every operation to prefer special
38
+ operations over others. There are three different kinds of match methods
39
+ defined in this class: "match" computes the Levenshtein distance between a
40
+ pattern and some strings, "search" searches in some text for a special pattern
41
+ returning a minimal distance, "compare" calculates a value that can be used to
42
+ define a partial order between strings in relation to a given pattern. It's
43
+ also possible to compute a relative distance. This floating point value is
44
+ computed as absolute distance / length of search pattern.
45
+
46
+ CONSTRUCTOR
47
+
48
+ - Amatch#new(pattern)
49
+
50
+ constructs an Amatch object and initializes it with 'pattern'. If no 'pattern'
51
+ is given it has to be set with Amatch#pattern= before matching.
52
+
53
+ METHODS
54
+
55
+ - Amatch#pattern pattern string to match against
56
+
57
+ - Amatch#subw weight of one substitution (type Fixnum)
58
+
59
+ - Amatch#delw weight of one deletion (type Fixnum)
60
+
61
+ - Amatch#insw weight of one insertion (type Fixnum)
62
+
63
+ - Amatch#resetw resets all weights to their default values (=1).
64
+
65
+ The following methods require the parameter 'strings'. This parameter can be
66
+ of type String or Array of Strings. The method executes the matching operation
67
+ and returns a number if a string was given. If an array of strings was given
68
+ it returns an array of numbers.
69
+
70
+ - Amatch#match(strings)
71
+
72
+ calculates the absolute edit distance(s) between 'pattern' and 'strings' =
73
+ the Levenshtein distance in char operations. See also Amatch#pattern.
74
+
75
+ - Amatch#matchr(strings)
76
+
77
+ calculates the relative edit distance as float. This value is defined as the
78
+ edit distance divided by the length of 'pattern'. See also Amatch#pattern.
79
+
80
+ - Amatch#search(strings)
81
+
82
+ searches 'pattern' in strings and returns the edit distance by greedily
83
+ trimming prefixes or postfixes of the match.
84
+
85
+ - Amatch#searchr(strings)
86
+
87
+ does the same as Amatch#search but divides the edit distance by the length
88
+ of 'pattern' and returns the value as float.
89
+
90
+ - Amatch#compare(strings)
91
+
92
+ calculates the same absolute value as Amatch#match. The sign of the result
93
+ value is negative if the strings are shorter than 'pattern' or positive
94
+ else.
95
+
96
+ - Amatch#comparer(strings)
97
+
98
+ calculates the same absolute value as Amatch#matchr. The sign of the
99
+ result value is negative if the strings are shorter than 'pattern' or
100
+ positive else.
101
+
102
+ EXAMPLES
103
+
104
+ An agrep utility will be installed that demonstrates the usage of this
105
+ library.
106
+
107
+ AUTHOR
108
+
109
+ Florian Frank <flori@ping.de>
110
+
111
+ COPYRIGHT
112
+
113
+ Copyright (c) 2002 Florian Frank <flori@ping.de>
114
+
115
+ This is free software; you can redistribute it and/or modify it under the
116
+ terms of the GNU General Public License Version 2 as published by the Free
117
+ Software Foundation: http://www.gnu.org/copyleft/gpl.html
@@ -0,0 +1,74 @@
1
+ #! /usr/bin/env ruby
2
+ #
3
+ ## $Id: agrep.rb,v 1.1.1.1 2004/09/27 19:23:42 flori Exp $
4
+ #
5
+
6
+ require 'amatch'
7
+ require 'getoptlong'
8
+
9
# Prints +msg+, a usage line and one line per entry of +options+ to stdout,
# then terminates the process with exit status 0.
#
# msg::     text shown before the usage line
# options:: array of [long_name, short_name, argument_flag] triples as handed
#           to GetoptLong#set_options; entries that take a required argument
#           are marked with the word ARGUMENT
def usage(msg, options)
  print msg, "\nUsage: #{File.basename($0)} pattern [FILE ...]\n\n"
  options.each do |long, short, kind|
    marker = kind == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : ''
    print " #{short}, #{long} #{marker}\n"
  end
  print "\nReport bugs to <flori@ping.de>.\n"
  exit 0
end
18
+
19
# Maximum distance a line may have from the pattern and still be printed.
$distance = 1

# Option table, shared by the parser and by usage().
options = [
  [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
  [ '--verbose',  '-v', GetoptLong::NO_ARGUMENT ],
  [ '--help',     '-h', GetoptLong::NO_ARGUMENT ],
]

begin
  parser = GetoptLong.new
  parser.set_options(*options)
  parser.each_option do |name, arg|
    case name.sub(/^--/, '')
    when 'distance'
      $distance = arg.to_f
    when 'relative'
      $relative = 1
    when 'verbose'
      $verbose = 1
    when 'help'
      usage('You\'ve asked for it!', options)
    end
  end
rescue
  # Any error raised while parsing the options ends the run with status 1.
  exit 1
end
$pattern = ARGV.shift or usage('Pattern needed!', options)

matcher = Amatch.new($pattern)
size = 0
start = Time.new

# True when +line+ is within $distance of the pattern, using the relative
# search if --relative was given and the absolute one otherwise.
close_enough = lambda do |line|
  ($relative ? matcher.searchr(line) : matcher.search(line)) <= $distance
end

if ARGV.size > 0
  ARGV.each do |filename|
    begin
      # File.stat raises for a missing or unreadable path; keeping it inside
      # this begin block means such a file is reported and skipped instead of
      # aborting the whole run.
      next unless File.stat(filename).file?
      size += File.size(filename)
      # Block form closes the handle as soon as the file has been scanned.
      File.open(filename, 'r') do |file|
        file.each_line do |line|
          print "#{filename}:#{line}" if close_enough.call(line)
        end
      end
    rescue
      $stderr.print "Failure at #{filename}: #{$!} => Skipping!\n"
    end
  end
else
  $stdin.each_line do |line|
    size += line.size
    print line if close_enough.call(line)
  end
end

time = Time.new - start
$verbose and $stderr.printf "%.3f secs running, scanned %.3f KB/s.\n",
  time, size / time / 1024
exit 0
@@ -0,0 +1,2 @@
1
+ amatch.c
2
+ extconf.rb
@@ -0,0 +1,317 @@
1
+ #include "ruby.h"
2
+
3
/* The Amatch class object; assigned in Init_amatch. */
static VALUE cAmatch;
4
+
5
+ /*
6
+ * Vector stuff
7
+ */
8
+
9
/* An int array together with its highest valid index. */
typedef struct {
    int *ptr;   /* the cells; vector_new allocates len + 1 of them */
    int len;    /* highest valid index into ptr */
} vector;
13
+
14
+ static vector *
15
+ vector_new(len)
16
+ int len;
17
+ {
18
+ vector *v;
19
+ v = ALLOC(vector);
20
+ if (v == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector");
21
+ v->ptr = ALLOC_N(int, len + 1);
22
+ if (v->ptr == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector data");
23
+ v->len = len;
24
+ return v;
25
+ }
26
+
27
+ static void
28
+ vector_print(v)
29
+ vector *v;
30
+ {
31
+ int i;
32
+ for(i = 0; i < v->len; i++) printf("%d", v->ptr[i]);
33
+ puts("");
34
+ }
35
+
36
+ static void
37
+ vector_destroy(v)
38
+ vector *v;
39
+ {
40
+ xfree(v->ptr);
41
+ xfree(v);
42
+ }
43
+
44
+ static int
45
+ vector_minimum(v)
46
+ vector *v;
47
+ {
48
+ int i;
49
+ int min;
50
+
51
+ if (v->len == 0) return -1;
52
+ min = v->ptr[0];
53
+ for (i = 1; i <= v->len; i++) {
54
+ if (min > v->ptr[i]) min = v->ptr[i];
55
+ }
56
+ return min;
57
+ }
58
+
59
+ static int
60
+ vector_last(v)
61
+ vector *v;
62
+ {
63
+ return v->ptr[v->len];
64
+ }
65
+
66
+ /*
67
+ * Edit distances are calculated here
68
+ */
69
+
70
/*
 * Operation selectors handed as 'mode' to handle_strings and
 * calculate_distance. The selectors ending in R pick the variants whose
 * result is a float divided by the pattern length.
 */
enum { MATCH = 1, MATCHR, SEARCH, SEARCHR, COMPARE, COMPARER };
71
+
72
+ static int weight2int(weight, name)
73
+ VALUE weight;
74
+ char *name;
75
+ {
76
+ if (TYPE(weight) != T_FIXNUM) {
77
+ rb_raise(rb_eTypeError,
78
+ "value of weight %s has to be of type Fixnum (%s given)",
79
+ "subw", NIL_P(weight) ? "NilClass" : rb_class2name(CLASS_OF(weight)));
80
+ }
81
+ return FIX2INT(weight);
82
+ }
83
+
84
+ static VALUE
85
+ calculate_distance (self, string, mode)
86
+ VALUE self;
87
+ VALUE string;
88
+ char mode;
89
+ {
90
+ VALUE pattern, tmp;
91
+ static VALUE result;
92
+ int pattern_len, string_len;
93
+ char *pattern_ptr, *string_ptr;
94
+ vector *v[2];
95
+ int weight, sw, dw, iw, i, j, tmpi;
96
+ int c = 0, p = 1;
97
+
98
+ Check_Type(string, T_STRING);
99
+ string_ptr = RSTRING(string)->ptr;
100
+ string_len = RSTRING(string)->len;
101
+
102
+ pattern = rb_iv_get(self, "@pattern");
103
+ Check_Type(pattern, T_STRING);
104
+ pattern_ptr = RSTRING(pattern)->ptr;
105
+ pattern_len = RSTRING(pattern)->len;
106
+
107
+ sw = weight2int(rb_iv_get(self, "@subw"), "subw");
108
+ dw = weight2int(rb_iv_get(self, "@delw"), "delw");
109
+ iw = weight2int(rb_iv_get(self, "@insw"), "insw");
110
+
111
+ v[0] = vector_new(string_len);
112
+ switch (mode) {
113
+ case MATCH:
114
+ case MATCHR:
115
+ case COMPARE:
116
+ case COMPARER:
117
+ for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = i * iw;
118
+ break;
119
+ case SEARCH:
120
+ case SEARCHR:
121
+ for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = 0;
122
+ break;
123
+ default:
124
+ rb_raise(rb_eFatal, "unknown mode in calculate_distance");
125
+ }
126
+
127
+ v[1] = vector_new(string_len);
128
+ for (i = 1; i <= pattern_len; i++) {
129
+ c = i % 2; /* current row */
130
+ p = (i - 1) % 2; /* previous row */
131
+ v[c]->ptr[0] = i * dw; /* first column */
132
+ for (j = 1; j <= string_len; j++) {
133
+ /* Bellman's principle of optimality: */
134
+ weight = v[p]->ptr[j - 1] +
135
+ (pattern_ptr[i - 1] == string_ptr[j - 1] ? 0 : sw);
136
+ if (weight > v[p]->ptr[j] + 1) weight = v[p]->ptr[j] + dw;
137
+ if (weight > v[c]->ptr[j - 1] + 1) weight = v[c]->ptr[j - 1] + iw;
138
+ v[c]->ptr[j] = weight;
139
+ }
140
+ }
141
+ switch (mode) {
142
+ case MATCH:
143
+ result = INT2FIX(vector_last(v[c]));
144
+ break;
145
+ case MATCHR:
146
+ result = rb_float_new((double) vector_last(v[c]) / pattern_len);
147
+ break;
148
+ case SEARCH:
149
+ tmpi = vector_minimum(v[c]);
150
+ result = tmpi < 0 ? INT2FIX(pattern_len) : INT2FIX(tmpi);
151
+ break;
152
+ case SEARCHR:
153
+ tmpi = vector_minimum(v[c]);
154
+ result = rb_float_new( tmpi < 0 ? 1.0 : (double) tmpi / pattern_len);
155
+ break;
156
+ case COMPARE:
157
+ result = INT2FIX((string_len < pattern_len ? -1 : 1) *
158
+ vector_last(v[c]));
159
+ break;
160
+ case COMPARER:
161
+ result = rb_float_new((double)
162
+ (string_len < pattern_len ? -1 : 1) *
163
+ vector_last(v[c]) / pattern_len);
164
+ break;
165
+ default:
166
+ rb_raise(rb_eFatal, "unknown mode in calculate_distance");
167
+ }
168
+ vector_destroy(v[0]);
169
+ vector_destroy(v[1]);
170
+ return result;
171
+ }
172
+
173
+ static VALUE
174
+ handle_strings(self, strings, mode)
175
+ VALUE self;
176
+ VALUE strings;
177
+ char mode;
178
+ {
179
+ if (TYPE(strings) == T_ARRAY) {
180
+ int i;
181
+ VALUE result = rb_ary_new2(RARRAY(strings)->len);
182
+ for (i = 0; i < RARRAY(strings)->len; i++) {
183
+ VALUE string = rb_ary_entry(strings, i);
184
+ if (TYPE(string) != T_STRING) {
185
+ rb_raise(rb_eTypeError,
186
+ "array has to contain only strings (%s given)",
187
+ NIL_P(string) ? "NilClass" :
188
+ rb_class2name(CLASS_OF(string)));
189
+ }
190
+ rb_ary_push(result, calculate_distance(self, string, mode));
191
+ }
192
+ return result;
193
+ } else if (TYPE(strings) == T_STRING) {
194
+ return calculate_distance(self, strings, mode);
195
+ } else {
196
+ rb_raise(rb_eTypeError,
197
+ "value of strings needs to be string or array (%s given)",
198
+ NIL_P(strings) ? "NilClass" : rb_class2name(CLASS_OF(strings)));
199
+ }
200
+ }
201
+
202
+ /*
203
+ * Ruby API
204
+ */
205
+
206
+ static VALUE
207
+ rb_amatch_resetw(self)
208
+ VALUE self;
209
+ {
210
+ rb_iv_set(self, "@subw", INT2FIX(1));
211
+ rb_iv_set(self, "@delw", INT2FIX(1));
212
+ rb_iv_set(self, "@insw", INT2FIX(1));
213
+
214
+ return Qtrue;
215
+ }
216
+
217
+ static VALUE
218
+ rb_amatch_initialize(argc, argv, self)
219
+ int argc;
220
+ VALUE* argv;
221
+ VALUE self;
222
+ {
223
+ VALUE pattern;
224
+
225
+ rb_scan_args(argc, argv, "01", &pattern);
226
+ Check_Type(pattern, T_STRING);
227
+ rb_iv_set(self, "@pattern", pattern);
228
+
229
+ rb_amatch_resetw(self);
230
+
231
+ return self;
232
+ }
233
+
234
+ static VALUE
235
+ rb_amatch_pattern_is(self, pattern)
236
+ VALUE self;
237
+ VALUE pattern;
238
+ {
239
+ Check_Type(pattern, T_STRING);
240
+ rb_iv_set(self, "@pattern", pattern);
241
+
242
+ return pattern;
243
+ }
244
+
245
+
246
+ static VALUE
247
+ rb_amatch_match(self, strings)
248
+ VALUE self;
249
+ VALUE strings;
250
+ {
251
+ return handle_strings(self, strings, MATCH);
252
+ }
253
+
254
+ static VALUE
255
+ rb_amatch_matchr(self, strings)
256
+ VALUE self;
257
+ VALUE strings;
258
+ {
259
+ return handle_strings(self, strings, MATCHR);
260
+ }
261
+
262
+ static VALUE
263
+ rb_amatch_compare(self, strings)
264
+ VALUE self;
265
+ VALUE strings;
266
+ {
267
+ return handle_strings(self, strings, COMPARE);
268
+ }
269
+
270
+ static VALUE
271
+ rb_amatch_comparer(self, strings)
272
+ VALUE self;
273
+ VALUE strings;
274
+ {
275
+ return handle_strings(self, strings, COMPARER);
276
+ }
277
+
278
+
279
+ static VALUE
280
+ rb_amatch_search(self, strings)
281
+ VALUE self;
282
+ VALUE strings;
283
+ {
284
+ return handle_strings(self, strings, SEARCH);
285
+ }
286
+
287
+ static VALUE
288
+ rb_amatch_searchr(self, strings)
289
+ VALUE self;
290
+ VALUE strings;
291
+ {
292
+ return handle_strings(self, strings, SEARCHR);
293
+ }
294
+
295
+ void
296
+ Init_amatch()
297
+ {
298
+ cAmatch = rb_define_class("Amatch", rb_cObject);
299
+ rb_define_method(cAmatch, "initialize", rb_amatch_initialize, -1);
300
+
301
+ rb_define_attr(cAmatch, "debug", 1, 1);
302
+ rb_define_attr(cAmatch, "subw", 1, 1);
303
+ rb_define_attr(cAmatch, "delw", 1, 1);
304
+ rb_define_attr(cAmatch, "insw", 1, 1);
305
+ rb_define_method(cAmatch, "resetw", rb_amatch_resetw, 0);
306
+
307
+ rb_define_method(cAmatch, "pattern=", rb_amatch_pattern_is, 1);
308
+ rb_define_attr(cAmatch, "pattern", 1, 0);
309
+
310
+ rb_define_method(cAmatch, "match", rb_amatch_match, 1);
311
+ rb_define_method(cAmatch, "matchr", rb_amatch_matchr, 1);
312
+ rb_define_method(cAmatch, "compare", rb_amatch_compare, 1);
313
+ rb_define_method(cAmatch, "comparer", rb_amatch_comparer, 1);
314
+ rb_define_method(cAmatch, "search", rb_amatch_search, 1);
315
+ rb_define_method(cAmatch, "searchr", rb_amatch_searchr, 1);
316
+ }
317
+ /* vim: set cin sw=4 ts=4: */