amatch 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGES +8 -0
- data/GPL +340 -0
- data/README.en +27 -0
- data/Rakefile +99 -0
- data/VERSION +1 -0
- data/amatch.txt.en +117 -0
- data/bin/agrep.rb +74 -0
- data/ext/MANIFEST +2 -0
- data/ext/amatch.c +317 -0
- data/ext/extconf.rb +6 -0
- data/index.html +9 -0
- data/install.rb +1015 -0
- data/test.rb +94 -0
- metadata +57 -0
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.3
|
data/amatch.txt.en
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
AMatch
|
2
|
+
|
3
|
+
Approximate Matching/Searching/Comparing
|
4
|
+
|
5
|
+
SYNOPSIS
|
6
|
+
|
7
|
+
require 'amatch'
|
8
|
+
|
9
|
+
m = Amatch.new("pattern")
|
10
|
+
|
11
|
+
p m.match("pattren")
|
12
|
+
p m.match(["pattren","parent"])
|
13
|
+
p m.matchr("pattren")
|
14
|
+
p m.compare("pattren")
|
15
|
+
p m.comparer("pattren")
|
16
|
+
p m.compare("pattn")
|
17
|
+
p m.comparer("pattn")
|
18
|
+
p m.search("abcpattrendef")
|
19
|
+
p m.searchr("abcpattrendef")
|
20
|
+
|
21
|
+
DESCRIPTION
|
22
|
+
|
23
|
+
This class enables your programs to do approximate matching, searching and
|
24
|
+
comparing of strings. It uses an algorithm that calculates the Levenshtein
|
25
|
+
distance between those strings to implement those features.
|
26
|
+
|
27
|
+
The Levenshtein edit distance is defined as the minimal cost required to
|
28
|
+
transform one string into another by using three elementary operations:
|
29
|
+
deletion, insertion and substitution of a character. To transform "water" into
|
30
|
+
"wine", for instance, you have to substitute ?a -> ?i: "witer", ?t -> ?n:
|
31
|
+
"winer" and delete ?r: "wine". The edit distance between "water" and "wine" is
|
32
|
+
3, because you have to apply three operations. The edit distance between
|
33
|
+
"wine" and "wine" is 0, of course: no operation is necessary for the
|
34
|
+
transformation -- they're already the same string. It's easy to see that more
|
35
|
+
similar strings have smaller edit distances than strings that differ a lot.
|
36
|
+
|
37
|
+
You can also use different weights for every operation to prefer special
|
38
|
+
operations over others. There are three different kinds of match methods
|
39
|
+
defined in this class: "match" computes the Levenshtein distance between a
|
40
|
+
pattern and some strings, "search" searches in some text for a special pattern
|
41
|
+
returning a minimal distance, "compare" calculates a value that can be used to
|
42
|
+
define a partial order between strings in relation to a given pattern. It's
|
43
|
+
also possible to compute a relative distance. This floating point value is
|
44
|
+
computed as absolute distance / length of search pattern.
|
45
|
+
|
46
|
+
CONSTRUCTOR
|
47
|
+
|
48
|
+
- Amatch#new(pattern)
|
49
|
+
|
50
|
+
constructs an Amatch object and initializes it with 'pattern'. If no 'pattern'
|
51
|
+
is given it has to be set with Amatch#pattern before matching.
|
52
|
+
|
53
|
+
METHODS
|
54
|
+
|
55
|
+
- Amatch#pattern pattern string to match against
|
56
|
+
|
57
|
+
- Amatch#subw weight of one substitution (type Fixnum)
|
58
|
+
|
59
|
+
- Amatch#delw weight of one deletion (type Fixnum)
|
60
|
+
|
61
|
+
- Amatch#insw weight of one insertion (type Fixnum)
|
62
|
+
|
63
|
+
- Amatch#resetw resets all weights to their default values (=1).
|
64
|
+
|
65
|
+
The following methods require the parameter 'strings'. This parameter can be
|
66
|
+
of type String or Array of Strings. The method executes the matching operation
|
67
|
+
and returns a number if a string was given. If an array of strings was given
|
68
|
+
it returns an array of numbers.
|
69
|
+
|
70
|
+
- Amatch#match(strings)
|
71
|
+
|
72
|
+
calculates the absolute edit distance(s) between 'pattern' and 'strings' =
|
73
|
+
the Levenshtein distance in char operations. See also Amatch#pattern.
|
74
|
+
|
75
|
+
- Amatch#matchr(strings)
|
76
|
+
|
77
|
+
calculates the relative edit distance as float. This value is defined as the
|
78
|
+
edit distance divided by the length of 'pattern'. See also Amatch#pattern.
|
79
|
+
|
80
|
+
- Amatch#search(strings)
|
81
|
+
|
82
|
+
searches for 'pattern' in strings and returns the edit distance by greedily
|
83
|
+
trimming prefixes or postfixes of the match.
|
84
|
+
|
85
|
+
- Amatch#searchr(strings)
|
86
|
+
|
87
|
+
does the same as Amatch#search but divides the edit distance by the length
|
88
|
+
of 'pattern' and returns the value as float.
|
89
|
+
|
90
|
+
- Amatch#compare(strings)
|
91
|
+
|
92
|
+
calculates the same absolute value as Amatch#match. The sign of the result
|
93
|
+
value is negative if the strings are shorter than 'pattern' or positive
|
94
|
+
otherwise.
|
95
|
+
|
96
|
+
- Amatch#comparer(strings)
|
97
|
+
|
98
|
+
calculates the same absolute value as Amatch#matchr. The sign of the
|
99
|
+
result value is negative if the strings are shorter than 'pattern' or
|
100
|
+
positive otherwise.
|
101
|
+
|
102
|
+
EXAMPLES
|
103
|
+
|
104
|
+
An agrep utility will be installed that demonstrates the usage of this
|
105
|
+
library.
|
106
|
+
|
107
|
+
AUTHOR
|
108
|
+
|
109
|
+
Florian Frank <flori@ping.de>
|
110
|
+
|
111
|
+
COPYRIGHT
|
112
|
+
|
113
|
+
Copyright (c) 2002 Florian Frank <flori@ping.de>
|
114
|
+
|
115
|
+
This is free software; you can redistribute it and/or modify it under the
|
116
|
+
terms of the GNU General Public License Version 2 as published by the Free
|
117
|
+
Software Foundation: http://www.gnu.org/copyleft/gpl.html
|
data/bin/agrep.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
## $Id: agrep.rb,v 1.1.1.1 2004/09/27 19:23:42 flori Exp $
|
4
|
+
#
|
5
|
+
|
6
|
+
require 'amatch'
|
7
|
+
require 'getoptlong'
|
8
|
+
|
9
|
+
# Prints +msg+ followed by a usage summary, one line per entry of +options+
# (each entry is [long_name, short_name, argument_flag]) and the bug-report
# address, then terminates the process with exit status 0.
def usage(msg, options)
  print msg, "\nUsage: #{File.basename($0)} pattern [FILE ...]\n\n"
  options.each do |long_name, short_name, arg_flag|
    # Only options that take a value get the ARGUMENT placeholder.
    suffix = arg_flag == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : ''
    print " " + short_name + ", " + long_name + " " + suffix + "\n"
  end
  print "\nReport bugs to <flori@ping.de>.\n"
  exit 0
end
|
18
|
+
|
19
|
+
# Command line parsing. Results are stored in globals:
#   $distance - maximum accepted distance (defaults to 1)
#   $relative - set when relative distances were requested
#   $verbose  - set when timing statistics should be reported
# Any StandardError raised while parsing ends the program with status 1.
$distance = 1
begin
  parser = GetoptLong.new
  options = [
    [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
    [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
    [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
  ]
  parser.set_options(*options)
  parser.each_option do |name, arg|
    # Dispatch on the option name without its leading dashes.
    case name.sub(/^--/, '')
    when 'distance' then $distance = arg.to_f
    when 'relative' then $relative = 1
    when 'verbose'  then $verbose = 1
    when 'help'     then usage('You\'ve asked for it!', options)
    end
  end
rescue
  exit 1
end
|
45
|
+
# Main program: take the pattern from the command line, then print every
# line of the given files (or of standard input when no file is given) whose
# search distance to the pattern is at most $distance.
$pattern = ARGV.shift or usage('Pattern needed!', options)

matcher = Amatch.new($pattern)
size = 0
start = Time.new

# True when +line+ is close enough to the pattern; uses the relative
# distance if --relative was given, the absolute distance otherwise.
close_enough = lambda { |line|
  ($relative ? matcher.searchr(line) : matcher.search(line)) <= $distance
}

if ARGV.size > 0 then
  ARGV.each { |filename|
    begin
      # File.stat raises for missing or unreadable paths; keeping it inside
      # the begin block lets such files be reported and skipped instead of
      # aborting the whole run.
      File.stat(filename).file? or next
      size += File.size(filename)
      # The block form of File.open closes the handle when the block ends.
      File.open(filename, 'r') { |file|
        file.each_line { |line|
          print "#{filename}:#{line}" if close_enough.call(line)
        }
      }
    rescue
      $stderr.print "Failure at #{filename}: #{$!} => Skipping!\n"
    end
  }
else
  $stdin.each_line { |line|
    size += line.size
    print line if close_enough.call(line)
  }
end
time = Time.new - start
$verbose and $stderr.printf "%.3f secs running, scanned %.3f KB/s.\n",
  time, size / time / 1024
exit 0
|
data/ext/MANIFEST
ADDED
data/ext/amatch.c
ADDED
@@ -0,0 +1,317 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/* Holds the Amatch class object; assigned in Init_amatch(). */
static VALUE cAmatch;
|
4
|
+
|
5
|
+
/*
|
6
|
+
* Vector stuff
|
7
|
+
*/
|
8
|
+
|
9
|
+
/*
 * One row of the distance table.  vector_new() allocates len + 1 cells, so
 * the valid indices of ptr are 0..len inclusive.
 */
typedef struct {
    int *ptr;   /* cell storage */
    int len;    /* index of the last cell */
} vector;
|
13
|
+
|
14
|
+
static vector *
|
15
|
+
vector_new(len)
|
16
|
+
int len;
|
17
|
+
{
|
18
|
+
vector *v;
|
19
|
+
v = ALLOC(vector);
|
20
|
+
if (v == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector");
|
21
|
+
v->ptr = ALLOC_N(int, len + 1);
|
22
|
+
if (v->ptr == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector data");
|
23
|
+
v->len = len;
|
24
|
+
return v;
|
25
|
+
}
|
26
|
+
|
27
|
+
static void
|
28
|
+
vector_print(v)
|
29
|
+
vector *v;
|
30
|
+
{
|
31
|
+
int i;
|
32
|
+
for(i = 0; i < v->len; i++) printf("%d", v->ptr[i]);
|
33
|
+
puts("");
|
34
|
+
}
|
35
|
+
|
36
|
+
static void
|
37
|
+
vector_destroy(v)
|
38
|
+
vector *v;
|
39
|
+
{
|
40
|
+
xfree(v->ptr);
|
41
|
+
xfree(v);
|
42
|
+
}
|
43
|
+
|
44
|
+
static int
|
45
|
+
vector_minimum(v)
|
46
|
+
vector *v;
|
47
|
+
{
|
48
|
+
int i;
|
49
|
+
int min;
|
50
|
+
|
51
|
+
if (v->len == 0) return -1;
|
52
|
+
min = v->ptr[0];
|
53
|
+
for (i = 1; i <= v->len; i++) {
|
54
|
+
if (min > v->ptr[i]) min = v->ptr[i];
|
55
|
+
}
|
56
|
+
return min;
|
57
|
+
}
|
58
|
+
|
59
|
+
static int
|
60
|
+
vector_last(v)
|
61
|
+
vector *v;
|
62
|
+
{
|
63
|
+
return v->ptr[v->len];
|
64
|
+
}
|
65
|
+
|
66
|
+
/*
|
67
|
+
* Edit distances are calculated here
|
68
|
+
*/
|
69
|
+
|
70
|
+
/* Operation modes understood by calculate_distance(). */
enum {
    MATCH = 1,  /* absolute distance */
    MATCHR,     /* distance divided by the pattern length */
    SEARCH,     /* minimum over the last row */
    SEARCHR,    /* SEARCH divided by the pattern length */
    COMPARE,    /* MATCH with a sign derived from the lengths */
    COMPARER    /* MATCHR with a sign derived from the lengths */
};
|
71
|
+
|
72
|
+
static int weight2int(weight, name)
|
73
|
+
VALUE weight;
|
74
|
+
char *name;
|
75
|
+
{
|
76
|
+
if (TYPE(weight) != T_FIXNUM) {
|
77
|
+
rb_raise(rb_eTypeError,
|
78
|
+
"value of weight %s has to be of type Fixnum (%s given)",
|
79
|
+
"subw", NIL_P(weight) ? "NilClass" : rb_class2name(CLASS_OF(weight)));
|
80
|
+
}
|
81
|
+
return FIX2INT(weight);
|
82
|
+
}
|
83
|
+
|
84
|
+
static VALUE
|
85
|
+
calculate_distance (self, string, mode)
|
86
|
+
VALUE self;
|
87
|
+
VALUE string;
|
88
|
+
char mode;
|
89
|
+
{
|
90
|
+
VALUE pattern, tmp;
|
91
|
+
static VALUE result;
|
92
|
+
int pattern_len, string_len;
|
93
|
+
char *pattern_ptr, *string_ptr;
|
94
|
+
vector *v[2];
|
95
|
+
int weight, sw, dw, iw, i, j, tmpi;
|
96
|
+
int c = 0, p = 1;
|
97
|
+
|
98
|
+
Check_Type(string, T_STRING);
|
99
|
+
string_ptr = RSTRING(string)->ptr;
|
100
|
+
string_len = RSTRING(string)->len;
|
101
|
+
|
102
|
+
pattern = rb_iv_get(self, "@pattern");
|
103
|
+
Check_Type(pattern, T_STRING);
|
104
|
+
pattern_ptr = RSTRING(pattern)->ptr;
|
105
|
+
pattern_len = RSTRING(pattern)->len;
|
106
|
+
|
107
|
+
sw = weight2int(rb_iv_get(self, "@subw"), "subw");
|
108
|
+
dw = weight2int(rb_iv_get(self, "@delw"), "delw");
|
109
|
+
iw = weight2int(rb_iv_get(self, "@insw"), "insw");
|
110
|
+
|
111
|
+
v[0] = vector_new(string_len);
|
112
|
+
switch (mode) {
|
113
|
+
case MATCH:
|
114
|
+
case MATCHR:
|
115
|
+
case COMPARE:
|
116
|
+
case COMPARER:
|
117
|
+
for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = i * iw;
|
118
|
+
break;
|
119
|
+
case SEARCH:
|
120
|
+
case SEARCHR:
|
121
|
+
for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = 0;
|
122
|
+
break;
|
123
|
+
default:
|
124
|
+
rb_raise(rb_eFatal, "unknown mode in calculate_distance");
|
125
|
+
}
|
126
|
+
|
127
|
+
v[1] = vector_new(string_len);
|
128
|
+
for (i = 1; i <= pattern_len; i++) {
|
129
|
+
c = i % 2; /* current row */
|
130
|
+
p = (i - 1) % 2; /* previous row */
|
131
|
+
v[c]->ptr[0] = i * dw; /* first column */
|
132
|
+
for (j = 1; j <= string_len; j++) {
|
133
|
+
/* Bellman's principle of optimality: */
|
134
|
+
weight = v[p]->ptr[j - 1] +
|
135
|
+
(pattern_ptr[i - 1] == string_ptr[j - 1] ? 0 : sw);
|
136
|
+
if (weight > v[p]->ptr[j] + 1) weight = v[p]->ptr[j] + dw;
|
137
|
+
if (weight > v[c]->ptr[j - 1] + 1) weight = v[c]->ptr[j - 1] + iw;
|
138
|
+
v[c]->ptr[j] = weight;
|
139
|
+
}
|
140
|
+
}
|
141
|
+
switch (mode) {
|
142
|
+
case MATCH:
|
143
|
+
result = INT2FIX(vector_last(v[c]));
|
144
|
+
break;
|
145
|
+
case MATCHR:
|
146
|
+
result = rb_float_new((double) vector_last(v[c]) / pattern_len);
|
147
|
+
break;
|
148
|
+
case SEARCH:
|
149
|
+
tmpi = vector_minimum(v[c]);
|
150
|
+
result = tmpi < 0 ? INT2FIX(pattern_len) : INT2FIX(tmpi);
|
151
|
+
break;
|
152
|
+
case SEARCHR:
|
153
|
+
tmpi = vector_minimum(v[c]);
|
154
|
+
result = rb_float_new( tmpi < 0 ? 1.0 : (double) tmpi / pattern_len);
|
155
|
+
break;
|
156
|
+
case COMPARE:
|
157
|
+
result = INT2FIX((string_len < pattern_len ? -1 : 1) *
|
158
|
+
vector_last(v[c]));
|
159
|
+
break;
|
160
|
+
case COMPARER:
|
161
|
+
result = rb_float_new((double)
|
162
|
+
(string_len < pattern_len ? -1 : 1) *
|
163
|
+
vector_last(v[c]) / pattern_len);
|
164
|
+
break;
|
165
|
+
default:
|
166
|
+
rb_raise(rb_eFatal, "unknown mode in calculate_distance");
|
167
|
+
}
|
168
|
+
vector_destroy(v[0]);
|
169
|
+
vector_destroy(v[1]);
|
170
|
+
return result;
|
171
|
+
}
|
172
|
+
|
173
|
+
static VALUE
|
174
|
+
handle_strings(self, strings, mode)
|
175
|
+
VALUE self;
|
176
|
+
VALUE strings;
|
177
|
+
char mode;
|
178
|
+
{
|
179
|
+
if (TYPE(strings) == T_ARRAY) {
|
180
|
+
int i;
|
181
|
+
VALUE result = rb_ary_new2(RARRAY(strings)->len);
|
182
|
+
for (i = 0; i < RARRAY(strings)->len; i++) {
|
183
|
+
VALUE string = rb_ary_entry(strings, i);
|
184
|
+
if (TYPE(string) != T_STRING) {
|
185
|
+
rb_raise(rb_eTypeError,
|
186
|
+
"array has to contain only strings (%s given)",
|
187
|
+
NIL_P(string) ? "NilClass" :
|
188
|
+
rb_class2name(CLASS_OF(string)));
|
189
|
+
}
|
190
|
+
rb_ary_push(result, calculate_distance(self, string, mode));
|
191
|
+
}
|
192
|
+
return result;
|
193
|
+
} else if (TYPE(strings) == T_STRING) {
|
194
|
+
return calculate_distance(self, strings, mode);
|
195
|
+
} else {
|
196
|
+
rb_raise(rb_eTypeError,
|
197
|
+
"value of strings needs to be string or array (%s given)",
|
198
|
+
NIL_P(strings) ? "NilClass" : rb_class2name(CLASS_OF(strings)));
|
199
|
+
}
|
200
|
+
}
|
201
|
+
|
202
|
+
/*
|
203
|
+
* Ruby API
|
204
|
+
*/
|
205
|
+
|
206
|
+
static VALUE
|
207
|
+
rb_amatch_resetw(self)
|
208
|
+
VALUE self;
|
209
|
+
{
|
210
|
+
rb_iv_set(self, "@subw", INT2FIX(1));
|
211
|
+
rb_iv_set(self, "@delw", INT2FIX(1));
|
212
|
+
rb_iv_set(self, "@insw", INT2FIX(1));
|
213
|
+
|
214
|
+
return Qtrue;
|
215
|
+
}
|
216
|
+
|
217
|
+
static VALUE
|
218
|
+
rb_amatch_initialize(argc, argv, self)
|
219
|
+
int argc;
|
220
|
+
VALUE* argv;
|
221
|
+
VALUE self;
|
222
|
+
{
|
223
|
+
VALUE pattern;
|
224
|
+
|
225
|
+
rb_scan_args(argc, argv, "01", &pattern);
|
226
|
+
Check_Type(pattern, T_STRING);
|
227
|
+
rb_iv_set(self, "@pattern", pattern);
|
228
|
+
|
229
|
+
rb_amatch_resetw(self);
|
230
|
+
|
231
|
+
return self;
|
232
|
+
}
|
233
|
+
|
234
|
+
static VALUE
|
235
|
+
rb_amatch_pattern_is(self, pattern)
|
236
|
+
VALUE self;
|
237
|
+
VALUE pattern;
|
238
|
+
{
|
239
|
+
Check_Type(pattern, T_STRING);
|
240
|
+
rb_iv_set(self, "@pattern", pattern);
|
241
|
+
|
242
|
+
return pattern;
|
243
|
+
}
|
244
|
+
|
245
|
+
|
246
|
+
static VALUE
|
247
|
+
rb_amatch_match(self, strings)
|
248
|
+
VALUE self;
|
249
|
+
VALUE strings;
|
250
|
+
{
|
251
|
+
return handle_strings(self, strings, MATCH);
|
252
|
+
}
|
253
|
+
|
254
|
+
static VALUE
|
255
|
+
rb_amatch_matchr(self, strings)
|
256
|
+
VALUE self;
|
257
|
+
VALUE strings;
|
258
|
+
{
|
259
|
+
return handle_strings(self, strings, MATCHR);
|
260
|
+
}
|
261
|
+
|
262
|
+
static VALUE
|
263
|
+
rb_amatch_compare(self, strings)
|
264
|
+
VALUE self;
|
265
|
+
VALUE strings;
|
266
|
+
{
|
267
|
+
return handle_strings(self, strings, COMPARE);
|
268
|
+
}
|
269
|
+
|
270
|
+
static VALUE
|
271
|
+
rb_amatch_comparer(self, strings)
|
272
|
+
VALUE self;
|
273
|
+
VALUE strings;
|
274
|
+
{
|
275
|
+
return handle_strings(self, strings, COMPARER);
|
276
|
+
}
|
277
|
+
|
278
|
+
|
279
|
+
static VALUE
|
280
|
+
rb_amatch_search(self, strings)
|
281
|
+
VALUE self;
|
282
|
+
VALUE strings;
|
283
|
+
{
|
284
|
+
return handle_strings(self, strings, SEARCH);
|
285
|
+
}
|
286
|
+
|
287
|
+
static VALUE
|
288
|
+
rb_amatch_searchr(self, strings)
|
289
|
+
VALUE self;
|
290
|
+
VALUE strings;
|
291
|
+
{
|
292
|
+
return handle_strings(self, strings, SEARCHR);
|
293
|
+
}
|
294
|
+
|
295
|
+
void
|
296
|
+
Init_amatch()
|
297
|
+
{
|
298
|
+
cAmatch = rb_define_class("Amatch", rb_cObject);
|
299
|
+
rb_define_method(cAmatch, "initialize", rb_amatch_initialize, -1);
|
300
|
+
|
301
|
+
rb_define_attr(cAmatch, "debug", 1, 1);
|
302
|
+
rb_define_attr(cAmatch, "subw", 1, 1);
|
303
|
+
rb_define_attr(cAmatch, "delw", 1, 1);
|
304
|
+
rb_define_attr(cAmatch, "insw", 1, 1);
|
305
|
+
rb_define_method(cAmatch, "resetw", rb_amatch_resetw, 0);
|
306
|
+
|
307
|
+
rb_define_method(cAmatch, "pattern=", rb_amatch_pattern_is, 1);
|
308
|
+
rb_define_attr(cAmatch, "pattern", 1, 0);
|
309
|
+
|
310
|
+
rb_define_method(cAmatch, "match", rb_amatch_match, 1);
|
311
|
+
rb_define_method(cAmatch, "matchr", rb_amatch_matchr, 1);
|
312
|
+
rb_define_method(cAmatch, "compare", rb_amatch_compare, 1);
|
313
|
+
rb_define_method(cAmatch, "comparer", rb_amatch_comparer, 1);
|
314
|
+
rb_define_method(cAmatch, "search", rb_amatch_search, 1);
|
315
|
+
rb_define_method(cAmatch, "searchr", rb_amatch_searchr, 1);
|
316
|
+
}
|
317
|
+
/* vim: set cin sw=4 ts=4: */
|