amatch 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGES +8 -0
- data/GPL +340 -0
- data/README.en +27 -0
- data/Rakefile +99 -0
- data/VERSION +1 -0
- data/amatch.txt.en +117 -0
- data/bin/agrep.rb +74 -0
- data/ext/MANIFEST +2 -0
- data/ext/amatch.c +317 -0
- data/ext/extconf.rb +6 -0
- data/index.html +9 -0
- data/install.rb +1015 -0
- data/test.rb +94 -0
- metadata +57 -0
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.3
|
data/amatch.txt.en
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
AMatch
|
2
|
+
|
3
|
+
Approximate Matching/Searching/Comparing
|
4
|
+
|
5
|
+
SYNOPSIS
|
6
|
+
|
7
|
+
require 'amatch'
|
8
|
+
|
9
|
+
m = Amatch.new("pattern")
|
10
|
+
|
11
|
+
p m.match("pattren")
|
12
|
+
p m.match(["pattren","parent"])
|
13
|
+
p m.matchr("pattren")
|
14
|
+
p m.compare("pattren")
|
15
|
+
p m.comparer("pattren")
|
16
|
+
p m.compare("pattn")
|
17
|
+
p m.comparer("pattn")
|
18
|
+
p m.search("abcpattrendef")
|
19
|
+
p m.searchr("abcpattrendef")
|
20
|
+
|
21
|
+
DESCRIPTION
|
22
|
+
|
23
|
+
This class enables your programs to do approximate matching, searching and
|
24
|
+
comparing of strings. It uses an algorithm that calculates the Levenshtein
|
25
|
+
distance between those strings to implement those features.
|
26
|
+
|
27
|
+
The Levenshtein edit distance is defined as the minimal cost required to
|
28
|
+
transform one string into another by using three elementary operations:
|
29
|
+
deletion, insertion and substitution of a character. To transform "water" into
|
30
|
+
"wine", for instance, you have to substitute ?a -> ?i: "witer", ?t -> ?n:
|
31
|
+
"winer" and delete ?r: "wine". The edit distance between "water" and "wine" is
|
32
|
+
3, because you have to apply three operations. The edit distance between
|
33
|
+
"wine" and "wine" is 0, of course: no operation is necessary for the
|
34
|
+
transformation -- they're already the same string. It's easy to see that more
|
35
|
+
similar strings have smaller edit distances than strings that differ a lot.
|
36
|
+
|
37
|
+
You can also use different weights for every operation to prefer special
|
38
|
+
operations over others. There are three different kinds of match methods
|
39
|
+
defined in this class: "match" computes the Levenshtein distance between a
|
40
|
+
pattern and some strings, "search" searches in some text for a special pattern
|
41
|
+
returning a minimal distance, "compare" calculates a value that can be used to
|
42
|
+
define a partial order between strings in relation to a given pattern. It's
|
43
|
+
also possible to compute a relative distance. This floating point value is
|
44
|
+
computed as absolute distance / length of search pattern.
|
45
|
+
|
46
|
+
CONSTRUCTOR
|
47
|
+
|
48
|
+
- Amatch#new(pattern)
|
49
|
+
|
50
|
+
constructs an Amatch object and initializes it with 'pattern'. If no 'pattern'
|
51
|
+
is given it has to be set with Amatch#pattern before matching.
|
52
|
+
|
53
|
+
METHODS
|
54
|
+
|
55
|
+
- Amatch#pattern pattern string to match against
|
56
|
+
|
57
|
+
- Amatch#subw weight of one substitution (type Fixnum)
|
58
|
+
|
59
|
+
- Amatch#delw weight of one deletion (type Fixnum)
|
60
|
+
|
61
|
+
- Amatch#insw weight of one insertion (type Fixnum)
|
62
|
+
|
63
|
+
- Amatch#resetw resets all weights to their default values (=1).
|
64
|
+
|
65
|
+
The following methods require the parameter 'strings'. This parameter can be
|
66
|
+
of type String or Array of Strings. The method executes the matching operation
|
67
|
+
and returns a number if a string was given. If an array of strings was given
|
68
|
+
it returns an array of numbers.
|
69
|
+
|
70
|
+
- Amatch#match(strings)
|
71
|
+
|
72
|
+
calculates the absolute edit distance(s) between 'pattern' and 'strings' =
|
73
|
+
the Levenshtein distance in char operations. See also Amatch#pattern.
|
74
|
+
|
75
|
+
- Amatch#matchr(strings)
|
76
|
+
|
77
|
+
calculates the relative edit distance as float. This value is defined as the
|
78
|
+
edit distance divided by the length of 'pattern'. See also Amatch#pattern.
|
79
|
+
|
80
|
+
- Amatch#search(strings)
|
81
|
+
|
82
|
+
searches 'pattern' in strings and returns the edit distance by greedy
|
83
|
+
trimming prefixes or postfixes of the match.
|
84
|
+
|
85
|
+
- Amatch#searchr(strings)
|
86
|
+
|
87
|
+
does the same as Amatch#search but divides the edit distance by the length
|
88
|
+
of 'pattern' and returns the value as float.
|
89
|
+
|
90
|
+
- Amatch#compare(strings)
|
91
|
+
|
92
|
+
calculates the same absolute value as Amatch#match. The sign of the result
|
93
|
+
value is negative if the strings are shorter than 'pattern' or positive
|
94
|
+
otherwise.
|
95
|
+
|
96
|
+
- Amatch#comparer(strings)
|
97
|
+
|
98
|
+
calculates the same absolute value as Amatch#matchr. The sign of the
|
99
|
+
result value is negative if the strings are shorter than 'pattern' or
|
100
|
+
positive otherwise.
|
101
|
+
|
102
|
+
EXAMPLES
|
103
|
+
|
104
|
+
An agrep utility will be installed that demonstrates the usage of this
|
105
|
+
library.
|
106
|
+
|
107
|
+
AUTHOR
|
108
|
+
|
109
|
+
Florian Frank <flori@ping.de>
|
110
|
+
|
111
|
+
COPYRIGHT
|
112
|
+
|
113
|
+
Copyright (c) 2002 Florian Frank <flori@ping.de>
|
114
|
+
|
115
|
+
This is free software; you can redistribute it and/or modify it under the
|
116
|
+
terms of the GNU General Public License Version 2 as published by the Free
|
117
|
+
Software Foundation: http://www.gnu.org/copyleft/gpl.html
|
data/bin/agrep.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
#! /usr/bin/env ruby
|
2
|
+
#
|
3
|
+
## $Id: agrep.rb,v 1.1.1.1 2004/09/27 19:23:42 flori Exp $
|
4
|
+
#
|
5
|
+
|
6
|
+
require 'amatch'
|
7
|
+
require 'getoptlong'
|
8
|
+
|
9
|
+
# Prints +msg+ followed by the usage line and one line per entry of
# +options+ (each entry is [long name, short name, GetoptLong argument
# flag]), then terminates the process with exit status 0.
def usage(msg, options)
  print msg, "\nUsage: #{File.basename($0)} pattern [FILE ...]\n\n"
  options.each do |long, short, flag|
    # Only options that take a value get the ARGUMENT placeholder.
    suffix = flag == GetoptLong::REQUIRED_ARGUMENT ? 'ARGUMENT' : ''
    print " #{short}, #{long} #{suffix}\n"
  end
  print "\nReport bugs to <flori@ping.de>.\n"
  exit 0
end
|
18
|
+
|
19
|
+
# Defaults; --relative and --verbose switch the corresponding flag on.
$distance = 1
$relative = nil
$verbose = nil
begin
  parser = GetoptLong.new
  options = [
    [ '--distance', '-d', GetoptLong::REQUIRED_ARGUMENT ],
    [ '--relative', '-r', GetoptLong::NO_ARGUMENT ],
    [ '--verbose', '-v', GetoptLong::NO_ARGUMENT ],
    [ '--help', '-h', GetoptLong::NO_ARGUMENT ],
  ]
  parser.set_options(*options)
  parser.each_option { |name, arg|
    name = name.sub(/^--/, '')
    case name
    when 'distance'
      $distance = arg.to_f
    when 'relative'
      $relative = 1
    when 'verbose'
      $verbose = 1
    when 'help'
      usage('You\'ve asked for it!', options)
    end
  }
rescue
  # Option parsing failed: give up with a non-zero exit status.
  exit 1
end
$pattern = ARGV.shift or usage('Pattern needed!', options)

matcher = Amatch.new($pattern)
size = 0
start = Time.new
if ARGV.size > 0 then
  ARGV.each { |filename|
    begin
      # The stat call is inside the begin block so that a missing or
      # unreadable file is reported and skipped like any other per-file
      # failure instead of aborting the whole run.
      File.stat(filename).file? or next
      size += File.size(filename)
      # Block form of File.open closes the handle when the block ends,
      # even if matching raises.
      File.open(filename, 'r') { |file|
        file.each_line { |line|
          print "#{filename}:#{line}" if
            ($relative ? matcher.searchr(line) :
              matcher.search(line)) <= $distance
        }
      }
    rescue
      $stderr.print "Failure at #{filename}: #{$!} => Skipping!\n"
    end
  }
else
  # No files given: filter standard input.
  $stdin.each_line { |line|
    size += line.size
    print line if ($relative ? matcher.searchr(line) :
      matcher.search(line)) <= $distance
  }
end
time = Time.new - start
$verbose and $stderr.printf "%.3f secs running, scanned %.3f KB/s.\n",
  time, size / time / 1024
exit 0
|
data/ext/MANIFEST
ADDED
data/ext/amatch.c
ADDED
@@ -0,0 +1,317 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
|
3
|
+
/* Class object of Amatch; assigned in Init_amatch(). */
static VALUE cAmatch;
|
4
|
+
|
5
|
+
/*
|
6
|
+
* Vector stuff
|
7
|
+
*/
|
8
|
+
|
9
|
+
/*
 * Integer buffer holding one row of the distance table. vector_new()
 * allocates len + 1 ints, so the valid indices of ptr are 0..len.
 */
typedef struct {
    int *ptr;   /* row data, len + 1 entries */
    int len;    /* highest valid index into ptr */
} vector;
|
13
|
+
|
14
|
+
static vector *
|
15
|
+
vector_new(len)
|
16
|
+
int len;
|
17
|
+
{
|
18
|
+
vector *v;
|
19
|
+
v = ALLOC(vector);
|
20
|
+
if (v == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector");
|
21
|
+
v->ptr = ALLOC_N(int, len + 1);
|
22
|
+
if (v->ptr == NULL) rb_raise(rb_eNoMemError, "couldn't malloc vector data");
|
23
|
+
v->len = len;
|
24
|
+
return v;
|
25
|
+
}
|
26
|
+
|
27
|
+
static void
|
28
|
+
vector_print(v)
|
29
|
+
vector *v;
|
30
|
+
{
|
31
|
+
int i;
|
32
|
+
for(i = 0; i < v->len; i++) printf("%d", v->ptr[i]);
|
33
|
+
puts("");
|
34
|
+
}
|
35
|
+
|
36
|
+
static void
|
37
|
+
vector_destroy(v)
|
38
|
+
vector *v;
|
39
|
+
{
|
40
|
+
xfree(v->ptr);
|
41
|
+
xfree(v);
|
42
|
+
}
|
43
|
+
|
44
|
+
static int
|
45
|
+
vector_minimum(v)
|
46
|
+
vector *v;
|
47
|
+
{
|
48
|
+
int i;
|
49
|
+
int min;
|
50
|
+
|
51
|
+
if (v->len == 0) return -1;
|
52
|
+
min = v->ptr[0];
|
53
|
+
for (i = 1; i <= v->len; i++) {
|
54
|
+
if (min > v->ptr[i]) min = v->ptr[i];
|
55
|
+
}
|
56
|
+
return min;
|
57
|
+
}
|
58
|
+
|
59
|
+
static int
|
60
|
+
vector_last(v)
|
61
|
+
vector *v;
|
62
|
+
{
|
63
|
+
return v->ptr[v->len];
|
64
|
+
}
|
65
|
+
|
66
|
+
/*
|
67
|
+
* Edit distances are calculated here
|
68
|
+
*/
|
69
|
+
|
70
|
+
/* Operation selectors passed to calculate_distance() as 'mode'. */
enum {
    MATCH    = 1,   /* distance to the whole string */
    MATCHR   = 2,   /* MATCH divided by the pattern length */
    SEARCH   = 3,   /* minimum over the final row of the table */
    SEARCHR  = 4,   /* SEARCH divided by the pattern length */
    COMPARE  = 5,   /* MATCH, negated if string is shorter than pattern */
    COMPARER = 6    /* COMPARE divided by the pattern length */
};
|
71
|
+
|
72
|
+
static int weight2int(weight, name)
|
73
|
+
VALUE weight;
|
74
|
+
char *name;
|
75
|
+
{
|
76
|
+
if (TYPE(weight) != T_FIXNUM) {
|
77
|
+
rb_raise(rb_eTypeError,
|
78
|
+
"value of weight %s has to be of type Fixnum (%s given)",
|
79
|
+
"subw", NIL_P(weight) ? "NilClass" : rb_class2name(CLASS_OF(weight)));
|
80
|
+
}
|
81
|
+
return FIX2INT(weight);
|
82
|
+
}
|
83
|
+
|
84
|
+
static VALUE
|
85
|
+
calculate_distance (self, string, mode)
|
86
|
+
VALUE self;
|
87
|
+
VALUE string;
|
88
|
+
char mode;
|
89
|
+
{
|
90
|
+
VALUE pattern, tmp;
|
91
|
+
static VALUE result;
|
92
|
+
int pattern_len, string_len;
|
93
|
+
char *pattern_ptr, *string_ptr;
|
94
|
+
vector *v[2];
|
95
|
+
int weight, sw, dw, iw, i, j, tmpi;
|
96
|
+
int c = 0, p = 1;
|
97
|
+
|
98
|
+
Check_Type(string, T_STRING);
|
99
|
+
string_ptr = RSTRING(string)->ptr;
|
100
|
+
string_len = RSTRING(string)->len;
|
101
|
+
|
102
|
+
pattern = rb_iv_get(self, "@pattern");
|
103
|
+
Check_Type(pattern, T_STRING);
|
104
|
+
pattern_ptr = RSTRING(pattern)->ptr;
|
105
|
+
pattern_len = RSTRING(pattern)->len;
|
106
|
+
|
107
|
+
sw = weight2int(rb_iv_get(self, "@subw"), "subw");
|
108
|
+
dw = weight2int(rb_iv_get(self, "@delw"), "delw");
|
109
|
+
iw = weight2int(rb_iv_get(self, "@insw"), "insw");
|
110
|
+
|
111
|
+
v[0] = vector_new(string_len);
|
112
|
+
switch (mode) {
|
113
|
+
case MATCH:
|
114
|
+
case MATCHR:
|
115
|
+
case COMPARE:
|
116
|
+
case COMPARER:
|
117
|
+
for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = i * iw;
|
118
|
+
break;
|
119
|
+
case SEARCH:
|
120
|
+
case SEARCHR:
|
121
|
+
for (i = 0; i <= v[0]->len; i++) v[0]->ptr[i] = 0;
|
122
|
+
break;
|
123
|
+
default:
|
124
|
+
rb_raise(rb_eFatal, "unknown mode in calculate_distance");
|
125
|
+
}
|
126
|
+
|
127
|
+
v[1] = vector_new(string_len);
|
128
|
+
for (i = 1; i <= pattern_len; i++) {
|
129
|
+
c = i % 2; /* current row */
|
130
|
+
p = (i - 1) % 2; /* previous row */
|
131
|
+
v[c]->ptr[0] = i * dw; /* first column */
|
132
|
+
for (j = 1; j <= string_len; j++) {
|
133
|
+
/* Bellman's principle of optimality: */
|
134
|
+
weight = v[p]->ptr[j - 1] +
|
135
|
+
(pattern_ptr[i - 1] == string_ptr[j - 1] ? 0 : sw);
|
136
|
+
if (weight > v[p]->ptr[j] + 1) weight = v[p]->ptr[j] + dw;
|
137
|
+
if (weight > v[c]->ptr[j - 1] + 1) weight = v[c]->ptr[j - 1] + iw;
|
138
|
+
v[c]->ptr[j] = weight;
|
139
|
+
}
|
140
|
+
}
|
141
|
+
switch (mode) {
|
142
|
+
case MATCH:
|
143
|
+
result = INT2FIX(vector_last(v[c]));
|
144
|
+
break;
|
145
|
+
case MATCHR:
|
146
|
+
result = rb_float_new((double) vector_last(v[c]) / pattern_len);
|
147
|
+
break;
|
148
|
+
case SEARCH:
|
149
|
+
tmpi = vector_minimum(v[c]);
|
150
|
+
result = tmpi < 0 ? INT2FIX(pattern_len) : INT2FIX(tmpi);
|
151
|
+
break;
|
152
|
+
case SEARCHR:
|
153
|
+
tmpi = vector_minimum(v[c]);
|
154
|
+
result = rb_float_new( tmpi < 0 ? 1.0 : (double) tmpi / pattern_len);
|
155
|
+
break;
|
156
|
+
case COMPARE:
|
157
|
+
result = INT2FIX((string_len < pattern_len ? -1 : 1) *
|
158
|
+
vector_last(v[c]));
|
159
|
+
break;
|
160
|
+
case COMPARER:
|
161
|
+
result = rb_float_new((double)
|
162
|
+
(string_len < pattern_len ? -1 : 1) *
|
163
|
+
vector_last(v[c]) / pattern_len);
|
164
|
+
break;
|
165
|
+
default:
|
166
|
+
rb_raise(rb_eFatal, "unknown mode in calculate_distance");
|
167
|
+
}
|
168
|
+
vector_destroy(v[0]);
|
169
|
+
vector_destroy(v[1]);
|
170
|
+
return result;
|
171
|
+
}
|
172
|
+
|
173
|
+
static VALUE
|
174
|
+
handle_strings(self, strings, mode)
|
175
|
+
VALUE self;
|
176
|
+
VALUE strings;
|
177
|
+
char mode;
|
178
|
+
{
|
179
|
+
if (TYPE(strings) == T_ARRAY) {
|
180
|
+
int i;
|
181
|
+
VALUE result = rb_ary_new2(RARRAY(strings)->len);
|
182
|
+
for (i = 0; i < RARRAY(strings)->len; i++) {
|
183
|
+
VALUE string = rb_ary_entry(strings, i);
|
184
|
+
if (TYPE(string) != T_STRING) {
|
185
|
+
rb_raise(rb_eTypeError,
|
186
|
+
"array has to contain only strings (%s given)",
|
187
|
+
NIL_P(string) ? "NilClass" :
|
188
|
+
rb_class2name(CLASS_OF(string)));
|
189
|
+
}
|
190
|
+
rb_ary_push(result, calculate_distance(self, string, mode));
|
191
|
+
}
|
192
|
+
return result;
|
193
|
+
} else if (TYPE(strings) == T_STRING) {
|
194
|
+
return calculate_distance(self, strings, mode);
|
195
|
+
} else {
|
196
|
+
rb_raise(rb_eTypeError,
|
197
|
+
"value of strings needs to be string or array (%s given)",
|
198
|
+
NIL_P(strings) ? "NilClass" : rb_class2name(CLASS_OF(strings)));
|
199
|
+
}
|
200
|
+
}
|
201
|
+
|
202
|
+
/*
|
203
|
+
* Ruby API
|
204
|
+
*/
|
205
|
+
|
206
|
+
static VALUE
|
207
|
+
rb_amatch_resetw(self)
|
208
|
+
VALUE self;
|
209
|
+
{
|
210
|
+
rb_iv_set(self, "@subw", INT2FIX(1));
|
211
|
+
rb_iv_set(self, "@delw", INT2FIX(1));
|
212
|
+
rb_iv_set(self, "@insw", INT2FIX(1));
|
213
|
+
|
214
|
+
return Qtrue;
|
215
|
+
}
|
216
|
+
|
217
|
+
static VALUE
|
218
|
+
rb_amatch_initialize(argc, argv, self)
|
219
|
+
int argc;
|
220
|
+
VALUE* argv;
|
221
|
+
VALUE self;
|
222
|
+
{
|
223
|
+
VALUE pattern;
|
224
|
+
|
225
|
+
rb_scan_args(argc, argv, "01", &pattern);
|
226
|
+
Check_Type(pattern, T_STRING);
|
227
|
+
rb_iv_set(self, "@pattern", pattern);
|
228
|
+
|
229
|
+
rb_amatch_resetw(self);
|
230
|
+
|
231
|
+
return self;
|
232
|
+
}
|
233
|
+
|
234
|
+
static VALUE
|
235
|
+
rb_amatch_pattern_is(self, pattern)
|
236
|
+
VALUE self;
|
237
|
+
VALUE pattern;
|
238
|
+
{
|
239
|
+
Check_Type(pattern, T_STRING);
|
240
|
+
rb_iv_set(self, "@pattern", pattern);
|
241
|
+
|
242
|
+
return pattern;
|
243
|
+
}
|
244
|
+
|
245
|
+
|
246
|
+
static VALUE
|
247
|
+
rb_amatch_match(self, strings)
|
248
|
+
VALUE self;
|
249
|
+
VALUE strings;
|
250
|
+
{
|
251
|
+
return handle_strings(self, strings, MATCH);
|
252
|
+
}
|
253
|
+
|
254
|
+
static VALUE
|
255
|
+
rb_amatch_matchr(self, strings)
|
256
|
+
VALUE self;
|
257
|
+
VALUE strings;
|
258
|
+
{
|
259
|
+
return handle_strings(self, strings, MATCHR);
|
260
|
+
}
|
261
|
+
|
262
|
+
static VALUE
|
263
|
+
rb_amatch_compare(self, strings)
|
264
|
+
VALUE self;
|
265
|
+
VALUE strings;
|
266
|
+
{
|
267
|
+
return handle_strings(self, strings, COMPARE);
|
268
|
+
}
|
269
|
+
|
270
|
+
static VALUE
|
271
|
+
rb_amatch_comparer(self, strings)
|
272
|
+
VALUE self;
|
273
|
+
VALUE strings;
|
274
|
+
{
|
275
|
+
return handle_strings(self, strings, COMPARER);
|
276
|
+
}
|
277
|
+
|
278
|
+
|
279
|
+
static VALUE
|
280
|
+
rb_amatch_search(self, strings)
|
281
|
+
VALUE self;
|
282
|
+
VALUE strings;
|
283
|
+
{
|
284
|
+
return handle_strings(self, strings, SEARCH);
|
285
|
+
}
|
286
|
+
|
287
|
+
static VALUE
|
288
|
+
rb_amatch_searchr(self, strings)
|
289
|
+
VALUE self;
|
290
|
+
VALUE strings;
|
291
|
+
{
|
292
|
+
return handle_strings(self, strings, SEARCHR);
|
293
|
+
}
|
294
|
+
|
295
|
+
void
|
296
|
+
Init_amatch()
|
297
|
+
{
|
298
|
+
cAmatch = rb_define_class("Amatch", rb_cObject);
|
299
|
+
rb_define_method(cAmatch, "initialize", rb_amatch_initialize, -1);
|
300
|
+
|
301
|
+
rb_define_attr(cAmatch, "debug", 1, 1);
|
302
|
+
rb_define_attr(cAmatch, "subw", 1, 1);
|
303
|
+
rb_define_attr(cAmatch, "delw", 1, 1);
|
304
|
+
rb_define_attr(cAmatch, "insw", 1, 1);
|
305
|
+
rb_define_method(cAmatch, "resetw", rb_amatch_resetw, 0);
|
306
|
+
|
307
|
+
rb_define_method(cAmatch, "pattern=", rb_amatch_pattern_is, 1);
|
308
|
+
rb_define_attr(cAmatch, "pattern", 1, 0);
|
309
|
+
|
310
|
+
rb_define_method(cAmatch, "match", rb_amatch_match, 1);
|
311
|
+
rb_define_method(cAmatch, "matchr", rb_amatch_matchr, 1);
|
312
|
+
rb_define_method(cAmatch, "compare", rb_amatch_compare, 1);
|
313
|
+
rb_define_method(cAmatch, "comparer", rb_amatch_comparer, 1);
|
314
|
+
rb_define_method(cAmatch, "search", rb_amatch_search, 1);
|
315
|
+
rb_define_method(cAmatch, "searchr", rb_amatch_searchr, 1);
|
316
|
+
}
|
317
|
+
/* vim: set cin sw=4 ts=4: */
|