jwilkins-spamsum 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +24 -0
- data/Rakefile +3 -0
- data/ext/edit_dist.c +269 -0
- data/ext/extconf.rb +6 -0
- data/ext/spamsum.c +679 -0
- data/ext/spamsum.i +16 -0
- data/ext/spamsum_wrap.c +2405 -0
- data/spamsum.gemspec +29 -0
- data/test.rb +33 -0
- metadata +63 -0
data/README
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
SpamSum v0.1
|
2
|
+
==============
|
3
|
+
SpamSum is a distance based hash. This is the opposite of a cryptographic
|
4
|
+
hash in that two slightly different inputs will have similar (or even
|
5
|
+
identical outputs).
|
6
|
+
|
7
|
+
The original spamsum code was released under the Perl Artistic License
|
8
|
+
and is Copyright Andrew Tridgell <tridge@samba.org> 2002
|
9
|
+
It is available here:
|
10
|
+
http://www.samba.org/junkcode/#spamsum
|
11
|
+
or here:
|
12
|
+
http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum
|
13
|
+
|
14
|
+
More information on the algorithm itself is here:
|
15
|
+
http://samba.org/ftp/unpacked/junkcode/spamsum/README
|
16
|
+
|
17
|
+
Ruby wrapper is released under GPL v2
|
18
|
+
Copyright 2009 Jonathan Wilkins <jwilkins@bitland.net>
|
19
|
+
|
20
|
+
Building
|
21
|
+
--------
|
22
|
+
$ swig -ruby spamsum.i
|
23
|
+
$ ruby extconf.rb
|
24
|
+
$ make
|
data/Rakefile
ADDED
data/ext/edit_dist.c
ADDED
@@ -0,0 +1,269 @@
|
|
1
|
+
/*
|
2
|
+
This edit distance code is taken from trn3.6. A few minor
|
3
|
+
modifications have been made by Andrew Tridgell <tridge@samba.org>
|
4
|
+
for use in spamsum.
|
5
|
+
*/
|
6
|
+
|
7
|
+
|
8
|
+
/***************************************************************************/
|
9
|
+
|
10
|
+
|
11
|
+
/* The authors make no claims as to the fitness or correctness of this software
|
12
|
+
* for any use whatsoever, and it is provided as is. Any use of this software
|
13
|
+
* is at the user's own risk.
|
14
|
+
*/
|
15
|
+
|
16
|
+
#include <stdio.h>
|
17
|
+
#include <unistd.h>
|
18
|
+
#include <stdlib.h>
|
19
|
+
|
20
|
+
/* edit_dist -- returns the minimum edit distance between two strings
|
21
|
+
|
22
|
+
Program by: Mark Maimone CMU Computer Science 13 Nov 89
|
23
|
+
Last Modified: 28 Jan 90
|
24
|
+
|
25
|
+
If the input strings have length n and m, the algorithm runs in time
|
26
|
+
O(nm) and space O(min(m,n)).
|
27
|
+
|
28
|
+
HISTORY
|
29
|
+
13 Nov 89 (mwm) Created edit_dist() and set_costs().
|
30
|
+
|
31
|
+
28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
|
32
|
+
computations will work even when THRESHOLD is not a multiple of
|
33
|
+
sizeof(int).
|
34
|
+
|
35
|
+
17 May 93 (mwm) Improved performance when used with trn's newsgroup
|
36
|
+
processing; assume all costs are 1, and you can terminate when a
|
37
|
+
threshold is exceeded.
|
38
|
+
*/
|
39
|
+
|
40
|
+
#define MIN_DIST 100
|
41
|
+
|
42
|
+
#define TRN_SPEEDUP /* Use a less-general version of the
|
43
|
+
routine, one that's better for trn.
|
44
|
+
All change costs are 1, and it's okay
|
45
|
+
to terminate if the edit distance is
|
46
|
+
known to exceed MIN_DIST */
|
47
|
+
|
48
|
+
#define THRESHOLD 4000 /* worry about allocating more memory only
|
49
|
+
when this # of bytes is exceeded */
|
50
|
+
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))
|
51
|
+
|
52
|
+
#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
|
53
|
+
|
54
|
+
#define swap_int(x,y) (_iswap = (x), (x) = (y), (y) = _iswap)
|
55
|
+
#define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
|
56
|
+
#define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
|
57
|
+
#define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
|
58
|
+
|
59
|
+
|
60
|
+
static int insert_cost = 1;
|
61
|
+
static int delete_cost = 1;
|
62
|
+
#ifndef TRN_SPEEDUP
|
63
|
+
static int change_cost = 1;
|
64
|
+
static int swap_cost = 1;
|
65
|
+
#endif
|
66
|
+
|
67
|
+
static int _iswap; /* swap_int temp variable */
|
68
|
+
static char *_cswap; /* swap_char temp variable */
|
69
|
+
static int _mx, _my, _mz; /* min2, min3 temp variables */
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
/* edit_distn -- returns the edit distance between two strings, or -1 on
|
74
|
+
failure */
|
75
|
+
|
76
|
+
int
|
77
|
+
edit_distn(from, from_len, to, to_len)
|
78
|
+
char *from, *to;
|
79
|
+
register int from_len, to_len;
|
80
|
+
{
|
81
|
+
#ifndef TRN_SPEEDUP
|
82
|
+
register int ins, del, ch; /* local copies of edit costs */
|
83
|
+
#endif
|
84
|
+
register int row, col, index; /* dynamic programming counters */
|
85
|
+
register int radix; /* radix for modular indexing */
|
86
|
+
#ifdef TRN_SPEEDUP
|
87
|
+
register int low;
|
88
|
+
#endif
|
89
|
+
int *buffer; /* pointer to storage for one row
|
90
|
+
of the d.p. array */
|
91
|
+
static int store[THRESHOLD / sizeof (int)];
|
92
|
+
/* a small amount of static
|
93
|
+
storage, to be used when the
|
94
|
+
input strings are small enough */
|
95
|
+
|
96
|
+
/* Handle trivial cases when one string is empty */
|
97
|
+
|
98
|
+
if (from == NULL || !from_len)
|
99
|
+
if (to == NULL || !to_len)
|
100
|
+
return 0;
|
101
|
+
else
|
102
|
+
return to_len * insert_cost;
|
103
|
+
else if (to == NULL || !to_len)
|
104
|
+
return from_len * delete_cost;
|
105
|
+
|
106
|
+
/* Initialize registers */
|
107
|
+
|
108
|
+
radix = 2 * from_len + 3;
|
109
|
+
#ifdef TRN_SPEEDUP
|
110
|
+
#define ins 1
|
111
|
+
#define del 1
|
112
|
+
#define ch 3
|
113
|
+
#define swap_cost 5
|
114
|
+
#else
|
115
|
+
ins = insert_cost;
|
116
|
+
del = delete_cost;
|
117
|
+
ch = change_cost;
|
118
|
+
#endif
|
119
|
+
|
120
|
+
/* Make from short enough to fit in the static storage, if it's at all
|
121
|
+
possible */
|
122
|
+
|
123
|
+
if (from_len > to_len && from_len > STRLENTHRESHOLD) {
|
124
|
+
swap_int(from_len, to_len);
|
125
|
+
swap_char(from, to);
|
126
|
+
#ifndef TRN_SPEEDUP
|
127
|
+
swap_int(ins, del);
|
128
|
+
#endif
|
129
|
+
} /* if from_len > to_len */
|
130
|
+
|
131
|
+
/* Allocate the array storage (from the heap if necessary) */
|
132
|
+
|
133
|
+
if (from_len <= STRLENTHRESHOLD)
|
134
|
+
buffer = store;
|
135
|
+
else
|
136
|
+
buffer = (int *) malloc(radix * sizeof (int));
|
137
|
+
|
138
|
+
/* Here's where the fun begins. We will find the minimum edit distance
|
139
|
+
using dynamic programming. We only need to store two rows of the matrix
|
140
|
+
at a time, since we always progress down the matrix. For example,
|
141
|
+
given the strings "one" and "two", and insert, delete and change costs
|
142
|
+
equal to 1:
|
143
|
+
|
144
|
+
_ o n e
|
145
|
+
_ 0 1 2 3
|
146
|
+
t 1 1 2 3
|
147
|
+
w 2 2 2 3
|
148
|
+
o 3 2 3 3
|
149
|
+
|
150
|
+
The dynamic programming recursion is defined as follows:
|
151
|
+
|
152
|
+
ar(x,0) := x * insert_cost
|
153
|
+
ar(0,y) := y * delete_cost
|
154
|
+
ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
|
155
|
+
a(x - 1, y) + insert_cost,
|
156
|
+
a(x, y - 1) + delete_cost,
|
157
|
+
a(x - 2, y - 2) + (from[x] == to[y-1] &&
|
158
|
+
from[x-1] == to[y] ? swap_cost :
|
159
|
+
infinity))
|
160
|
+
|
161
|
+
Since this only looks at most two rows and three columns back, we need
|
162
|
+
only store the values for the two preceeding rows. In this
|
163
|
+
implementation, we do not explicitly store the zero column, so only 2 *
|
164
|
+
from_len + 2 words are needed. However, in the implementation of the
|
165
|
+
swap_cost check, the current matrix value is used as a buffer; we
|
166
|
+
can't overwrite the earlier value until the swap_cost check has
|
167
|
+
been performed. So we use 2 * from_len + 3 elements in the buffer.
|
168
|
+
*/
|
169
|
+
|
170
|
+
#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
|
171
|
+
buffer[mod(index)]))
|
172
|
+
#define NW(x,y) ar(x, y, index + from_len + 2)
|
173
|
+
#define N(x,y) ar(x, y, index + from_len + 3)
|
174
|
+
#define W(x,y) ar(x, y, index + radix - 1)
|
175
|
+
#define NNWW(x,y) ar(x, y, index + 1)
|
176
|
+
#define mod(x) ((x) % radix)
|
177
|
+
|
178
|
+
index = 0;
|
179
|
+
|
180
|
+
#ifdef DEBUG_EDITDIST
|
181
|
+
printf(" ");
|
182
|
+
for (col = 0; col < from_len; col++)
|
183
|
+
printf(" %c ", from[col]);
|
184
|
+
printf("\n ");
|
185
|
+
|
186
|
+
for (col = 0; col <= from_len; col++)
|
187
|
+
printf("%2d ", col * del);
|
188
|
+
#endif
|
189
|
+
|
190
|
+
/* Row 0 is handled implicitly; its value at a given column is col*del.
|
191
|
+
The loop below computes the values for Row 1. At this point we know the
|
192
|
+
strings are nonempty. We also don't need to consider swap costs in row
|
193
|
+
1.
|
194
|
+
|
195
|
+
COMMENT: the indicies row and col below point into the STRING, so
|
196
|
+
the corresponding MATRIX indicies are row+1 and col+1.
|
197
|
+
*/
|
198
|
+
|
199
|
+
buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
|
200
|
+
#ifdef TRN_SPEEDUP
|
201
|
+
low = buffer[mod(index + radix - 1)];
|
202
|
+
#endif
|
203
|
+
|
204
|
+
#ifdef DEBUG_EDITDIST
|
205
|
+
printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
|
206
|
+
#endif
|
207
|
+
|
208
|
+
for (col = 1; col < from_len; col++) {
|
209
|
+
buffer[index] = min3(
|
210
|
+
col * del + ((from[col] == to[0]) ? 0 : ch),
|
211
|
+
(col + 1) * del + ins,
|
212
|
+
buffer[index - 1] + del);
|
213
|
+
#ifdef TRN_SPEEDUP
|
214
|
+
if (buffer[index] < low)
|
215
|
+
low = buffer[index];
|
216
|
+
#endif
|
217
|
+
index++;
|
218
|
+
|
219
|
+
#ifdef DEBUG_EDITDIST
|
220
|
+
printf("%2d ", buffer[index - 1]);
|
221
|
+
#endif
|
222
|
+
|
223
|
+
} /* for col = 1 */
|
224
|
+
|
225
|
+
#ifdef DEBUG_EDITDIST
|
226
|
+
printf("\n %c %2d ", to[1], 2 * ins);
|
227
|
+
#endif
|
228
|
+
|
229
|
+
/* Now handle the rest of the matrix */
|
230
|
+
|
231
|
+
for (row = 1; row < to_len; row++) {
|
232
|
+
for (col = 0; col < from_len; col++) {
|
233
|
+
buffer[index] = min3(
|
234
|
+
NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
|
235
|
+
N(row, col + 1) + ins,
|
236
|
+
W(row + 1, col) + del);
|
237
|
+
if (from[col] == to[row - 1] && col > 0 &&
|
238
|
+
from[col - 1] == to[row])
|
239
|
+
buffer[index] = min2(buffer[index],
|
240
|
+
NNWW(row - 1, col - 1) + swap_cost);
|
241
|
+
|
242
|
+
#ifdef DEBUG_EDITDIST
|
243
|
+
printf("%2d ", buffer[index]);
|
244
|
+
#endif
|
245
|
+
#ifdef TRN_SPEEDUP
|
246
|
+
if (buffer[index] < low || col == 0)
|
247
|
+
low = buffer[index];
|
248
|
+
#endif
|
249
|
+
|
250
|
+
index = mod(index + 1);
|
251
|
+
} /* for col = 1 */
|
252
|
+
#ifdef DEBUG_EDITDIST
|
253
|
+
if (row < to_len - 1)
|
254
|
+
printf("\n %c %2d ", to[row+1], (row + 2) * ins);
|
255
|
+
else
|
256
|
+
printf("\n");
|
257
|
+
#endif
|
258
|
+
#ifdef TRN_SPEEDUP
|
259
|
+
if (low > MIN_DIST)
|
260
|
+
break;
|
261
|
+
#endif
|
262
|
+
} /* for row = 1 */
|
263
|
+
|
264
|
+
row = buffer[mod(index + radix - 1)];
|
265
|
+
if (buffer != store)
|
266
|
+
free((char *) buffer);
|
267
|
+
return row;
|
268
|
+
} /* edit_distn */
|
269
|
+
|