jwilkins-spamsum 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +24 -0
- data/Rakefile +3 -0
- data/ext/edit_dist.c +269 -0
- data/ext/extconf.rb +6 -0
- data/ext/spamsum.c +679 -0
- data/ext/spamsum.i +16 -0
- data/ext/spamsum_wrap.c +2405 -0
- data/spamsum.gemspec +29 -0
- data/test.rb +33 -0
- metadata +63 -0
data/README
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
SpamSum v0.1
|
2
|
+
==============
|
3
|
+
SpamSum is a distance-based hash. This is the opposite of a cryptographic
|
4
|
+
hash in that two slightly different inputs will have similar (or even
|
5
|
+
identical) outputs.
|
6
|
+
|
7
|
+
The original spamsum code was released under the Perl Artistic License
|
8
|
+
and is Copyright Andrew Tridgell <tridge@samba.org> 2002
|
9
|
+
It is available here:
|
10
|
+
http://www.samba.org/junkcode/#spamsum
|
11
|
+
or here:
|
12
|
+
http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum
|
13
|
+
|
14
|
+
More information on the algorithm itself is here:
|
15
|
+
http://samba.org/ftp/unpacked/junkcode/spamsum/README
|
16
|
+
|
17
|
+
Ruby wrapper is released under GPL v2
|
18
|
+
Copyright 2009 Jonathan Wilkins <jwilkins@bitland.net>
|
19
|
+
|
20
|
+
Building
|
21
|
+
--------
|
22
|
+
$ swig -ruby spamsum.i
|
23
|
+
$ ruby extconf.rb
|
24
|
+
$ make
|
data/Rakefile
ADDED
data/ext/edit_dist.c
ADDED
@@ -0,0 +1,269 @@
|
|
1
|
+
/*
|
2
|
+
This edit distance code is taken from trn3.6. A few minor
|
3
|
+
modifications have been made by Andrew Tridgell <tridge@samba.org>
|
4
|
+
for use in spamsum.
|
5
|
+
*/
|
6
|
+
|
7
|
+
|
8
|
+
/***************************************************************************/
|
9
|
+
|
10
|
+
|
11
|
+
/* The authors make no claims as to the fitness or correctness of this software
|
12
|
+
* for any use whatsoever, and it is provided as is. Any use of this software
|
13
|
+
* is at the user's own risk.
|
14
|
+
*/
|
15
|
+
|
16
|
+
#include <stdio.h>
|
17
|
+
#include <unistd.h>
|
18
|
+
#include <stdlib.h>
|
19
|
+
|
20
|
+
/* edit_dist -- returns the minimum edit distance between two strings
|
21
|
+
|
22
|
+
Program by: Mark Maimone CMU Computer Science 13 Nov 89
|
23
|
+
Last Modified: 28 Jan 90
|
24
|
+
|
25
|
+
If the input strings have length n and m, the algorithm runs in time
|
26
|
+
O(nm) and space O(min(m,n)).
|
27
|
+
|
28
|
+
HISTORY
|
29
|
+
13 Nov 89 (mwm) Created edit_dist() and set_costs().
|
30
|
+
|
31
|
+
28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
|
32
|
+
computations will work even when THRESHOLD is not a multiple of
|
33
|
+
sizeof(int).
|
34
|
+
|
35
|
+
17 May 93 (mwm) Improved performance when used with trn's newsgroup
|
36
|
+
processing; assume all costs are 1, and you can terminate when a
|
37
|
+
threshold is exceeded.
|
38
|
+
*/
|
39
|
+
|
40
|
+
#define MIN_DIST 100

#define TRN_SPEEDUP		/* Use a less-general version of the
				   routine, one that's better for trn.
				   Insert and delete cost 1, a change
				   costs 3 and a swap costs 5, and it's
				   okay to terminate if the edit distance
				   is known to exceed MIN_DIST */

#define THRESHOLD 4000		/* worry about allocating more memory only
				   when this # of bytes is exceeded */
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))

#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))


static int insert_cost = 1;
static int delete_cost = 1;
#ifndef TRN_SPEEDUP
static int change_cost = 1;
static int swap_cost = 1;
#endif


/* ed_min2 -- smaller of two ints */

static int
ed_min2(int a, int b)
{
    return (a < b) ? a : b;
}

/* ed_min3 -- smallest of three ints */

static int
ed_min3(int a, int b, int c)
{
    return ed_min2(ed_min2(a, b), c);
}


/* edit_distn -- returns the edit distance between two strings, or -1 on
   failure

   from, to          the strings to compare; they need not be
                     NUL-terminated, only the first from_len / to_len
                     bytes are examined.  A NULL pointer is treated as an
                     empty string.
   from_len, to_len  number of bytes in each string; a length <= 0 is
                     treated as an empty string.

   Returns the accumulated cost of turning `from' into `to'.  When
   TRN_SPEEDUP is defined the scan stops as soon as every cell of a
   completed row exceeds MIN_DIST; the value returned is then the last
   cell of that row rather than the exact distance.  Returns -1 if the
   working row cannot be allocated.

   The function keeps no state between calls: the small scratch buffer
   lives on the stack and larger ones come from malloc(). */

int
edit_distn(const char *from, int from_len, const char *to, int to_len)
{
#ifdef TRN_SPEEDUP
    const int ins = 1;		/* fixed costs used by the trn variant */
    const int del = 1;
    const int ch = 3;
    const int swp = 5;
    int low;			/* smallest cell in the current row */
#else
    int ins = insert_cost;	/* local copies of edit costs */
    int del = delete_cost;
    const int ch = change_cost;
    const int swp = swap_cost;
#endif
    int store[THRESHOLD / sizeof (int)];
				/* a small amount of stack storage, to be
				   used when the input strings are small
				   enough */
    int *buffer;		/* pointer to storage for the d.p. rows */
    size_t radix;		/* radix for modular indexing */
    size_t pos;			/* ring-buffer slot of the current cell */
    int row, col;		/* dynamic programming counters */
    int cell, diag, up, left, back2;
    int result;

    /* Handle trivial cases when one string is empty */

    if (from == NULL || from_len <= 0) {
	if (to == NULL || to_len <= 0)
	    return 0;
	return to_len * insert_cost;
    }
    if (to == NULL || to_len <= 0)
	return from_len * delete_cost;

    /* Make   from   short enough to fit in the small storage, if it's at
       all possible */

    if (from_len > to_len && from_len > STRLENTHRESHOLD) {
	const char *tmp_str = from;
	int tmp_len = from_len;

	from = to;
	to = tmp_str;
	from_len = to_len;
	to_len = tmp_len;
#ifndef TRN_SPEEDUP
	{
	    int tmp_cost = ins;

	    ins = del;
	    del = tmp_cost;
	}
#endif
    }

    /* The radix must be derived from the FINAL from_len (i.e. after the
       swap above); every neighbour offset below assumes
       radix == 2 * from_len + 3.  Refuse lengths whose buffer size
       would not fit in a size_t. */

    if ((size_t) from_len > ((size_t) -1 / sizeof (int) - 3) / 2)
	return -1;
    radix = 2 * (size_t) from_len + 3;

    /* Allocate the array storage (from the heap if necessary) */

    if (from_len <= STRLENTHRESHOLD)
	buffer = store;
    else {
	buffer = malloc(radix * sizeof *buffer);
	if (buffer == NULL)
	    return -1;
    }

    /* Here's where the fun begins.  We will find the minimum edit distance
       using dynamic programming.  We only need to store two rows of the
       matrix at a time, since we always progress down the matrix.  For
       example, given the strings "one" and "two", and insert, delete and
       change costs equal to 1:

	       _  o  n  e
	    _  0  1  2  3
	    t  1  1  2  3
	    w  2  2  2  3
	    o  3  2  3  3

       The dynamic programming recursion is defined as follows:

	    ar(x,0) := x * insert_cost
	    ar(0,y) := y * delete_cost
	    ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
			   a(x - 1, y) + insert_cost,
			   a(x, y - 1) + delete_cost,
			   a(x - 2, y - 2) + (from[x] == to[y-1] &&
					      from[x-1] == to[y] ? swap_cost :
					      infinity))

       Since this only looks at most two rows and three columns back, we
       need only store the values for the two preceding rows.  In this
       implementation, we do not explicitly store the zero column, so only
       2 * from_len + 2 words are needed.  However, in the implementation
       of the swap_cost check, the current matrix value is used as a
       buffer; we can't overwrite the earlier value until the swap_cost
       check has been performed.  So we use 2 * from_len + 3 elements in
       the buffer.

       With that radix, relative to the slot `pos' being written:
	    pos + from_len + 2   is the cell up and to the left,
	    pos + from_len + 3   is the cell directly above,
	    pos + radix - 1      is the cell to the left,
	    pos + 1              is the cell two up and two to the left
       (all taken modulo radix). */

    pos = 0;

#ifdef DEBUG_EDITDIST
    printf(" ");
    for (col = 0; col < from_len; col++)
	printf(" %c ", from[col]);
    printf("\n ");

    for (col = 0; col <= from_len; col++)
	printf("%2d ", col * del);
#endif

    /* Row 0 is handled implicitly; its value at a given column is col*del.
       The loop below computes the values for Row 1.  At this point we know
       the strings are nonempty.  We also don't need to consider swap costs
       in row 1.

       COMMENT:  the indices   row and col   below point into the STRING,
       so the corresponding MATRIX indices are   row+1 and col+1. */

    buffer[pos++] = ed_min2(ins + del, (from[0] == to[0]) ? 0 : ch);
#ifdef TRN_SPEEDUP
    low = buffer[0];
#endif

#ifdef DEBUG_EDITDIST
    printf("\n %c %2d %2d ", to[0], ins, buffer[pos - 1]);
#endif

    for (col = 1; col < from_len; col++) {
	buffer[pos] = ed_min3(col * del + ((from[col] == to[0]) ? 0 : ch),
			      (col + 1) * del + ins,
			      buffer[pos - 1] + del);
#ifdef TRN_SPEEDUP
	if (buffer[pos] < low)
	    low = buffer[pos];
#endif
	pos++;

#ifdef DEBUG_EDITDIST
	printf("%2d ", buffer[pos - 1]);
#endif
    }

#ifdef DEBUG_EDITDIST
    /* Only label a second row when there is one; to[1] lies outside the
       caller's data when to_len == 1. */
    if (to_len > 1)
	printf("\n %c %2d ", to[1], 2 * ins);
    else
	printf("\n");
#endif

    /* Now handle the rest of the matrix */

    for (row = 1; row < to_len; row++) {
	for (col = 0; col < from_len; col++) {
	    up = buffer[(pos + (size_t) from_len + 3) % radix];
	    if (col == 0) {
		/* the zero column is implicit: ar(x,0) = x * ins */
		diag = row * ins;
		left = (row + 1) * ins;
	    } else {
		diag = buffer[(pos + (size_t) from_len + 2) % radix];
		left = buffer[(pos + radix - 1) % radix];
	    }

	    cell = ed_min3(diag + ((from[col] == to[row]) ? 0 : ch),
			   up + ins,
			   left + del);

	    /* transposition of two adjacent characters */
	    if (col > 0 && from[col] == to[row - 1] &&
		from[col - 1] == to[row]) {
		if (row == 1)
		    back2 = (col - 1) * del;	/* implicit zero row */
		else if (col == 1)
		    back2 = (row - 1) * ins;	/* implicit zero column */
		else
		    back2 = buffer[(pos + 1) % radix];
		cell = ed_min2(cell, back2 + swp);
	    }

	    buffer[pos] = cell;

#ifdef DEBUG_EDITDIST
	    printf("%2d ", buffer[pos]);
#endif
#ifdef TRN_SPEEDUP
	    if (cell < low || col == 0)
		low = cell;
#endif

	    pos = (pos + 1) % radix;
	}
#ifdef DEBUG_EDITDIST
	if (row < to_len - 1)
	    printf("\n %c %2d ", to[row + 1], (row + 2) * ins);
	else
	    printf("\n");
#endif
#ifdef TRN_SPEEDUP
	/* every cell of this row already exceeds the cut-off, so the final
	   distance must too */
	if (low > MIN_DIST)
	    break;
#endif
    }

    /* the most recently written cell holds the answer */
    result = buffer[(pos + radix - 1) % radix];
    if (buffer != store)
	free(buffer);
    return result;
}				/* edit_distn */
|
269
|
+
|