jwilkins-spamsum 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,24 @@
1
+ SpamSum v0.1
2
+ ==============
3
+ SpamSum is a distance based hash. This is the opposite of a cryptographic
4
+ hash in that two slightly different inputs will have similar (or even
5
+ identical outputs).
6
+
7
+ The original spamsum code was released under the Perl Artistic License
8
+ and is Copyright Andrew Tridgell <tridge@samba.org> 2002
9
+ It is available here:
10
+ http://www.samba.org/junkcode/#spamsum
11
+ or here:
12
+ http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum
13
+
14
+ More information on the algorithm itself is here:
15
+ http://samba.org/ftp/unpacked/junkcode/spamsum/README
16
+
17
+ Ruby wrapper is released under GPL v2
18
+ Copyright 2009 Jonathan Wilkins <jwilkins@bitland.net>
19
+
20
+ Building
21
+ --------
22
+ $ swig -ruby spamsum.i
23
+ $ ruby extconf.rb
24
+ $ make
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
+ task :default do
2
+ `cd ext && swig -ruby spamsum.i && ruby extconf.rb && make`
3
+ end
data/ext/edit_dist.c ADDED
@@ -0,0 +1,269 @@
1
+ /*
2
+ This edit distance code is taken from trn3.6. A few minor
3
+ modifications have been made by Andrew Tridgell <tridge@samba.org>
4
+ for use in spamsum.
5
+ */
6
+
7
+
8
+ /***************************************************************************/
9
+
10
+
11
+ /* The authors make no claims as to the fitness or correctness of this software
12
+ * for any use whatsoever, and it is provided as is. Any use of this software
13
+ * is at the user's own risk.
14
+ */
15
+
16
+ #include <stdio.h>
17
+ #include <unistd.h>
18
+ #include <stdlib.h>
19
+
20
+ /* edit_dist -- returns the minimum edit distance between two strings
21
+
22
+ Program by: Mark Maimone CMU Computer Science 13 Nov 89
23
+ Last Modified: 28 Jan 90
24
+
25
+ If the input strings have length n and m, the algorithm runs in time
26
+ O(nm) and space O(min(m,n)).
27
+
28
+ HISTORY
29
+ 13 Nov 89 (mwm) Created edit_dist() and set_costs().
30
+
31
+ 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
32
+ computations will work even when THRESHOLD is not a multiple of
33
+ sizeof(int).
34
+
35
+ 17 May 93 (mwm) Improved performance when used with trn's newsgroup
36
+ processing; assume all costs are 1, and you can terminate when a
37
+ threshold is exceeded.
38
+ */
39
+
40
+ #define MIN_DIST 100
41
+
42
+ #define TRN_SPEEDUP /* Use a less-general version of the
43
+ routine, one that's better for trn.
44
+ All change costs are 1, and it's okay
45
+ to terminate if the edit distance is
46
+ known to exceed MIN_DIST */
47
+
48
+ #define THRESHOLD 4000 /* worry about allocating more memory only
49
+ when this # of bytes is exceeded */
50
+ #define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))
51
+
52
+ #define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))
53
+
54
+ #define swap_int(x,y) (_iswap = (x), (x) = (y), (y) = _iswap)
55
+ #define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
56
+ #define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
57
+ #define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))
58
+
59
+
60
+ static int insert_cost = 1;
61
+ static int delete_cost = 1;
62
+ #ifndef TRN_SPEEDUP
63
+ static int change_cost = 1;
64
+ static int swap_cost = 1;
65
+ #endif
66
+
67
+ static int _iswap; /* swap_int temp variable */
68
+ static char *_cswap; /* swap_char temp variable */
69
+ static int _mx, _my, _mz; /* min2, min3 temp variables */
70
+
71
+
72
+
73
+ /* edit_distn -- returns the edit distance between two strings, or -1 on
74
+ failure */
75
+
76
+ int
77
+ edit_distn(from, from_len, to, to_len)
78
+ char *from, *to;
79
+ register int from_len, to_len;
80
+ {
81
+ #ifndef TRN_SPEEDUP
82
+ register int ins, del, ch; /* local copies of edit costs */
83
+ #endif
84
+ register int row, col, index; /* dynamic programming counters */
85
+ register int radix; /* radix for modular indexing */
86
+ #ifdef TRN_SPEEDUP
87
+ register int low;
88
+ #endif
89
+ int *buffer; /* pointer to storage for one row
90
+ of the d.p. array */
91
+ static int store[THRESHOLD / sizeof (int)];
92
+ /* a small amount of static
93
+ storage, to be used when the
94
+ input strings are small enough */
95
+
96
+ /* Handle trivial cases when one string is empty */
97
+
98
+ if (from == NULL || !from_len)
99
+ if (to == NULL || !to_len)
100
+ return 0;
101
+ else
102
+ return to_len * insert_cost;
103
+ else if (to == NULL || !to_len)
104
+ return from_len * delete_cost;
105
+
106
+ /* Initialize registers */
107
+
108
+ radix = 2 * from_len + 3;
109
+ #ifdef TRN_SPEEDUP
110
+ #define ins 1
111
+ #define del 1
112
+ #define ch 3
113
+ #define swap_cost 5
114
+ #else
115
+ ins = insert_cost;
116
+ del = delete_cost;
117
+ ch = change_cost;
118
+ #endif
119
+
120
+ /* Make from short enough to fit in the static storage, if it's at all
121
+ possible */
122
+
123
+ if (from_len > to_len && from_len > STRLENTHRESHOLD) {
124
+ swap_int(from_len, to_len);
125
+ swap_char(from, to);
126
+ #ifndef TRN_SPEEDUP
127
+ swap_int(ins, del);
128
+ #endif
129
+ } /* if from_len > to_len */
130
+
131
+ /* Allocate the array storage (from the heap if necessary) */
132
+
133
+ if (from_len <= STRLENTHRESHOLD)
134
+ buffer = store;
135
+ else
136
+ buffer = (int *) malloc(radix * sizeof (int));
137
+
138
+ /* Here's where the fun begins. We will find the minimum edit distance
139
+ using dynamic programming. We only need to store two rows of the matrix
140
+ at a time, since we always progress down the matrix. For example,
141
+ given the strings "one" and "two", and insert, delete and change costs
142
+ equal to 1:
143
+
144
+ _ o n e
145
+ _ 0 1 2 3
146
+ t 1 1 2 3
147
+ w 2 2 2 3
148
+ o 3 2 3 3
149
+
150
+ The dynamic programming recursion is defined as follows:
151
+
152
+ ar(x,0) := x * insert_cost
153
+ ar(0,y) := y * delete_cost
154
+ ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
155
+ a(x - 1, y) + insert_cost,
156
+ a(x, y - 1) + delete_cost,
157
+ a(x - 2, y - 2) + (from[x] == to[y-1] &&
158
+ from[x-1] == to[y] ? swap_cost :
159
+ infinity))
160
+
161
+ Since this only looks at most two rows and three columns back, we need
162
+ only store the values for the two preceeding rows. In this
163
+ implementation, we do not explicitly store the zero column, so only 2 *
164
+ from_len + 2 words are needed. However, in the implementation of the
165
+ swap_cost check, the current matrix value is used as a buffer; we
166
+ can't overwrite the earlier value until the swap_cost check has
167
+ been performed. So we use 2 * from_len + 3 elements in the buffer.
168
+ */
169
+
170
+ #define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
171
+ buffer[mod(index)]))
172
+ #define NW(x,y) ar(x, y, index + from_len + 2)
173
+ #define N(x,y) ar(x, y, index + from_len + 3)
174
+ #define W(x,y) ar(x, y, index + radix - 1)
175
+ #define NNWW(x,y) ar(x, y, index + 1)
176
+ #define mod(x) ((x) % radix)
177
+
178
+ index = 0;
179
+
180
+ #ifdef DEBUG_EDITDIST
181
+ printf(" ");
182
+ for (col = 0; col < from_len; col++)
183
+ printf(" %c ", from[col]);
184
+ printf("\n ");
185
+
186
+ for (col = 0; col <= from_len; col++)
187
+ printf("%2d ", col * del);
188
+ #endif
189
+
190
+ /* Row 0 is handled implicitly; its value at a given column is col*del.
191
+ The loop below computes the values for Row 1. At this point we know the
192
+ strings are nonempty. We also don't need to consider swap costs in row
193
+ 1.
194
+
195
+ COMMENT: the indicies row and col below point into the STRING, so
196
+ the corresponding MATRIX indicies are row+1 and col+1.
197
+ */
198
+
199
+ buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
200
+ #ifdef TRN_SPEEDUP
201
+ low = buffer[mod(index + radix - 1)];
202
+ #endif
203
+
204
+ #ifdef DEBUG_EDITDIST
205
+ printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
206
+ #endif
207
+
208
+ for (col = 1; col < from_len; col++) {
209
+ buffer[index] = min3(
210
+ col * del + ((from[col] == to[0]) ? 0 : ch),
211
+ (col + 1) * del + ins,
212
+ buffer[index - 1] + del);
213
+ #ifdef TRN_SPEEDUP
214
+ if (buffer[index] < low)
215
+ low = buffer[index];
216
+ #endif
217
+ index++;
218
+
219
+ #ifdef DEBUG_EDITDIST
220
+ printf("%2d ", buffer[index - 1]);
221
+ #endif
222
+
223
+ } /* for col = 1 */
224
+
225
+ #ifdef DEBUG_EDITDIST
226
+ printf("\n %c %2d ", to[1], 2 * ins);
227
+ #endif
228
+
229
+ /* Now handle the rest of the matrix */
230
+
231
+ for (row = 1; row < to_len; row++) {
232
+ for (col = 0; col < from_len; col++) {
233
+ buffer[index] = min3(
234
+ NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
235
+ N(row, col + 1) + ins,
236
+ W(row + 1, col) + del);
237
+ if (from[col] == to[row - 1] && col > 0 &&
238
+ from[col - 1] == to[row])
239
+ buffer[index] = min2(buffer[index],
240
+ NNWW(row - 1, col - 1) + swap_cost);
241
+
242
+ #ifdef DEBUG_EDITDIST
243
+ printf("%2d ", buffer[index]);
244
+ #endif
245
+ #ifdef TRN_SPEEDUP
246
+ if (buffer[index] < low || col == 0)
247
+ low = buffer[index];
248
+ #endif
249
+
250
+ index = mod(index + 1);
251
+ } /* for col = 1 */
252
+ #ifdef DEBUG_EDITDIST
253
+ if (row < to_len - 1)
254
+ printf("\n %c %2d ", to[row+1], (row + 2) * ins);
255
+ else
256
+ printf("\n");
257
+ #endif
258
+ #ifdef TRN_SPEEDUP
259
+ if (low > MIN_DIST)
260
+ break;
261
+ #endif
262
+ } /* for row = 1 */
263
+
264
+ row = buffer[mod(index + radix - 1)];
265
+ if (buffer != store)
266
+ free((char *) buffer);
267
+ return row;
268
+ } /* edit_distn */
269
+
data/ext/extconf.rb ADDED
@@ -0,0 +1,6 @@
1
+ require 'mkmf'
2
+
3
+ dir_config("spamsum_swig")
4
+
5
+ create_makefile("spamsum_swig")
6
+