jwilkins-spamsum 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,24 @@
1
+ SpamSum v0.1
2
+ ==============
3
+ SpamSum is a distance based hash. This is the opposite of a cryptographic
4
+ hash in that two slightly different inputs will have similar (or even
5
+ identical) outputs.
6
+
7
+ The original spamsum code was released under the Perl Artistic License
8
+ and is Copyright Andrew Tridgell <tridge@samba.org> 2002
9
+ It is available here:
10
+ http://www.samba.org/junkcode/#spamsum
11
+ or here:
12
+ http://linux.anu.edu.au/linux.conf.au/2004/papers/junkcode/spamsum
13
+
14
+ More information on the algorithm itself is here:
15
+ http://samba.org/ftp/unpacked/junkcode/spamsum/README
16
+
17
+ Ruby wrapper is released under GPL v2
18
+ Copyright 2009 Jonathan Wilkins <jwilkins@bitland.net>
19
+
20
+ Building
21
+ --------
22
+ $ swig -ruby spamsum.i
23
+ $ ruby extconf.rb
24
+ $ make
data/Rakefile ADDED
@@ -0,0 +1,3 @@
1
# Default task: generate the SWIG wrapper, create the Makefile and build the
# extension inside ext/.
task :default do
  # `sh` echoes the command and raises if any step exits non-zero, so a failed
  # swig/extconf/make run aborts the task instead of being silently ignored
  # (backticks captured the output and discarded the exit status).
  sh "cd ext && swig -ruby spamsum.i && ruby extconf.rb && make"
end
data/ext/edit_dist.c ADDED
@@ -0,0 +1,269 @@
1
+ /*
2
+ This edit distance code is taken from trn3.6. A few minor
3
+ modifications have been made by Andrew Tridgell <tridge@samba.org>
4
+ for use in spamsum.
5
+ */
6
+
7
+
8
+ /***************************************************************************/
9
+
10
+
11
+ /* The authors make no claims as to the fitness or correctness of this software
12
+ * for any use whatsoever, and it is provided as is. Any use of this software
13
+ * is at the user's own risk.
14
+ */
15
+
16
+ #include <stdio.h>
17
+ #include <unistd.h>
18
+ #include <stdlib.h>
19
+
20
+ /* edit_dist -- returns the minimum edit distance between two strings
21
+
22
+ Program by: Mark Maimone CMU Computer Science 13 Nov 89
23
+ Last Modified: 28 Jan 90
24
+
25
+ If the input strings have length n and m, the algorithm runs in time
26
+ O(nm) and space O(min(m,n)).
27
+
28
+ HISTORY
29
+ 13 Nov 89 (mwm) Created edit_dist() and set_costs().
30
+
31
+ 28 Jan 90 (mwm) Added view_costs(). Should verify that THRESHOLD
32
+ computations will work even when THRESHOLD is not a multiple of
33
+ sizeof(int).
34
+
35
+ 17 May 93 (mwm) Improved performance when used with trn's newsgroup
36
+ processing; assume all costs are 1, and you can terminate when a
37
+ threshold is exceeded.
38
+ */
39
+
40
#define MIN_DIST 100		/* with TRN_SPEEDUP, stop once every cell of
				   the current row exceeds this distance */

#define TRN_SPEEDUP		/* Use a less-general version of the
				   routine with compile-time costs
				   (insert 1, delete 1, change 3, swap 5);
				   it's okay to terminate if the edit
				   distance is known to exceed MIN_DIST */

#define THRESHOLD 4000		/* worry about allocating more memory only
				   when this # of bytes is exceeded */
#define STRLENTHRESHOLD ((int) ((THRESHOLD / sizeof (int) - 3) / 2))

#define SAFE_ASSIGN(x,y) (((x) != NULL) ? (*(x) = (y)) : (y))

#define swap_int(x,y)  (_iswap = (x), (x) = (y), (y) = _iswap)
#define swap_char(x,y) (_cswap = (x), (x) = (y), (y) = _cswap)
#define min3(x,y,z) (_mx = (x), _my = (y), _mz = (z), (_mx < _my ? (_mx < _mz ? _mx : _mz) : (_mz < _my) ? _mz : _my))
#define min2(x,y) (_mx = (x), _my = (y), (_mx < _my ? _mx : _my))


static int insert_cost = 1;
static int delete_cost = 1;
#ifndef TRN_SPEEDUP
static int change_cost = 1;
static int swap_cost = 1;
#endif

static int _iswap;		/* swap_int temp variable */
static char *_cswap;		/* swap_char temp variable */
static int _mx, _my, _mz;	/* min2, min3 temp variables */



/* edit_distn -- returns the edit distance between two strings, or -1 on
   failure

   from, from_len   first string and its length in bytes
   to, to_len       second string and its length in bytes

   A NULL pointer or a zero length is treated as an empty string.  A
   negative length, or failure to allocate the work buffer, returns -1.

   With TRN_SPEEDUP defined the scan stops as soon as every cell of the
   current row exceeds MIN_DIST, so results above MIN_DIST are not exact.

   The routine keeps its work row in a static array and uses file-scope
   temporaries in the min/swap macros, so it is not reentrant. */

int
edit_distn(char *from, int from_len, char *to, int to_len)
{
#ifndef TRN_SPEEDUP
    int ins, del, ch;		/* local copies of edit costs */
#endif
    int row, col, index;	/* dynamic programming counters */
    int radix;			/* radix for modular indexing */
    int dist;			/* value of the last computed cell */
#ifdef TRN_SPEEDUP
    int low;			/* smallest value seen in the current row */
#endif
    int *buffer;		/* pointer to storage for the rolling
				   window of the d.p. array */
    static int store[THRESHOLD / sizeof (int)];
				/* a small amount of static
				   storage, to be used when the
				   input strings are small enough */

/* Reject negative lengths; they would yield a bogus radix below */

    if (from_len < 0 || to_len < 0) {
	return -1;
    }

/* Handle trivial cases when one string is empty */

    if (from == NULL || !from_len) {
	if (to == NULL || !to_len) {
	    return 0;
	}
	return to_len * insert_cost;
    }
    if (to == NULL || !to_len) {
	return from_len * delete_cost;
    }

/* Initialize costs */

#ifdef TRN_SPEEDUP
#define ins 1
#define del 1
#define ch 3
#define swap_cost 5
#else
    ins = insert_cost;
    del = delete_cost;
    ch = change_cost;
#endif

/* Make   from   short enough to fit in the static storage, if it's at all
   possible */

    if (from_len > to_len && from_len > STRLENTHRESHOLD) {
	swap_int(from_len, to_len);
	swap_char(from, to);
#ifndef TRN_SPEEDUP
	swap_int(ins, del);
#endif
    } /* if from_len > to_len */

/* The radix must be derived from the row length actually used (i.e. after
   the swap above).  A stale radix makes the NW/N/W/NNWW offsets address the
   wrong cells and lets   index   run past the end of   store. */

    radix = 2 * from_len + 3;

/* Allocate the array storage (from the heap if necessary) */

    if (from_len <= STRLENTHRESHOLD) {
	buffer = store;
    } else {
	buffer = malloc((size_t) radix * sizeof (int));
	if (buffer == NULL) {
	    return -1;
	}
    }

/* Here's where the fun begins.  We will find the minimum edit distance
   using dynamic programming.  We only need to store two rows of the matrix
   at a time, since we always progress down the matrix.  For example,
   given the strings "one" and "two", and insert, delete and change costs
   equal to 1:

	   _  o  n  e
	_  0  1  2  3
	t  1  1  2  3
	w  2  2  2  3
	o  3  2  3  3

   The dynamic programming recursion is defined as follows:

	ar(x,0) := x * insert_cost
	ar(0,y) := y * delete_cost
	ar(x,y) := min(a(x - 1, y - 1) + (from[x] == to[y] ? 0 : change),
		       a(x - 1, y) + insert_cost,
		       a(x, y - 1) + delete_cost,
		       a(x - 2, y - 2) + (from[x] == to[y-1] &&
					  from[x-1] == to[y] ? swap_cost :
					  infinity))

   Since this only looks at most two rows and three columns back, we need
   only store the values for the two preceding rows.  In this
   implementation, we do not explicitly store the zero column, so only 2 *
   from_len + 2   words are needed.  However, in the implementation of the
   swap_cost   check, the current matrix value is used as a buffer; we
   can't overwrite the earlier value until the   swap_cost   check has
   been performed.  So we use   2 * from_len + 3   elements in the buffer.
*/

#define ar(x,y,index) (((x) == 0) ? (y) * del : (((y) == 0) ? (x) * ins : \
	buffer[mod(index)]))
#define NW(x,y)	  ar(x, y, index + from_len + 2)
#define N(x,y)	  ar(x, y, index + from_len + 3)
#define W(x,y)	  ar(x, y, index + radix - 1)
#define NNWW(x,y) ar(x, y, index + 1)
#define mod(x) ((x) % radix)

    index = 0;

#ifdef DEBUG_EDITDIST
    printf("      ");
    for (col = 0; col < from_len; col++)
	printf(" %c ", from[col]);
    printf("\n   ");

    for (col = 0; col <= from_len; col++)
	printf("%2d ", col * del);
#endif

/* Row 0 is handled implicitly; its value at a given column is   col*del.
   The loop below computes the values for Row 1.  At this point we know the
   strings are nonempty.  We also don't need to consider swap costs in row
   1.

   COMMENT:  the indices   row and col   below point into the STRING, so
   the corresponding MATRIX indices are   row+1 and col+1.
*/

    buffer[index++] = min2(ins + del, (from[0] == to[0] ? 0 : ch));
#ifdef TRN_SPEEDUP
    low = buffer[mod(index + radix - 1)];
#endif

#ifdef DEBUG_EDITDIST
    printf("\n %c %2d %2d ", to[0], ins, buffer[index - 1]);
#endif

    for (col = 1; col < from_len; col++) {
	buffer[index] = min3(
		col * del + ((from[col] == to[0]) ? 0 : ch),
		(col + 1) * del + ins,
		buffer[index - 1] + del);
#ifdef TRN_SPEEDUP
	if (buffer[index] < low)
	    low = buffer[index];
#endif
	index++;

#ifdef DEBUG_EDITDIST
	printf("%2d ", buffer[index - 1]);
#endif

    } /* for col = 1 */

#ifdef DEBUG_EDITDIST
    printf("\n %c %2d ", to[1], 2 * ins);
#endif

/* Now handle the rest of the matrix */

    for (row = 1; row < to_len; row++) {
	for (col = 0; col < from_len; col++) {
	    buffer[index] = min3(
		    NW(row, col) + ((from[col] == to[row]) ? 0 : ch),
		    N(row, col + 1) + ins,
		    W(row + 1, col) + del);
	    if (from[col] == to[row - 1] && col > 0 &&
		    from[col - 1] == to[row])
		buffer[index] = min2(buffer[index],
			NNWW(row - 1, col - 1) + swap_cost);

#ifdef DEBUG_EDITDIST
	    printf("%2d ", buffer[index]);
#endif
#ifdef TRN_SPEEDUP
	    /* col == 0 restarts the per-row minimum */
	    if (buffer[index] < low || col == 0)
		low = buffer[index];
#endif

	    index = mod(index + 1);
	} /* for col = 0 */
#ifdef DEBUG_EDITDIST
	if (row < to_len - 1)
	    printf("\n %c %2d ", to[row+1], (row + 2) * ins);
	else
	    printf("\n");
#endif
#ifdef TRN_SPEEDUP
	/* every cell in this row already exceeds MIN_DIST: give up early */
	if (low > MIN_DIST)
	    break;
#endif
    } /* for row = 1 */

/* The most recently written cell holds the answer */

    dist = buffer[mod(index + radix - 1)];
    if (buffer != store)
	free(buffer);
    return dist;
} /* edit_distn */
269
+
data/ext/extconf.rb ADDED
@@ -0,0 +1,6 @@
1
# Build configuration for the SWIG-generated extension: running this script
# writes the Makefile used to compile it.
require 'mkmf'

# The same name is passed to dir_config and create_makefile.
extension_name = "spamsum_swig"

dir_config(extension_name)
create_makefile(extension_name)
6
+