jwilkins-spamsum 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +24 -0
- data/Rakefile +3 -0
- data/ext/edit_dist.c +269 -0
- data/ext/extconf.rb +6 -0
- data/ext/spamsum.c +679 -0
- data/ext/spamsum.i +16 -0
- data/ext/spamsum_wrap.c +2405 -0
- data/spamsum.gemspec +29 -0
- data/test.rb +33 -0
- metadata +63 -0
data/ext/spamsum.c
ADDED
@@ -0,0 +1,679 @@
|
|
1
|
+
/*
|
2
|
+
this is a checksum routine that is specifically designed for spam.
|
3
|
+
Copyright Andrew Tridgell <tridge@samba.org> 2002
|
4
|
+
|
5
|
+
This code is released under the GNU General Public License version 2
|
6
|
+
or later. Alternatively, you may also use this code under the terms
|
7
|
+
of the Perl Artistic license.
|
8
|
+
|
9
|
+
If you wish to distribute this code under the terms of a different
|
10
|
+
free software license then please ask me. If there is a good reason
|
11
|
+
then I will probably say yes.
|
12
|
+
|
13
|
+
---
|
14
|
+
|
15
|
+
Modified by Russell Keith-Magee, 20 Jan 2009:
|
16
|
+
* removed the condition preventing comparison of small block sizes
|
17
|
+
(lines 364-366)
|
18
|
+
* Modified the help string to be legal cross platform C.
|
19
|
+
*/
|
20
|
+
#include <stdio.h>
|
21
|
+
#include <stdlib.h>
|
22
|
+
#include <string.h>
|
23
|
+
#include <fcntl.h>
|
24
|
+
#include <errno.h>
|
25
|
+
#include <sys/mman.h>
|
26
|
+
#include <sys/stat.h>
|
27
|
+
#include <unistd.h>
|
28
|
+
#include <ctype.h>
|
29
|
+
|
30
|
+
/* length of the first signature string; its characters are drawn from a
   base64 alphabet */
#define SPAMSUM_LENGTH 64

/* block size the automatic guess in spamsum() starts from */
#define MIN_BLOCKSIZE 3

/* multiplier and seed used by sum_hash() */
#define HASH_PRIME 0x01000193
#define HASH_INIT 0x28021967

/* number of slots in the rolling hash window */
#define ROLLING_WINDOW 7

#ifndef MIN
#define MIN(a,b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef MAX
#define MAX(a,b) (((a) > (b)) ? (a) : (b))
#endif

typedef unsigned int u32;
typedef unsigned char uchar;

/* bits accepted in the 'flags' argument of spamsum() */
#define FLAG_IGNORE_WHITESPACE 0x1
#define FLAG_IGNORE_HEADERS 0x2
|
52
|
+
|
53
|
+
static struct {
|
54
|
+
uchar window[ROLLING_WINDOW];
|
55
|
+
u32 h1, h2, h3;
|
56
|
+
u32 n;
|
57
|
+
} roll_state;
|
58
|
+
|
59
|
+
/*
|
60
|
+
a rolling hash, based on the Adler checksum. By using a rolling hash
|
61
|
+
we can perform auto resynchronisation after inserts/deletes
|
62
|
+
|
63
|
+
internally, h1 is the sum of the bytes in the window and h2
|
64
|
+
is the sum of the bytes times the index
|
65
|
+
|
66
|
+
h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
|
67
|
+
we can cope with large blocksize values
|
68
|
+
*/
|
69
|
+
static inline u32 roll_hash(uchar c)
|
70
|
+
{
|
71
|
+
roll_state.h2 -= roll_state.h1;
|
72
|
+
roll_state.h2 += ROLLING_WINDOW * c;
|
73
|
+
|
74
|
+
roll_state.h1 += c;
|
75
|
+
roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW];
|
76
|
+
|
77
|
+
roll_state.window[roll_state.n % ROLLING_WINDOW] = c;
|
78
|
+
roll_state.n++;
|
79
|
+
|
80
|
+
roll_state.h3 = (roll_state.h3 << 5) & 0xFFFFFFFF;
|
81
|
+
roll_state.h3 ^= c;
|
82
|
+
|
83
|
+
return roll_state.h1 + roll_state.h2 + roll_state.h3;
|
84
|
+
}
|
85
|
+
|
86
|
+
/*
|
87
|
+
reset the state of the rolling hash and return the initial rolling hash value
|
88
|
+
*/
|
89
|
+
static u32 roll_reset(void)
|
90
|
+
{
|
91
|
+
memset(&roll_state, 0, sizeof(roll_state));
|
92
|
+
return 0;
|
93
|
+
}
|
94
|
+
|
95
|
+
/* a simple non-rolling hash, based on the FNV hash */
|
96
|
+
static inline u32 sum_hash(uchar c, u32 h)
|
97
|
+
{
|
98
|
+
h *= HASH_PRIME;
|
99
|
+
h ^= c;
|
100
|
+
return h;
|
101
|
+
}
|
102
|
+
|
103
|
+
/*
|
104
|
+
take a message of length 'length' and return a string representing a hash of that message,
|
105
|
+
prefixed by the selected blocksize
|
106
|
+
*/
|
107
|
+
char *spamsum(const uchar *in, u32 length, u32 flags, u32 bsize)
|
108
|
+
{
|
109
|
+
const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
110
|
+
char *ret, *p;
|
111
|
+
u32 total_chars;
|
112
|
+
u32 h, h2, h3;
|
113
|
+
u32 j, n, i, k;
|
114
|
+
u32 block_size;
|
115
|
+
uchar ret2[SPAMSUM_LENGTH/2 + 1];
|
116
|
+
|
117
|
+
/* if we are ignoring email headers then skip past them now */
|
118
|
+
if (flags & FLAG_IGNORE_HEADERS) {
|
119
|
+
const uchar *s = strstr(in, "\n\n");
|
120
|
+
if (s) {
|
121
|
+
length -= (s+2 - in);
|
122
|
+
in = s+2;
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
if (flags & FLAG_IGNORE_WHITESPACE) {
|
127
|
+
/* count the non-ignored chars */
|
128
|
+
for (n=0, i=0; i<length; i++) {
|
129
|
+
if (isspace(in[i])) continue;
|
130
|
+
n++;
|
131
|
+
}
|
132
|
+
total_chars = n;
|
133
|
+
} else {
|
134
|
+
total_chars = length;
|
135
|
+
}
|
136
|
+
|
137
|
+
if (bsize == 0) {
|
138
|
+
/* guess a reasonable block size */
|
139
|
+
block_size = MIN_BLOCKSIZE;
|
140
|
+
while (block_size * SPAMSUM_LENGTH < total_chars) {
|
141
|
+
block_size = block_size * 2;
|
142
|
+
}
|
143
|
+
} else {
|
144
|
+
block_size = bsize;
|
145
|
+
}
|
146
|
+
|
147
|
+
ret = malloc(SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 + 20);
|
148
|
+
if (!ret) return NULL;
|
149
|
+
|
150
|
+
again:
|
151
|
+
/* the first part of the spamsum signature is the blocksize */
|
152
|
+
snprintf(ret, 12, "%u:", block_size);
|
153
|
+
p = ret + strlen(ret);
|
154
|
+
|
155
|
+
memset(p, 0, SPAMSUM_LENGTH+1);
|
156
|
+
memset(ret2, 0, sizeof(ret2));
|
157
|
+
|
158
|
+
k = j = 0;
|
159
|
+
h3 = h2 = HASH_INIT;
|
160
|
+
h = roll_reset();
|
161
|
+
|
162
|
+
for (i=0; i<length; i++) {
|
163
|
+
if ((flags & FLAG_IGNORE_WHITESPACE) &&
|
164
|
+
isspace(in[i])) continue;
|
165
|
+
|
166
|
+
/*
|
167
|
+
at each character we update the rolling hash and
|
168
|
+
the normal hash. When the rolling hash hits the
|
169
|
+
reset value then we emit the normal hash as a
|
170
|
+
element of the signature and reset both hashes
|
171
|
+
*/
|
172
|
+
h = roll_hash(in[i]);
|
173
|
+
h2 = sum_hash(in[i], h2);
|
174
|
+
h3 = sum_hash(in[i], h3);
|
175
|
+
|
176
|
+
if (h % block_size == (block_size-1)) {
|
177
|
+
/* we have hit a reset point. We now emit a
|
178
|
+
hash which is based on all chacaters in the
|
179
|
+
piece of the message between the last reset
|
180
|
+
point and this one */
|
181
|
+
p[j] = b64[h2 % 64];
|
182
|
+
if (j < SPAMSUM_LENGTH-1) {
|
183
|
+
/* we can have a problem with the tail
|
184
|
+
overflowing. The easiest way to
|
185
|
+
cope with this is to only reset the
|
186
|
+
second hash if we have room for
|
187
|
+
more characters in our
|
188
|
+
signature. This has the effect of
|
189
|
+
combining the last few pieces of
|
190
|
+
the message into a single piece */
|
191
|
+
h2 = HASH_INIT;
|
192
|
+
j++;
|
193
|
+
}
|
194
|
+
}
|
195
|
+
|
196
|
+
/* this produces a second signature with a block size
|
197
|
+
of block_size*2. By producing dual signatures in
|
198
|
+
this way the effect of small changes in the message
|
199
|
+
size near a block size boundary is greatly reduced. */
|
200
|
+
if (h % (block_size*2) == ((block_size*2)-1)) {
|
201
|
+
ret2[k] = b64[h3 % 64];
|
202
|
+
if (k < SPAMSUM_LENGTH/2-1) {
|
203
|
+
h3 = HASH_INIT;
|
204
|
+
k++;
|
205
|
+
}
|
206
|
+
}
|
207
|
+
}
|
208
|
+
|
209
|
+
/* if we have anything left then add it to the end. This
|
210
|
+
ensures that the last part of the message is always
|
211
|
+
considered */
|
212
|
+
if (h != 0) {
|
213
|
+
p[j] = b64[h2 % 64];
|
214
|
+
ret2[k] = b64[h3 % 64];
|
215
|
+
}
|
216
|
+
|
217
|
+
strcat(p+j, ":");
|
218
|
+
strcat(p+j, ret2);
|
219
|
+
|
220
|
+
/* our blocksize guess may have been way off - repeat if necessary */
|
221
|
+
if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) {
|
222
|
+
block_size = block_size / 2;
|
223
|
+
goto again;
|
224
|
+
}
|
225
|
+
|
226
|
+
return ret;
|
227
|
+
}
|
228
|
+
|
229
|
+
|
230
|
+
/*
|
231
|
+
we only accept a match if we have at least one common substring in
|
232
|
+
the signature of length ROLLING_WINDOW. This dramatically drops the
|
233
|
+
false positive rate for low score thresholds while having
|
234
|
+
negligable affect on the rate of spam detection.
|
235
|
+
|
236
|
+
return 1 if the two strings do have a common substring, 0 otherwise
|
237
|
+
*/
|
238
|
+
static int has_common_substring(const char *s1, const char *s2)
|
239
|
+
{
|
240
|
+
int i, j;
|
241
|
+
int num_hashes;
|
242
|
+
u32 hashes[SPAMSUM_LENGTH];
|
243
|
+
|
244
|
+
/* there are many possible algorithms for common substring
|
245
|
+
detection. In this case I am re-using the rolling hash code
|
246
|
+
to act as a filter for possible substring matches */
|
247
|
+
|
248
|
+
roll_reset();
|
249
|
+
memset(hashes, 0, sizeof(hashes));
|
250
|
+
|
251
|
+
/* first compute the windowed rolling hash at each offset in
|
252
|
+
the first string */
|
253
|
+
for (i=0;s1[i];i++) {
|
254
|
+
hashes[i] = roll_hash((uchar)s1[i]);
|
255
|
+
}
|
256
|
+
num_hashes = i;
|
257
|
+
|
258
|
+
roll_reset();
|
259
|
+
|
260
|
+
/* now for each offset in the second string compute the
|
261
|
+
rolling hash and compare it to all of the rolling hashes
|
262
|
+
for the first string. If one matches then we have a
|
263
|
+
candidate substring match. We then confirm that match with
|
264
|
+
a direct string comparison */
|
265
|
+
for (i=0;s2[i];i++) {
|
266
|
+
u32 h = roll_hash((uchar)s2[i]);
|
267
|
+
if (i < ROLLING_WINDOW-1) continue;
|
268
|
+
for (j=ROLLING_WINDOW-1;j<num_hashes;j++) {
|
269
|
+
if (hashes[j] != 0 && hashes[j] == h) {
|
270
|
+
/* we have a potential match - confirm it */
|
271
|
+
if (strlen(s2+i-(ROLLING_WINDOW-1)) >= ROLLING_WINDOW &&
|
272
|
+
strncmp(s2+i-(ROLLING_WINDOW-1),
|
273
|
+
s1+j-(ROLLING_WINDOW-1),
|
274
|
+
ROLLING_WINDOW) == 0) {
|
275
|
+
return 1;
|
276
|
+
}
|
277
|
+
}
|
278
|
+
}
|
279
|
+
}
|
280
|
+
|
281
|
+
return 0;
|
282
|
+
}
|
283
|
+
|
284
|
+
|
285
|
+
/*
  eliminate sequences of longer than 3 identical characters. These
  sequences contain very little information so they tend to just bias
  the result unfairly

  Returns a malloc()ed copy of 'str' in which every character that
  equals all three characters before it has been dropped. The caller
  must free() the result. Returns NULL if the allocation fails.
*/
static char *eliminate_sequences(const char *str)
{
	size_t len = strlen(str);
	size_t i, j;
	char *ret = malloc(len + 1);

	if (!ret) return NULL;

	/* the first three characters can never be the fourth of a run, so
	   they are copied unchanged. For shorter strings only 'len' bytes
	   exist, so the terminator below stays inside the buffer */
	j = len < 3 ? len : 3;
	memcpy(ret, str, j);

	for (i = 3; i < len; i++) {
		if (str[i] != str[i-1] ||
		    str[i] != str[i-2] ||
		    str[i] != str[i-3]) {
			ret[j++] = str[i];
		}
	}

	ret[j] = 0;

	return ret;
}
|
312
|
+
|
313
|
+
/*
|
314
|
+
this is the low level string scoring algorithm. It takes two strings
|
315
|
+
and scores them on a scale of 0-100 where 0 is a terrible match and
|
316
|
+
100 is a great match. The block_size is used to cope with very small
|
317
|
+
messages.
|
318
|
+
*/
|
319
|
+
static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
|
320
|
+
{
|
321
|
+
u32 score;
|
322
|
+
u32 len1, len2;
|
323
|
+
int edit_distn(const char *from, int from_len, const char *to, int to_len);
|
324
|
+
|
325
|
+
len1 = strlen(s1);
|
326
|
+
len2 = strlen(s2);
|
327
|
+
|
328
|
+
if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
|
329
|
+
/* not a real spamsum signature? */
|
330
|
+
return 0;
|
331
|
+
}
|
332
|
+
|
333
|
+
/* the two strings must have a common substring of length
|
334
|
+
ROLLING_WINDOW to be candidates */
|
335
|
+
if (has_common_substring(s1, s2) == 0) {
|
336
|
+
return 0;
|
337
|
+
}
|
338
|
+
|
339
|
+
/* compute the edit distance between the two strings. The edit distance gives
|
340
|
+
us a pretty good idea of how closely related the two strings are */
|
341
|
+
score = edit_distn(s1, len1, s2, len2);
|
342
|
+
|
343
|
+
/* scale the edit distance by the lengths of the two
|
344
|
+
strings. This changes the score to be a measure of the
|
345
|
+
proportion of the message that has changed rather than an
|
346
|
+
absolute quantity. It also copes with the variability of
|
347
|
+
the string lengths. */
|
348
|
+
score = (score * SPAMSUM_LENGTH) / (len1 + len2);
|
349
|
+
|
350
|
+
/* at this stage the score occurs roughly on a 0-64 scale,
|
351
|
+
* with 0 being a good match and 64 being a complete
|
352
|
+
* mismatch */
|
353
|
+
|
354
|
+
/* rescale to a 0-100 scale (friendlier to humans) */
|
355
|
+
score = (100 * score) / 64;
|
356
|
+
|
357
|
+
/* it is possible to get a score above 100 here, but it is a
|
358
|
+
really terrible match */
|
359
|
+
if (score >= 100) return 0;
|
360
|
+
|
361
|
+
/* now re-scale on a 0-100 scale with 0 being a poor match and
|
362
|
+
100 being a excellent match. */
|
363
|
+
score = 100 - score;
|
364
|
+
|
365
|
+
/* when the blocksize is small we may not want to exaggerate the match size */
|
366
|
+
// if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2)) {
|
367
|
+
// score = block_size/MIN_BLOCKSIZE * MIN(len1, len2);
|
368
|
+
// }
|
369
|
+
|
370
|
+
return score;
|
371
|
+
}
|
372
|
+
|
373
|
+
/*
|
374
|
+
given two spamsum strings return a value indicating the degree to which they match.
|
375
|
+
*/
|
376
|
+
u32 spamsum_match(const char *str1, const char *str2)
|
377
|
+
{
|
378
|
+
u32 block_size1, block_size2;
|
379
|
+
u32 score = 0;
|
380
|
+
char *s1, *s2;
|
381
|
+
char *s1_1, *s1_2;
|
382
|
+
char *s2_1, *s2_2;
|
383
|
+
|
384
|
+
/* each spamsum is prefixed by its block size */
|
385
|
+
if (sscanf(str1, "%u:", &block_size1) != 1 ||
|
386
|
+
sscanf(str2, "%u:", &block_size2) != 1) {
|
387
|
+
return 0;
|
388
|
+
}
|
389
|
+
|
390
|
+
/* if the blocksizes don't match then we are comparing
|
391
|
+
apples to oranges ... */
|
392
|
+
if (block_size1 != block_size2 &&
|
393
|
+
block_size1 != block_size2*2 &&
|
394
|
+
block_size2 != block_size1*2) {
|
395
|
+
return 0;
|
396
|
+
}
|
397
|
+
|
398
|
+
/* move past the prefix */
|
399
|
+
str1 = strchr(str1, ':');
|
400
|
+
str2 = strchr(str2, ':');
|
401
|
+
|
402
|
+
if (!str1 || !str2) {
|
403
|
+
/* badly formed ... */
|
404
|
+
return 0;
|
405
|
+
}
|
406
|
+
|
407
|
+
/* there is very little information content is sequences of
|
408
|
+
the same character like 'LLLLL'. Eliminate any sequences
|
409
|
+
longer than 3. This is especially important when combined
|
410
|
+
with the has_common_substring() test below. */
|
411
|
+
s1 = eliminate_sequences(str1+1);
|
412
|
+
s2 = eliminate_sequences(str2+1);
|
413
|
+
|
414
|
+
if (!s1 || !s2) return -4;
|
415
|
+
|
416
|
+
/* now break them into the two pieces */
|
417
|
+
s1_1 = s1;
|
418
|
+
s2_1 = s2;
|
419
|
+
|
420
|
+
s1_2 = strchr(s1, ':');
|
421
|
+
s2_2 = strchr(s2, ':');
|
422
|
+
|
423
|
+
if (!s1_2 || !s2_2) {
|
424
|
+
/* a signature is malformed - it doesn't have 2 parts */
|
425
|
+
free(s1); free(s2);
|
426
|
+
return 0;
|
427
|
+
}
|
428
|
+
|
429
|
+
*s1_2++ = 0;
|
430
|
+
*s2_2++ = 0;
|
431
|
+
|
432
|
+
/* each signature has a string for two block sizes. We now
|
433
|
+
choose how to combine the two block sizes. We checked above
|
434
|
+
that they have at least one block size in common */
|
435
|
+
if (block_size1 == block_size2) {
|
436
|
+
u32 score1, score2;
|
437
|
+
score1 = score_strings(s1_1, s2_1, block_size1);
|
438
|
+
score2 = score_strings(s1_2, s2_2, block_size2);
|
439
|
+
score = MAX(score1, score2);
|
440
|
+
} else if (block_size1 == block_size2*2) {
|
441
|
+
score = score_strings(s1_1, s2_2, block_size1);
|
442
|
+
} else {
|
443
|
+
score = score_strings(s1_2, s2_1, block_size2);
|
444
|
+
}
|
445
|
+
|
446
|
+
free(s1);
|
447
|
+
free(s2);
|
448
|
+
|
449
|
+
return score;
|
450
|
+
}
|
451
|
+
|
452
|
+
/*
|
453
|
+
return the maximum match for a file containing a list of spamsums
|
454
|
+
*/
|
455
|
+
u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold)
|
456
|
+
{
|
457
|
+
FILE *f;
|
458
|
+
char line[100];
|
459
|
+
u32 best = 0;
|
460
|
+
|
461
|
+
f = fopen(fname, "r");
|
462
|
+
if (!f) return 0;
|
463
|
+
|
464
|
+
/* on each line of the database we compute the spamsum match
|
465
|
+
score. We then pick the best score */
|
466
|
+
while (fgets(line, sizeof(line)-1, f)) {
|
467
|
+
u32 score;
|
468
|
+
int len;
|
469
|
+
len = strlen(line);
|
470
|
+
if (line[len-1] == '\n') line[len-1] = 0;
|
471
|
+
|
472
|
+
score = spamsum_match(sum, line);
|
473
|
+
|
474
|
+
if (score > best) {
|
475
|
+
best = score;
|
476
|
+
if (best >= threshold) break;
|
477
|
+
}
|
478
|
+
}
|
479
|
+
|
480
|
+
fclose(f);
|
481
|
+
|
482
|
+
return best;
|
483
|
+
}
|
484
|
+
|
485
|
+
/*
|
486
|
+
return the spamsum on stdin
|
487
|
+
*/
|
488
|
+
static char *spamsum_stdin(u32 flags, u32 block_size)
|
489
|
+
{
|
490
|
+
uchar buf[10*1024];
|
491
|
+
uchar *msg;
|
492
|
+
u32 length = 0;
|
493
|
+
int n;
|
494
|
+
char *sum;
|
495
|
+
|
496
|
+
msg = malloc(sizeof(buf));
|
497
|
+
if (!msg) return NULL;
|
498
|
+
|
499
|
+
/* load the file, expanding the allocation as needed. */
|
500
|
+
while (1) {
|
501
|
+
n = read(0, buf, sizeof(buf));
|
502
|
+
if (n == -1 && errno == EINTR) continue;
|
503
|
+
if (n <= 0) break;
|
504
|
+
|
505
|
+
msg = realloc(msg, length + n);
|
506
|
+
if (!msg) return NULL;
|
507
|
+
|
508
|
+
memcpy(msg+length, buf, n);
|
509
|
+
length += n;
|
510
|
+
}
|
511
|
+
|
512
|
+
sum = spamsum(msg, length, flags, block_size);
|
513
|
+
|
514
|
+
free(msg);
|
515
|
+
|
516
|
+
return sum;
|
517
|
+
}
|
518
|
+
|
519
|
+
|
520
|
+
/*
|
521
|
+
return the spamsum on a file
|
522
|
+
*/
|
523
|
+
char *spamsum_file(const char *fname, u32 flags, u32 block_size)
|
524
|
+
{
|
525
|
+
int fd;
|
526
|
+
char *sum;
|
527
|
+
struct stat st;
|
528
|
+
uchar *msg;
|
529
|
+
|
530
|
+
if (strcmp(fname, "-") == 0) {
|
531
|
+
return spamsum_stdin(flags, block_size);
|
532
|
+
}
|
533
|
+
|
534
|
+
fd = open(fname, O_RDONLY);
|
535
|
+
if (fd == -1) {
|
536
|
+
perror(fname);
|
537
|
+
return NULL;
|
538
|
+
}
|
539
|
+
|
540
|
+
if (fstat(fd, &st) == -1) {
|
541
|
+
perror("fstat");
|
542
|
+
return NULL;
|
543
|
+
}
|
544
|
+
|
545
|
+
msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
|
546
|
+
if (msg == (uchar *)-1) {
|
547
|
+
perror("mmap");
|
548
|
+
return NULL;
|
549
|
+
}
|
550
|
+
close(fd);
|
551
|
+
|
552
|
+
sum = spamsum(msg, st.st_size, flags, block_size);
|
553
|
+
|
554
|
+
munmap(msg, st.st_size);
|
555
|
+
|
556
|
+
return sum;
|
557
|
+
}
|
558
|
+
|
559
|
+
/* write the usage text to standard output */
static void show_help(void)
{
	static const char usage[] =
		"\n"
		"spamsum v1.1 written by Andrew Tridgell <tridge@samba.org>\n"
		"\n"
		"spamsum computes a signature string that is particular good for detecting if two emails\n"
		"are very similar. This can be used to detect SPAM.\n"
		"\n"
		"Syntax:\n"
		"spamsum [options] <files>\n"
		"or\n"
		"spamsum [options] -d sigs.txt -c SIG\n"
		"or\n"
		"spamsum [options] -d sigs.txt -C file\n"
		"\n"
		"When called with a list of filenames spamsum will write out the\n"
		"signatures of each file on a separate line. You can specify the\n"
		"filename '-' for standard input.\n"
		"\n"
		"When called with the second form, spamsum will print the best score\n"
		"for the given signature with the signatures in the given database. A\n"
		"score of 100 means a perfect match, and a score of 0 means a complete\n"
		"mismatch.\n"
		"\n"
		"When checking, spamsum returns 0 (success) when the message *is* spam,\n"
		"1 for internal errors, and 2 for messages whose signature is not\n"
		"found.\n"
		"\n"
		"The 3rd form is just like the second form, but you pass a file\n"
		"containing a message instead of a pre-computed signature.\n"
		"\n"
		"Options:\n"
		"-W ignore whitespace\n"
		"-H skip past mail headers\n"
		"-B <bsize> force a block size of bsize\n"
		"-T <threshold> set the threshold above which spamsum will stop\n"
		"looking (default 90)\n";

	fputs(usage, stdout);
}
|
598
|
+
|
599
|
+
int main(int argc, char *argv[])
|
600
|
+
{
|
601
|
+
char *sum;
|
602
|
+
extern char *optarg;
|
603
|
+
extern int optind;
|
604
|
+
int c;
|
605
|
+
char *dbname = NULL;
|
606
|
+
u32 score;
|
607
|
+
int i;
|
608
|
+
u32 flags = 0;
|
609
|
+
u32 block_size = 0;
|
610
|
+
u32 threshold = 90;
|
611
|
+
|
612
|
+
while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) {
|
613
|
+
switch (c) {
|
614
|
+
case 'W':
|
615
|
+
flags |= FLAG_IGNORE_WHITESPACE;
|
616
|
+
break;
|
617
|
+
|
618
|
+
case 'H':
|
619
|
+
flags |= FLAG_IGNORE_HEADERS;
|
620
|
+
break;
|
621
|
+
|
622
|
+
case 'd':
|
623
|
+
dbname = optarg;
|
624
|
+
break;
|
625
|
+
|
626
|
+
case 'B':
|
627
|
+
block_size = atoi(optarg);
|
628
|
+
break;
|
629
|
+
|
630
|
+
case 'T':
|
631
|
+
threshold = atoi(optarg);
|
632
|
+
break;
|
633
|
+
|
634
|
+
case 'c':
|
635
|
+
if (!dbname) {
|
636
|
+
show_help();
|
637
|
+
exit(1);
|
638
|
+
}
|
639
|
+
score = spamsum_match_db(dbname, optarg,
|
640
|
+
threshold);
|
641
|
+
printf("%u\n", score);
|
642
|
+
exit(score >= threshold ? 0 : 2);
|
643
|
+
|
644
|
+
case 'C':
|
645
|
+
if (!dbname) {
|
646
|
+
show_help();
|
647
|
+
exit(1);
|
648
|
+
}
|
649
|
+
score = spamsum_match_db(dbname,
|
650
|
+
spamsum_file(optarg, flags,
|
651
|
+
block_size),
|
652
|
+
threshold);
|
653
|
+
printf("%u\n", score);
|
654
|
+
exit(score >= threshold ? 0 : 2);
|
655
|
+
|
656
|
+
case 'h':
|
657
|
+
default:
|
658
|
+
show_help();
|
659
|
+
exit(0);
|
660
|
+
}
|
661
|
+
}
|
662
|
+
|
663
|
+
argc -= optind;
|
664
|
+
argv += optind;
|
665
|
+
|
666
|
+
if (argc == 0) {
|
667
|
+
show_help();
|
668
|
+
return 0;
|
669
|
+
}
|
670
|
+
|
671
|
+
/* compute the spamsum on a list of files */
|
672
|
+
for (i=0;i<argc;i++) {
|
673
|
+
sum = spamsum_file(argv[i], flags, block_size);
|
674
|
+
printf("%s\n", sum);
|
675
|
+
free(sum);
|
676
|
+
}
|
677
|
+
|
678
|
+
return 0;
|
679
|
+
}
|
data/ext/spamsum.i
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
%include "cpointer.i"
%include "typemaps.i"
%module "spamsum_swig"

/* Map a single Ruby String onto a (pointer, length) argument pair.
   StringValuePtr and RSTRING_LEN replace STR2CSTR and RSTRING(x)->len,
   which are not available in Ruby 1.9 and later. The parameter names
   match the two pairs in the edit_distn() declaration below. */
%typemap(in) (char *s1, int s1_len) {
  $1 = StringValuePtr($input);
  $2 = (int) RSTRING_LEN($input);
};
%typemap(in) (char *s2, int s2_len) {
  $1 = StringValuePtr($input);
  $2 = (int) RSTRING_LEN($input);
};

/* functions exported to Ruby */
int edit_distn(char *s1, int s1_len, char *s2, int s2_len);
char *spamsum(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
unsigned int spamsum_match(char *s1, char *s2);
|