jwilkins-spamsum 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
data/ext/spamsum.c ADDED
@@ -0,0 +1,679 @@
1
+ /*
2
+ this is a checksum routine that is specifically designed for spam.
3
+ Copyright Andrew Tridgell <tridge@samba.org> 2002
4
+
5
+ This code is released under the GNU General Public License version 2
6
+ or later. Alternatively, you may also use this code under the terms
7
+ of the Perl Artistic license.
8
+
9
+ If you wish to distribute this code under the terms of a different
10
+ free software license then please ask me. If there is a good reason
11
+ then I will probably say yes.
12
+
13
+ ---
14
+
15
+ Modified by Russell Keith-Magee, 20 Jan 2009:
16
+ * removed the condition preventing comparison of small block sizes
17
+ (lines 364-366)
18
+ * Modified the help string to be legal cross platform C.
19
+ */
20
+ #include <stdio.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+ #include <fcntl.h>
24
+ #include <errno.h>
25
+ #include <sys/mman.h>
26
+ #include <sys/stat.h>
27
+ #include <unistd.h>
28
+ #include <ctype.h>
29
+
30
/* the primary signature is a string of up to 64 base64 characters */
#define SPAMSUM_LENGTH 64

/* the block size search starts here and never goes below it */
#define MIN_BLOCKSIZE 3

/* multiplier and seed of the FNV style piece hash */
#define HASH_PRIME 0x01000193
#define HASH_INIT  0x28021967

/* number of bytes covered by the rolling hash window */
#define ROLLING_WINDOW 7

#ifndef MIN
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif

typedef unsigned int  u32;
typedef unsigned char uchar;

/* bits understood in the 'flags' argument of spamsum() */
#define FLAG_IGNORE_WHITESPACE 1
#define FLAG_IGNORE_HEADERS    2
52
+
53
+ static struct {
54
+ uchar window[ROLLING_WINDOW];
55
+ u32 h1, h2, h3;
56
+ u32 n;
57
+ } roll_state;
58
+
59
+ /*
60
+ a rolling hash, based on the Adler checksum. By using a rolling hash
61
+ we can perform auto resynchronisation after inserts/deletes
62
+
63
+ internally, h1 is the sum of the bytes in the window and h2
64
+ is the sum of the bytes times the index
65
+
66
+ h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
67
+ we can cope with large blocksize values
68
+ */
69
+ static inline u32 roll_hash(uchar c)
70
+ {
71
+ roll_state.h2 -= roll_state.h1;
72
+ roll_state.h2 += ROLLING_WINDOW * c;
73
+
74
+ roll_state.h1 += c;
75
+ roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW];
76
+
77
+ roll_state.window[roll_state.n % ROLLING_WINDOW] = c;
78
+ roll_state.n++;
79
+
80
+ roll_state.h3 = (roll_state.h3 << 5) & 0xFFFFFFFF;
81
+ roll_state.h3 ^= c;
82
+
83
+ return roll_state.h1 + roll_state.h2 + roll_state.h3;
84
+ }
85
+
86
+ /*
87
+ reset the state of the rolling hash and return the initial rolling hash value
88
+ */
89
+ static u32 roll_reset(void)
90
+ {
91
+ memset(&roll_state, 0, sizeof(roll_state));
92
+ return 0;
93
+ }
94
+
95
+ /* a simple non-rolling hash, based on the FNV hash */
96
+ static inline u32 sum_hash(uchar c, u32 h)
97
+ {
98
+ h *= HASH_PRIME;
99
+ h ^= c;
100
+ return h;
101
+ }
102
+
103
+ /*
104
+ take a message of length 'length' and return a string representing a hash of that message,
105
+ prefixed by the selected blocksize
106
+ */
107
+ char *spamsum(const uchar *in, u32 length, u32 flags, u32 bsize)
108
+ {
109
+ const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
110
+ char *ret, *p;
111
+ u32 total_chars;
112
+ u32 h, h2, h3;
113
+ u32 j, n, i, k;
114
+ u32 block_size;
115
+ uchar ret2[SPAMSUM_LENGTH/2 + 1];
116
+
117
+ /* if we are ignoring email headers then skip past them now */
118
+ if (flags & FLAG_IGNORE_HEADERS) {
119
+ const uchar *s = strstr(in, "\n\n");
120
+ if (s) {
121
+ length -= (s+2 - in);
122
+ in = s+2;
123
+ }
124
+ }
125
+
126
+ if (flags & FLAG_IGNORE_WHITESPACE) {
127
+ /* count the non-ignored chars */
128
+ for (n=0, i=0; i<length; i++) {
129
+ if (isspace(in[i])) continue;
130
+ n++;
131
+ }
132
+ total_chars = n;
133
+ } else {
134
+ total_chars = length;
135
+ }
136
+
137
+ if (bsize == 0) {
138
+ /* guess a reasonable block size */
139
+ block_size = MIN_BLOCKSIZE;
140
+ while (block_size * SPAMSUM_LENGTH < total_chars) {
141
+ block_size = block_size * 2;
142
+ }
143
+ } else {
144
+ block_size = bsize;
145
+ }
146
+
147
+ ret = malloc(SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 + 20);
148
+ if (!ret) return NULL;
149
+
150
+ again:
151
+ /* the first part of the spamsum signature is the blocksize */
152
+ snprintf(ret, 12, "%u:", block_size);
153
+ p = ret + strlen(ret);
154
+
155
+ memset(p, 0, SPAMSUM_LENGTH+1);
156
+ memset(ret2, 0, sizeof(ret2));
157
+
158
+ k = j = 0;
159
+ h3 = h2 = HASH_INIT;
160
+ h = roll_reset();
161
+
162
+ for (i=0; i<length; i++) {
163
+ if ((flags & FLAG_IGNORE_WHITESPACE) &&
164
+ isspace(in[i])) continue;
165
+
166
+ /*
167
+ at each character we update the rolling hash and
168
+ the normal hash. When the rolling hash hits the
169
+ reset value then we emit the normal hash as a
170
+ element of the signature and reset both hashes
171
+ */
172
+ h = roll_hash(in[i]);
173
+ h2 = sum_hash(in[i], h2);
174
+ h3 = sum_hash(in[i], h3);
175
+
176
+ if (h % block_size == (block_size-1)) {
177
+ /* we have hit a reset point. We now emit a
178
+ hash which is based on all chacaters in the
179
+ piece of the message between the last reset
180
+ point and this one */
181
+ p[j] = b64[h2 % 64];
182
+ if (j < SPAMSUM_LENGTH-1) {
183
+ /* we can have a problem with the tail
184
+ overflowing. The easiest way to
185
+ cope with this is to only reset the
186
+ second hash if we have room for
187
+ more characters in our
188
+ signature. This has the effect of
189
+ combining the last few pieces of
190
+ the message into a single piece */
191
+ h2 = HASH_INIT;
192
+ j++;
193
+ }
194
+ }
195
+
196
+ /* this produces a second signature with a block size
197
+ of block_size*2. By producing dual signatures in
198
+ this way the effect of small changes in the message
199
+ size near a block size boundary is greatly reduced. */
200
+ if (h % (block_size*2) == ((block_size*2)-1)) {
201
+ ret2[k] = b64[h3 % 64];
202
+ if (k < SPAMSUM_LENGTH/2-1) {
203
+ h3 = HASH_INIT;
204
+ k++;
205
+ }
206
+ }
207
+ }
208
+
209
+ /* if we have anything left then add it to the end. This
210
+ ensures that the last part of the message is always
211
+ considered */
212
+ if (h != 0) {
213
+ p[j] = b64[h2 % 64];
214
+ ret2[k] = b64[h3 % 64];
215
+ }
216
+
217
+ strcat(p+j, ":");
218
+ strcat(p+j, ret2);
219
+
220
+ /* our blocksize guess may have been way off - repeat if necessary */
221
+ if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) {
222
+ block_size = block_size / 2;
223
+ goto again;
224
+ }
225
+
226
+ return ret;
227
+ }
228
+
229
+
230
+ /*
231
+ we only accept a match if we have at least one common substring in
232
+ the signature of length ROLLING_WINDOW. This dramatically drops the
233
+ false positive rate for low score thresholds while having
234
+ negligable affect on the rate of spam detection.
235
+
236
+ return 1 if the two strings do have a common substring, 0 otherwise
237
+ */
238
+ static int has_common_substring(const char *s1, const char *s2)
239
+ {
240
+ int i, j;
241
+ int num_hashes;
242
+ u32 hashes[SPAMSUM_LENGTH];
243
+
244
+ /* there are many possible algorithms for common substring
245
+ detection. In this case I am re-using the rolling hash code
246
+ to act as a filter for possible substring matches */
247
+
248
+ roll_reset();
249
+ memset(hashes, 0, sizeof(hashes));
250
+
251
+ /* first compute the windowed rolling hash at each offset in
252
+ the first string */
253
+ for (i=0;s1[i];i++) {
254
+ hashes[i] = roll_hash((uchar)s1[i]);
255
+ }
256
+ num_hashes = i;
257
+
258
+ roll_reset();
259
+
260
+ /* now for each offset in the second string compute the
261
+ rolling hash and compare it to all of the rolling hashes
262
+ for the first string. If one matches then we have a
263
+ candidate substring match. We then confirm that match with
264
+ a direct string comparison */
265
+ for (i=0;s2[i];i++) {
266
+ u32 h = roll_hash((uchar)s2[i]);
267
+ if (i < ROLLING_WINDOW-1) continue;
268
+ for (j=ROLLING_WINDOW-1;j<num_hashes;j++) {
269
+ if (hashes[j] != 0 && hashes[j] == h) {
270
+ /* we have a potential match - confirm it */
271
+ if (strlen(s2+i-(ROLLING_WINDOW-1)) >= ROLLING_WINDOW &&
272
+ strncmp(s2+i-(ROLLING_WINDOW-1),
273
+ s1+j-(ROLLING_WINDOW-1),
274
+ ROLLING_WINDOW) == 0) {
275
+ return 1;
276
+ }
277
+ }
278
+ }
279
+ }
280
+
281
+ return 0;
282
+ }
283
+
284
+
285
/*
  eliminate sequences of longer than 3 identical characters. These
  sequences contain very little information so they tend to just bias
  the result unfairly

  returns a newly allocated copy of str in which every run of identical
  characters has been cut down to at most 3. The caller must free() the
  result. Returns NULL if the allocation fails
*/
static char *eliminate_sequences(const char *str)
{
	size_t len = strlen(str);
	size_t i, j;
	char *ret;

	ret = malloc(len + 1);
	if (!ret) return NULL;
	memcpy(ret, str, len + 1);

	/* a string of 3 characters or less cannot contain a long run, so
	   the copy is already the answer. Returning here matters: storing
	   the terminator at ret[3] would write past the end of the
	   allocation when the string is shorter than 3 characters */
	if (len <= 3) return ret;

	for (i=j=3;i<len;i++) {
		if (str[i] != str[i-1] ||
		    str[i] != str[i-2] ||
		    str[i] != str[i-3]) {
			ret[j++] = str[i];
		}
	}

	ret[j] = 0;

	return ret;
}
312
+
313
+ /*
314
+ this is the low level string scoring algorithm. It takes two strings
315
+ and scores them on a scale of 0-100 where 0 is a terrible match and
316
+ 100 is a great match. The block_size is used to cope with very small
317
+ messages.
318
+ */
319
+ static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
320
+ {
321
+ u32 score;
322
+ u32 len1, len2;
323
+ int edit_distn(const char *from, int from_len, const char *to, int to_len);
324
+
325
+ len1 = strlen(s1);
326
+ len2 = strlen(s2);
327
+
328
+ if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
329
+ /* not a real spamsum signature? */
330
+ return 0;
331
+ }
332
+
333
+ /* the two strings must have a common substring of length
334
+ ROLLING_WINDOW to be candidates */
335
+ if (has_common_substring(s1, s2) == 0) {
336
+ return 0;
337
+ }
338
+
339
+ /* compute the edit distance between the two strings. The edit distance gives
340
+ us a pretty good idea of how closely related the two strings are */
341
+ score = edit_distn(s1, len1, s2, len2);
342
+
343
+ /* scale the edit distance by the lengths of the two
344
+ strings. This changes the score to be a measure of the
345
+ proportion of the message that has changed rather than an
346
+ absolute quantity. It also copes with the variability of
347
+ the string lengths. */
348
+ score = (score * SPAMSUM_LENGTH) / (len1 + len2);
349
+
350
+ /* at this stage the score occurs roughly on a 0-64 scale,
351
+ * with 0 being a good match and 64 being a complete
352
+ * mismatch */
353
+
354
+ /* rescale to a 0-100 scale (friendlier to humans) */
355
+ score = (100 * score) / 64;
356
+
357
+ /* it is possible to get a score above 100 here, but it is a
358
+ really terrible match */
359
+ if (score >= 100) return 0;
360
+
361
+ /* now re-scale on a 0-100 scale with 0 being a poor match and
362
+ 100 being a excellent match. */
363
+ score = 100 - score;
364
+
365
+ /* when the blocksize is small we may not want to exaggerate the match size */
366
+ // if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2)) {
367
+ // score = block_size/MIN_BLOCKSIZE * MIN(len1, len2);
368
+ // }
369
+
370
+ return score;
371
+ }
372
+
373
+ /*
374
+ given two spamsum strings return a value indicating the degree to which they match.
375
+ */
376
+ u32 spamsum_match(const char *str1, const char *str2)
377
+ {
378
+ u32 block_size1, block_size2;
379
+ u32 score = 0;
380
+ char *s1, *s2;
381
+ char *s1_1, *s1_2;
382
+ char *s2_1, *s2_2;
383
+
384
+ /* each spamsum is prefixed by its block size */
385
+ if (sscanf(str1, "%u:", &block_size1) != 1 ||
386
+ sscanf(str2, "%u:", &block_size2) != 1) {
387
+ return 0;
388
+ }
389
+
390
+ /* if the blocksizes don't match then we are comparing
391
+ apples to oranges ... */
392
+ if (block_size1 != block_size2 &&
393
+ block_size1 != block_size2*2 &&
394
+ block_size2 != block_size1*2) {
395
+ return 0;
396
+ }
397
+
398
+ /* move past the prefix */
399
+ str1 = strchr(str1, ':');
400
+ str2 = strchr(str2, ':');
401
+
402
+ if (!str1 || !str2) {
403
+ /* badly formed ... */
404
+ return 0;
405
+ }
406
+
407
+ /* there is very little information content is sequences of
408
+ the same character like 'LLLLL'. Eliminate any sequences
409
+ longer than 3. This is especially important when combined
410
+ with the has_common_substring() test below. */
411
+ s1 = eliminate_sequences(str1+1);
412
+ s2 = eliminate_sequences(str2+1);
413
+
414
+ if (!s1 || !s2) return -4;
415
+
416
+ /* now break them into the two pieces */
417
+ s1_1 = s1;
418
+ s2_1 = s2;
419
+
420
+ s1_2 = strchr(s1, ':');
421
+ s2_2 = strchr(s2, ':');
422
+
423
+ if (!s1_2 || !s2_2) {
424
+ /* a signature is malformed - it doesn't have 2 parts */
425
+ free(s1); free(s2);
426
+ return 0;
427
+ }
428
+
429
+ *s1_2++ = 0;
430
+ *s2_2++ = 0;
431
+
432
+ /* each signature has a string for two block sizes. We now
433
+ choose how to combine the two block sizes. We checked above
434
+ that they have at least one block size in common */
435
+ if (block_size1 == block_size2) {
436
+ u32 score1, score2;
437
+ score1 = score_strings(s1_1, s2_1, block_size1);
438
+ score2 = score_strings(s1_2, s2_2, block_size2);
439
+ score = MAX(score1, score2);
440
+ } else if (block_size1 == block_size2*2) {
441
+ score = score_strings(s1_1, s2_2, block_size1);
442
+ } else {
443
+ score = score_strings(s1_2, s2_1, block_size2);
444
+ }
445
+
446
+ free(s1);
447
+ free(s2);
448
+
449
+ return score;
450
+ }
451
+
452
+ /*
453
+ return the maximum match for a file containing a list of spamsums
454
+ */
455
+ u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold)
456
+ {
457
+ FILE *f;
458
+ char line[100];
459
+ u32 best = 0;
460
+
461
+ f = fopen(fname, "r");
462
+ if (!f) return 0;
463
+
464
+ /* on each line of the database we compute the spamsum match
465
+ score. We then pick the best score */
466
+ while (fgets(line, sizeof(line)-1, f)) {
467
+ u32 score;
468
+ int len;
469
+ len = strlen(line);
470
+ if (line[len-1] == '\n') line[len-1] = 0;
471
+
472
+ score = spamsum_match(sum, line);
473
+
474
+ if (score > best) {
475
+ best = score;
476
+ if (best >= threshold) break;
477
+ }
478
+ }
479
+
480
+ fclose(f);
481
+
482
+ return best;
483
+ }
484
+
485
+ /*
486
+ return the spamsum on stdin
487
+ */
488
+ static char *spamsum_stdin(u32 flags, u32 block_size)
489
+ {
490
+ uchar buf[10*1024];
491
+ uchar *msg;
492
+ u32 length = 0;
493
+ int n;
494
+ char *sum;
495
+
496
+ msg = malloc(sizeof(buf));
497
+ if (!msg) return NULL;
498
+
499
+ /* load the file, expanding the allocation as needed. */
500
+ while (1) {
501
+ n = read(0, buf, sizeof(buf));
502
+ if (n == -1 && errno == EINTR) continue;
503
+ if (n <= 0) break;
504
+
505
+ msg = realloc(msg, length + n);
506
+ if (!msg) return NULL;
507
+
508
+ memcpy(msg+length, buf, n);
509
+ length += n;
510
+ }
511
+
512
+ sum = spamsum(msg, length, flags, block_size);
513
+
514
+ free(msg);
515
+
516
+ return sum;
517
+ }
518
+
519
+
520
+ /*
521
+ return the spamsum on a file
522
+ */
523
+ char *spamsum_file(const char *fname, u32 flags, u32 block_size)
524
+ {
525
+ int fd;
526
+ char *sum;
527
+ struct stat st;
528
+ uchar *msg;
529
+
530
+ if (strcmp(fname, "-") == 0) {
531
+ return spamsum_stdin(flags, block_size);
532
+ }
533
+
534
+ fd = open(fname, O_RDONLY);
535
+ if (fd == -1) {
536
+ perror(fname);
537
+ return NULL;
538
+ }
539
+
540
+ if (fstat(fd, &st) == -1) {
541
+ perror("fstat");
542
+ return NULL;
543
+ }
544
+
545
+ msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
546
+ if (msg == (uchar *)-1) {
547
+ perror("mmap");
548
+ return NULL;
549
+ }
550
+ close(fd);
551
+
552
+ sum = spamsum(msg, st.st_size, flags, block_size);
553
+
554
+ munmap(msg, st.st_size);
555
+
556
+ return sum;
557
+ }
558
+
559
+ static void show_help(void) /* print the usage text (description, the three call forms, exit codes and options) to stdout with a single printf() */
560
+ {
561
+ printf("\n\
562
+ spamsum v1.1 written by Andrew Tridgell <tridge@samba.org>\n\
563
+ \n\
564
+ spamsum computes a signature string that is particular good for detecting if two emails\n\
565
+ are very similar. This can be used to detect SPAM.\n\
566
+ \n\
567
+ Syntax:\n\
568
+ spamsum [options] <files>\n\
569
+ or\n\
570
+ spamsum [options] -d sigs.txt -c SIG\n\
571
+ or\n\
572
+ spamsum [options] -d sigs.txt -C file\n\
573
+ \n\
574
+ When called with a list of filenames spamsum will write out the\n\
575
+ signatures of each file on a separate line. You can specify the\n\
576
+ filename '-' for standard input.\n\
577
+ \n\
578
+ When called with the second form, spamsum will print the best score\n\
579
+ for the given signature with the signatures in the given database. A\n\
580
+ score of 100 means a perfect match, and a score of 0 means a complete\n\
581
+ mismatch.\n\
582
+ \n\
583
+ When checking, spamsum returns 0 (success) when the message *is* spam,\n\
584
+ 1 for internal errors, and 2 for messages whose signature is not\n\
585
+ found.\n\
586
+ \n\
587
+ The 3rd form is just like the second form, but you pass a file\n\
588
+ containing a message instead of a pre-computed signature.\n\
589
+ \n\
590
+ Options:\n\
591
+ -W ignore whitespace\n\
592
+ -H skip past mail headers\n\
593
+ -B <bsize> force a block size of bsize\n\
594
+ -T <threshold> set the threshold above which spamsum will stop\n\
595
+ looking (default 90)\n\
596
+ ");
597
+ }
598
+
599
+ int main(int argc, char *argv[])
600
+ {
601
+ char *sum;
602
+ extern char *optarg;
603
+ extern int optind;
604
+ int c;
605
+ char *dbname = NULL;
606
+ u32 score;
607
+ int i;
608
+ u32 flags = 0;
609
+ u32 block_size = 0;
610
+ u32 threshold = 90;
611
+
612
+ while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) {
613
+ switch (c) {
614
+ case 'W':
615
+ flags |= FLAG_IGNORE_WHITESPACE;
616
+ break;
617
+
618
+ case 'H':
619
+ flags |= FLAG_IGNORE_HEADERS;
620
+ break;
621
+
622
+ case 'd':
623
+ dbname = optarg;
624
+ break;
625
+
626
+ case 'B':
627
+ block_size = atoi(optarg);
628
+ break;
629
+
630
+ case 'T':
631
+ threshold = atoi(optarg);
632
+ break;
633
+
634
+ case 'c':
635
+ if (!dbname) {
636
+ show_help();
637
+ exit(1);
638
+ }
639
+ score = spamsum_match_db(dbname, optarg,
640
+ threshold);
641
+ printf("%u\n", score);
642
+ exit(score >= threshold ? 0 : 2);
643
+
644
+ case 'C':
645
+ if (!dbname) {
646
+ show_help();
647
+ exit(1);
648
+ }
649
+ score = spamsum_match_db(dbname,
650
+ spamsum_file(optarg, flags,
651
+ block_size),
652
+ threshold);
653
+ printf("%u\n", score);
654
+ exit(score >= threshold ? 0 : 2);
655
+
656
+ case 'h':
657
+ default:
658
+ show_help();
659
+ exit(0);
660
+ }
661
+ }
662
+
663
+ argc -= optind;
664
+ argv += optind;
665
+
666
+ if (argc == 0) {
667
+ show_help();
668
+ return 0;
669
+ }
670
+
671
+ /* compute the spamsum on a list of files */
672
+ for (i=0;i<argc;i++) {
673
+ sum = spamsum_file(argv[i], flags, block_size);
674
+ printf("%s\n", sum);
675
+ free(sum);
676
+ }
677
+
678
+ return 0;
679
+ }
data/ext/spamsum.i ADDED
@@ -0,0 +1,16 @@
1
%include "cpointer.i"
%include "typemaps.i"
%module "spamsum_swig"

/* Map a Ruby String argument onto a (pointer, length) parameter pair.
   StringValuePtr() and RSTRING_LEN() are used because STR2CSTR() and
   direct access to RSTRING(x)->len are not available from Ruby 1.9 on. */
%typemap(in) (char *s1, int s1_len) {
  $1 = StringValuePtr($input);
  $2 = (int) RSTRING_LEN($input);
};
%typemap(in) (char *s2, int s2_len) {
  $1 = StringValuePtr($input);
  $2 = (int) RSTRING_LEN($input);
};

/* Functions exported to Ruby. Each (char *, int) pair of edit_distn() is
   filled in from a single Ruby String by the typemaps above. */
int edit_distn(char *s1, int s1_len, char *s2, int s2_len);
char *spamsum(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
unsigned int spamsum_match(char *s1, char *s2);