jwilkins-spamsum 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/spamsum.c ADDED
@@ -0,0 +1,679 @@
1
+ /*
2
+ this is a checksum routine that is specifically designed for spam.
3
+ Copyright Andrew Tridgell <tridge@samba.org> 2002
4
+
5
+ This code is released under the GNU General Public License version 2
6
+ or later. Alternatively, you may also use this code under the terms
7
+ of the Perl Artistic license.
8
+
9
+ If you wish to distribute this code under the terms of a different
10
+ free software license then please ask me. If there is a good reason
11
+ then I will probably say yes.
12
+
13
+ ---
14
+
15
+ Modified by Russell Keith-Magee, 20 Jan 2009:
16
+ * removed the condition preventing comparison of small block sizes
17
+ (lines 364-366)
18
+ * Modified the help string to be legal cross platform C.
19
+ */
20
+ #include <stdio.h>
21
+ #include <stdlib.h>
22
+ #include <string.h>
23
+ #include <fcntl.h>
24
+ #include <errno.h>
25
+ #include <sys/mman.h>
26
+ #include <sys/stat.h>
27
+ #include <unistd.h>
28
+ #include <ctype.h>
29
+
30
+ /* the output is a string of length 64 in base64 */
30
+ #define SPAMSUM_LENGTH 64
31
+
32
/* smallest block size tried when spamsum() guesses a block size itself */
+ #define MIN_BLOCKSIZE 3
33
/* multiplier and seed used by sum_hash(), the FNV-based non-rolling hash */
+ #define HASH_PRIME 0x01000193
34
+ #define HASH_INIT 0x28021967
35
+
36
/* number of bytes remembered by the rolling hash; also the length of the
   common substring required by has_common_substring() */
+ #define ROLLING_WINDOW 7
37
+
38
/* MIN/MAX are only defined here when no other header has supplied them;
   note that both evaluate their arguments more than once */
+ #ifndef MIN
39
+ #define MIN(a,b) ((a)<(b)?(a):(b))
40
+ #endif
41
+
42
+ #ifndef MAX
43
+ #define MAX(a,b) ((a)>(b)?(a):(b))
44
+ #endif
45
+
46
/* short aliases used throughout this file */
+ typedef unsigned u32;
47
+ typedef unsigned char uchar;
48
+
49
/* bits for the 'flags' argument of spamsum(): skip whitespace characters,
   and/or skip everything up to and including the first blank line */
+ #define FLAG_IGNORE_WHITESPACE 1
50
+ #define FLAG_IGNORE_HEADERS 2
52
+
53
/* State of the rolling hash, shared by roll_hash() and roll_reset().
   'window' keeps the last ROLLING_WINDOW bytes that were fed in, h1/h2/h3
   are the three running components that roll_hash() sums, and 'n' counts
   the bytes fed in since the last roll_reset(). Being a single file-scope
   object, only one rolling hash can be in progress at a time. */
+ static struct {
54
+ uchar window[ROLLING_WINDOW];
55
+ u32 h1, h2, h3;
56
+ u32 n;
57
+ } roll_state;
58
+
59
+ /*
60
+ a rolling hash, based on the Adler checksum. By using a rolling hash
61
+ we can perform auto resynchronisation after inserts/deletes
62
+
63
+ internally, h1 is the sum of the bytes in the window and h2
64
+ is the sum of the bytes times the index
65
+
66
+ h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
67
+ we can cope with large blocksize values
68
+ */
69
+ static inline u32 roll_hash(uchar c)
70
+ {
71
+ roll_state.h2 -= roll_state.h1;
72
+ roll_state.h2 += ROLLING_WINDOW * c;
73
+
74
+ roll_state.h1 += c;
75
+ roll_state.h1 -= roll_state.window[roll_state.n % ROLLING_WINDOW];
76
+
77
+ roll_state.window[roll_state.n % ROLLING_WINDOW] = c;
78
+ roll_state.n++;
79
+
80
+ roll_state.h3 = (roll_state.h3 << 5) & 0xFFFFFFFF;
81
+ roll_state.h3 ^= c;
82
+
83
+ return roll_state.h1 + roll_state.h2 + roll_state.h3;
84
+ }
85
+
86
+ /*
87
+ reset the state of the rolling hash and return the initial rolling hash value
88
+ */
89
+ static u32 roll_reset(void)
90
+ {
91
+ memset(&roll_state, 0, sizeof(roll_state));
92
+ return 0;
93
+ }
94
+
95
+ /* a simple non-rolling hash, based on the FNV hash */
96
+ static inline u32 sum_hash(uchar c, u32 h)
97
+ {
98
+ h *= HASH_PRIME;
99
+ h ^= c;
100
+ return h;
101
+ }
102
+
103
+ /*
104
+ take a message of length 'length' and return a string representing a hash of that message,
105
+ prefixed by the selected blocksize
106
+ */
107
+ char *spamsum(const uchar *in, u32 length, u32 flags, u32 bsize)
108
+ {
109
+ const char *b64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
110
+ char *ret, *p;
111
+ u32 total_chars;
112
+ u32 h, h2, h3;
113
+ u32 j, n, i, k;
114
+ u32 block_size;
115
+ uchar ret2[SPAMSUM_LENGTH/2 + 1];
116
+
117
+ /* if we are ignoring email headers then skip past them now */
118
+ if (flags & FLAG_IGNORE_HEADERS) {
119
+ const uchar *s = strstr(in, "\n\n");
120
+ if (s) {
121
+ length -= (s+2 - in);
122
+ in = s+2;
123
+ }
124
+ }
125
+
126
+ if (flags & FLAG_IGNORE_WHITESPACE) {
127
+ /* count the non-ignored chars */
128
+ for (n=0, i=0; i<length; i++) {
129
+ if (isspace(in[i])) continue;
130
+ n++;
131
+ }
132
+ total_chars = n;
133
+ } else {
134
+ total_chars = length;
135
+ }
136
+
137
+ if (bsize == 0) {
138
+ /* guess a reasonable block size */
139
+ block_size = MIN_BLOCKSIZE;
140
+ while (block_size * SPAMSUM_LENGTH < total_chars) {
141
+ block_size = block_size * 2;
142
+ }
143
+ } else {
144
+ block_size = bsize;
145
+ }
146
+
147
+ ret = malloc(SPAMSUM_LENGTH + SPAMSUM_LENGTH/2 + 20);
148
+ if (!ret) return NULL;
149
+
150
+ again:
151
+ /* the first part of the spamsum signature is the blocksize */
152
+ snprintf(ret, 12, "%u:", block_size);
153
+ p = ret + strlen(ret);
154
+
155
+ memset(p, 0, SPAMSUM_LENGTH+1);
156
+ memset(ret2, 0, sizeof(ret2));
157
+
158
+ k = j = 0;
159
+ h3 = h2 = HASH_INIT;
160
+ h = roll_reset();
161
+
162
+ for (i=0; i<length; i++) {
163
+ if ((flags & FLAG_IGNORE_WHITESPACE) &&
164
+ isspace(in[i])) continue;
165
+
166
+ /*
167
+ at each character we update the rolling hash and
168
+ the normal hash. When the rolling hash hits the
169
+ reset value then we emit the normal hash as a
170
+ element of the signature and reset both hashes
171
+ */
172
+ h = roll_hash(in[i]);
173
+ h2 = sum_hash(in[i], h2);
174
+ h3 = sum_hash(in[i], h3);
175
+
176
+ if (h % block_size == (block_size-1)) {
177
+ /* we have hit a reset point. We now emit a
178
+ hash which is based on all chacaters in the
179
+ piece of the message between the last reset
180
+ point and this one */
181
+ p[j] = b64[h2 % 64];
182
+ if (j < SPAMSUM_LENGTH-1) {
183
+ /* we can have a problem with the tail
184
+ overflowing. The easiest way to
185
+ cope with this is to only reset the
186
+ second hash if we have room for
187
+ more characters in our
188
+ signature. This has the effect of
189
+ combining the last few pieces of
190
+ the message into a single piece */
191
+ h2 = HASH_INIT;
192
+ j++;
193
+ }
194
+ }
195
+
196
+ /* this produces a second signature with a block size
197
+ of block_size*2. By producing dual signatures in
198
+ this way the effect of small changes in the message
199
+ size near a block size boundary is greatly reduced. */
200
+ if (h % (block_size*2) == ((block_size*2)-1)) {
201
+ ret2[k] = b64[h3 % 64];
202
+ if (k < SPAMSUM_LENGTH/2-1) {
203
+ h3 = HASH_INIT;
204
+ k++;
205
+ }
206
+ }
207
+ }
208
+
209
+ /* if we have anything left then add it to the end. This
210
+ ensures that the last part of the message is always
211
+ considered */
212
+ if (h != 0) {
213
+ p[j] = b64[h2 % 64];
214
+ ret2[k] = b64[h3 % 64];
215
+ }
216
+
217
+ strcat(p+j, ":");
218
+ strcat(p+j, ret2);
219
+
220
+ /* our blocksize guess may have been way off - repeat if necessary */
221
+ if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_LENGTH/2) {
222
+ block_size = block_size / 2;
223
+ goto again;
224
+ }
225
+
226
+ return ret;
227
+ }
228
+
229
+
230
+ /*
231
+ we only accept a match if we have at least one common substring in
232
+ the signature of length ROLLING_WINDOW. This dramatically drops the
233
+ false positive rate for low score thresholds while having
234
+ negligable affect on the rate of spam detection.
235
+
236
+ return 1 if the two strings do have a common substring, 0 otherwise
237
+ */
238
+ static int has_common_substring(const char *s1, const char *s2)
239
+ {
240
+ int i, j;
241
+ int num_hashes;
242
+ u32 hashes[SPAMSUM_LENGTH];
243
+
244
+ /* there are many possible algorithms for common substring
245
+ detection. In this case I am re-using the rolling hash code
246
+ to act as a filter for possible substring matches */
247
+
248
+ roll_reset();
249
+ memset(hashes, 0, sizeof(hashes));
250
+
251
+ /* first compute the windowed rolling hash at each offset in
252
+ the first string */
253
+ for (i=0;s1[i];i++) {
254
+ hashes[i] = roll_hash((uchar)s1[i]);
255
+ }
256
+ num_hashes = i;
257
+
258
+ roll_reset();
259
+
260
+ /* now for each offset in the second string compute the
261
+ rolling hash and compare it to all of the rolling hashes
262
+ for the first string. If one matches then we have a
263
+ candidate substring match. We then confirm that match with
264
+ a direct string comparison */
265
+ for (i=0;s2[i];i++) {
266
+ u32 h = roll_hash((uchar)s2[i]);
267
+ if (i < ROLLING_WINDOW-1) continue;
268
+ for (j=ROLLING_WINDOW-1;j<num_hashes;j++) {
269
+ if (hashes[j] != 0 && hashes[j] == h) {
270
+ /* we have a potential match - confirm it */
271
+ if (strlen(s2+i-(ROLLING_WINDOW-1)) >= ROLLING_WINDOW &&
272
+ strncmp(s2+i-(ROLLING_WINDOW-1),
273
+ s1+j-(ROLLING_WINDOW-1),
274
+ ROLLING_WINDOW) == 0) {
275
+ return 1;
276
+ }
277
+ }
278
+ }
279
+ }
280
+
281
+ return 0;
282
+ }
283
+
284
+
285
/*
  eliminate sequences of longer than 3 identical characters. These
  sequences contain very little information so they tend to just bias
  the result unfairly

  Returns a malloc()ed copy of 'str' in which every run of identical
  characters is cut down to at most 3, or NULL if the allocation fails.
  The caller must free() the result. Strings shorter than 3 characters
  are copied unchanged.
*/
static char *eliminate_sequences(const char *str)
{
    const size_t len = strlen(str);
    size_t i, j;
    char *ret;

    ret = malloc(len + 1);
    if (!ret) return NULL;

    /* the first three characters can never complete a run of four, so
       they are always kept; for shorter strings keep what there is */
    j = len < 3 ? len : 3;
    memcpy(ret, str, j);

    for (i = 3; i < len; i++) {
        if (str[i] != str[i-1] ||
            str[i] != str[i-2] ||
            str[i] != str[i-3]) {
            ret[j++] = str[i];
        }
    }

    ret[j] = 0;

    return ret;
}
312
+
313
+ /*
314
+ this is the low level string scoring algorithm. It takes two strings
315
+ and scores them on a scale of 0-100 where 0 is a terrible match and
316
+ 100 is a great match. The block_size is used to cope with very small
317
+ messages.
318
+ */
319
+ static unsigned score_strings(const char *s1, const char *s2, u32 block_size)
320
+ {
321
+ u32 score;
322
+ u32 len1, len2;
323
+ int edit_distn(const char *from, int from_len, const char *to, int to_len);
324
+
325
+ len1 = strlen(s1);
326
+ len2 = strlen(s2);
327
+
328
+ if (len1 > SPAMSUM_LENGTH || len2 > SPAMSUM_LENGTH) {
329
+ /* not a real spamsum signature? */
330
+ return 0;
331
+ }
332
+
333
+ /* the two strings must have a common substring of length
334
+ ROLLING_WINDOW to be candidates */
335
+ if (has_common_substring(s1, s2) == 0) {
336
+ return 0;
337
+ }
338
+
339
+ /* compute the edit distance between the two strings. The edit distance gives
340
+ us a pretty good idea of how closely related the two strings are */
341
+ score = edit_distn(s1, len1, s2, len2);
342
+
343
+ /* scale the edit distance by the lengths of the two
344
+ strings. This changes the score to be a measure of the
345
+ proportion of the message that has changed rather than an
346
+ absolute quantity. It also copes with the variability of
347
+ the string lengths. */
348
+ score = (score * SPAMSUM_LENGTH) / (len1 + len2);
349
+
350
+ /* at this stage the score occurs roughly on a 0-64 scale,
351
+ * with 0 being a good match and 64 being a complete
352
+ * mismatch */
353
+
354
+ /* rescale to a 0-100 scale (friendlier to humans) */
355
+ score = (100 * score) / 64;
356
+
357
+ /* it is possible to get a score above 100 here, but it is a
358
+ really terrible match */
359
+ if (score >= 100) return 0;
360
+
361
+ /* now re-scale on a 0-100 scale with 0 being a poor match and
362
+ 100 being a excellent match. */
363
+ score = 100 - score;
364
+
365
+ /* when the blocksize is small we may not want to exaggerate the match size */
366
+ // if (score > block_size/MIN_BLOCKSIZE * MIN(len1, len2)) {
367
+ // score = block_size/MIN_BLOCKSIZE * MIN(len1, len2);
368
+ // }
369
+
370
+ return score;
371
+ }
372
+
373
+ /*
374
+ given two spamsum strings return a value indicating the degree to which they match.
375
+ */
376
+ u32 spamsum_match(const char *str1, const char *str2)
377
+ {
378
+ u32 block_size1, block_size2;
379
+ u32 score = 0;
380
+ char *s1, *s2;
381
+ char *s1_1, *s1_2;
382
+ char *s2_1, *s2_2;
383
+
384
+ /* each spamsum is prefixed by its block size */
385
+ if (sscanf(str1, "%u:", &block_size1) != 1 ||
386
+ sscanf(str2, "%u:", &block_size2) != 1) {
387
+ return 0;
388
+ }
389
+
390
+ /* if the blocksizes don't match then we are comparing
391
+ apples to oranges ... */
392
+ if (block_size1 != block_size2 &&
393
+ block_size1 != block_size2*2 &&
394
+ block_size2 != block_size1*2) {
395
+ return 0;
396
+ }
397
+
398
+ /* move past the prefix */
399
+ str1 = strchr(str1, ':');
400
+ str2 = strchr(str2, ':');
401
+
402
+ if (!str1 || !str2) {
403
+ /* badly formed ... */
404
+ return 0;
405
+ }
406
+
407
+ /* there is very little information content is sequences of
408
+ the same character like 'LLLLL'. Eliminate any sequences
409
+ longer than 3. This is especially important when combined
410
+ with the has_common_substring() test below. */
411
+ s1 = eliminate_sequences(str1+1);
412
+ s2 = eliminate_sequences(str2+1);
413
+
414
+ if (!s1 || !s2) return -4;
415
+
416
+ /* now break them into the two pieces */
417
+ s1_1 = s1;
418
+ s2_1 = s2;
419
+
420
+ s1_2 = strchr(s1, ':');
421
+ s2_2 = strchr(s2, ':');
422
+
423
+ if (!s1_2 || !s2_2) {
424
+ /* a signature is malformed - it doesn't have 2 parts */
425
+ free(s1); free(s2);
426
+ return 0;
427
+ }
428
+
429
+ *s1_2++ = 0;
430
+ *s2_2++ = 0;
431
+
432
+ /* each signature has a string for two block sizes. We now
433
+ choose how to combine the two block sizes. We checked above
434
+ that they have at least one block size in common */
435
+ if (block_size1 == block_size2) {
436
+ u32 score1, score2;
437
+ score1 = score_strings(s1_1, s2_1, block_size1);
438
+ score2 = score_strings(s1_2, s2_2, block_size2);
439
+ score = MAX(score1, score2);
440
+ } else if (block_size1 == block_size2*2) {
441
+ score = score_strings(s1_1, s2_2, block_size1);
442
+ } else {
443
+ score = score_strings(s1_2, s2_1, block_size2);
444
+ }
445
+
446
+ free(s1);
447
+ free(s2);
448
+
449
+ return score;
450
+ }
451
+
452
+ /*
453
+ return the maximum match for a file containing a list of spamsums
454
+ */
455
+ u32 spamsum_match_db(const char *fname, const char *sum, u32 threshold)
456
+ {
457
+ FILE *f;
458
+ char line[100];
459
+ u32 best = 0;
460
+
461
+ f = fopen(fname, "r");
462
+ if (!f) return 0;
463
+
464
+ /* on each line of the database we compute the spamsum match
465
+ score. We then pick the best score */
466
+ while (fgets(line, sizeof(line)-1, f)) {
467
+ u32 score;
468
+ int len;
469
+ len = strlen(line);
470
+ if (line[len-1] == '\n') line[len-1] = 0;
471
+
472
+ score = spamsum_match(sum, line);
473
+
474
+ if (score > best) {
475
+ best = score;
476
+ if (best >= threshold) break;
477
+ }
478
+ }
479
+
480
+ fclose(f);
481
+
482
+ return best;
483
+ }
484
+
485
+ /*
486
+ return the spamsum on stdin
487
+ */
488
+ static char *spamsum_stdin(u32 flags, u32 block_size)
489
+ {
490
+ uchar buf[10*1024];
491
+ uchar *msg;
492
+ u32 length = 0;
493
+ int n;
494
+ char *sum;
495
+
496
+ msg = malloc(sizeof(buf));
497
+ if (!msg) return NULL;
498
+
499
+ /* load the file, expanding the allocation as needed. */
500
+ while (1) {
501
+ n = read(0, buf, sizeof(buf));
502
+ if (n == -1 && errno == EINTR) continue;
503
+ if (n <= 0) break;
504
+
505
+ msg = realloc(msg, length + n);
506
+ if (!msg) return NULL;
507
+
508
+ memcpy(msg+length, buf, n);
509
+ length += n;
510
+ }
511
+
512
+ sum = spamsum(msg, length, flags, block_size);
513
+
514
+ free(msg);
515
+
516
+ return sum;
517
+ }
518
+
519
+
520
+ /*
521
+ return the spamsum on a file
522
+ */
523
+ char *spamsum_file(const char *fname, u32 flags, u32 block_size)
524
+ {
525
+ int fd;
526
+ char *sum;
527
+ struct stat st;
528
+ uchar *msg;
529
+
530
+ if (strcmp(fname, "-") == 0) {
531
+ return spamsum_stdin(flags, block_size);
532
+ }
533
+
534
+ fd = open(fname, O_RDONLY);
535
+ if (fd == -1) {
536
+ perror(fname);
537
+ return NULL;
538
+ }
539
+
540
+ if (fstat(fd, &st) == -1) {
541
+ perror("fstat");
542
+ return NULL;
543
+ }
544
+
545
+ msg = mmap(NULL, st.st_size, PROT_READ, MAP_FILE|MAP_PRIVATE, fd, 0);
546
+ if (msg == (uchar *)-1) {
547
+ perror("mmap");
548
+ return NULL;
549
+ }
550
+ close(fd);
551
+
552
+ sum = spamsum(msg, st.st_size, flags, block_size);
553
+
554
+ munmap(msg, st.st_size);
555
+
556
+ return sum;
557
+ }
558
+
559
/* Print the built-in usage text (syntax, exit codes and options) to
   standard output with a single printf() call. The text is one string
   literal continued across source lines with backslash-newline, so the
   layout of the following lines is part of the program's output. */
+ static void show_help(void)
560
+ {
561
+ printf("\n\
562
+ spamsum v1.1 written by Andrew Tridgell <tridge@samba.org>\n\
563
+ \n\
564
+ spamsum computes a signature string that is particular good for detecting if two emails\n\
565
+ are very similar. This can be used to detect SPAM.\n\
566
+ \n\
567
+ Syntax:\n\
568
+ spamsum [options] <files>\n\
569
+ or\n\
570
+ spamsum [options] -d sigs.txt -c SIG\n\
571
+ or\n\
572
+ spamsum [options] -d sigs.txt -C file\n\
573
+ \n\
574
+ When called with a list of filenames spamsum will write out the\n\
575
+ signatures of each file on a separate line. You can specify the\n\
576
+ filename '-' for standard input.\n\
577
+ \n\
578
+ When called with the second form, spamsum will print the best score\n\
579
+ for the given signature with the signatures in the given database. A\n\
580
+ score of 100 means a perfect match, and a score of 0 means a complete\n\
581
+ mismatch.\n\
582
+ \n\
583
+ When checking, spamsum returns 0 (success) when the message *is* spam,\n\
584
+ 1 for internal errors, and 2 for messages whose signature is not\n\
585
+ found.\n\
586
+ \n\
587
+ The 3rd form is just like the second form, but you pass a file\n\
588
+ containing a message instead of a pre-computed signature.\n\
589
+ \n\
590
+ Options:\n\
591
+ -W ignore whitespace\n\
592
+ -H skip past mail headers\n\
593
+ -B <bsize> force a block size of bsize\n\
594
+ -T <threshold> set the threshold above which spamsum will stop\n\
595
+ looking (default 90)\n\
596
+ ");
597
+ }
598
+
599
+ int main(int argc, char *argv[])
600
+ {
601
+ char *sum;
602
+ extern char *optarg;
603
+ extern int optind;
604
+ int c;
605
+ char *dbname = NULL;
606
+ u32 score;
607
+ int i;
608
+ u32 flags = 0;
609
+ u32 block_size = 0;
610
+ u32 threshold = 90;
611
+
612
+ while ((c = getopt(argc, argv, "B:WHd:c:C:hT:")) != -1) {
613
+ switch (c) {
614
+ case 'W':
615
+ flags |= FLAG_IGNORE_WHITESPACE;
616
+ break;
617
+
618
+ case 'H':
619
+ flags |= FLAG_IGNORE_HEADERS;
620
+ break;
621
+
622
+ case 'd':
623
+ dbname = optarg;
624
+ break;
625
+
626
+ case 'B':
627
+ block_size = atoi(optarg);
628
+ break;
629
+
630
+ case 'T':
631
+ threshold = atoi(optarg);
632
+ break;
633
+
634
+ case 'c':
635
+ if (!dbname) {
636
+ show_help();
637
+ exit(1);
638
+ }
639
+ score = spamsum_match_db(dbname, optarg,
640
+ threshold);
641
+ printf("%u\n", score);
642
+ exit(score >= threshold ? 0 : 2);
643
+
644
+ case 'C':
645
+ if (!dbname) {
646
+ show_help();
647
+ exit(1);
648
+ }
649
+ score = spamsum_match_db(dbname,
650
+ spamsum_file(optarg, flags,
651
+ block_size),
652
+ threshold);
653
+ printf("%u\n", score);
654
+ exit(score >= threshold ? 0 : 2);
655
+
656
+ case 'h':
657
+ default:
658
+ show_help();
659
+ exit(0);
660
+ }
661
+ }
662
+
663
+ argc -= optind;
664
+ argv += optind;
665
+
666
+ if (argc == 0) {
667
+ show_help();
668
+ return 0;
669
+ }
670
+
671
+ /* compute the spamsum on a list of files */
672
+ for (i=0;i<argc;i++) {
673
+ sum = spamsum_file(argv[i], flags, block_size);
674
+ printf("%s\n", sum);
675
+ free(sum);
676
+ }
677
+
678
+ return 0;
679
+ }
data/ext/spamsum.i ADDED
@@ -0,0 +1,16 @@
1
%include "cpointer.i"
%include "typemaps.i"
%module "spamsum_swig"

/* Map a single Ruby String argument onto a (pointer, length) pair.
   StringValuePtr() and RSTRING_LEN() are used instead of STR2CSTR() and
   RSTRING()->len, which are not available in Ruby 1.9 and later. */
%typemap(in) (char *s1, int s1_len) {
  $1 = StringValuePtr($input);
  $2 = (int) RSTRING_LEN($input);
};
%typemap(in) (char *s2, int s2_len) {
  $1 = StringValuePtr($input);
  $2 = (int) RSTRING_LEN($input);
};

/* functions exported to the scripting language */
int edit_distn(char *s1, int s1_len, char *s2, int s2_len);
char *spamsum(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
unsigned int spamsum_match(char *s1, char *s2);