jwilkins-spamsum 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/ext/spamsum.c +126 -0
- data/ext/spamsum.i +1 -0
- data/ext/spamsum_wrap.c +56 -0
- data/spamsum.gemspec +1 -1
- data/test.rb +27 -22
- metadata +1 -1
data/ext/spamsum.c
CHANGED
@@ -29,6 +29,7 @@
|
|
29
29
|
|
30
30
|
/* the output is a string of length 64 in base64 */
|
31
31
|
#define SPAMSUM_LENGTH 64
|
32
|
+
#define SPAMSUM_HEX_LENGTH 128
|
32
33
|
|
33
34
|
#define MIN_BLOCKSIZE 3
|
34
35
|
#define HASH_PRIME 0x01000193
|
@@ -226,6 +227,131 @@ again:
|
|
226
227
|
return ret;
|
227
228
|
}
|
228
229
|
|
230
|
+
/*
|
231
|
+
take a message of length 'length' and return a string representing a hash of that message,
|
232
|
+
prefixed by the selected blocksize
|
233
|
+
*/
|
234
|
+
char *spamsum_hex(const uchar *in, u32 length, u32 flags, u32 bsize)
|
235
|
+
{
|
236
|
+
char *ret, *p;
|
237
|
+
u32 total_chars;
|
238
|
+
u32 h, h2, h3;
|
239
|
+
u32 j, n, i, k;
|
240
|
+
u32 block_size;
|
241
|
+
uchar ret2[SPAMSUM_HEX_LENGTH/2 + 1];
|
242
|
+
|
243
|
+
/* if we are ignoring email headers then skip past them now */
|
244
|
+
if (flags & FLAG_IGNORE_HEADERS) {
|
245
|
+
const uchar *s = strstr(in, "\n\n");
|
246
|
+
if (s) {
|
247
|
+
length -= (s+2 - in);
|
248
|
+
in = s+2;
|
249
|
+
}
|
250
|
+
}
|
251
|
+
|
252
|
+
if (flags & FLAG_IGNORE_WHITESPACE) {
|
253
|
+
/* count the non-ignored chars */
|
254
|
+
for (n=0, i=0; i<length; i++) {
|
255
|
+
if (isspace(in[i])) continue;
|
256
|
+
n++;
|
257
|
+
}
|
258
|
+
total_chars = n;
|
259
|
+
} else {
|
260
|
+
total_chars = length;
|
261
|
+
}
|
262
|
+
|
263
|
+
if (bsize == 0) {
|
264
|
+
/* guess a reasonable block size */
|
265
|
+
block_size = MIN_BLOCKSIZE;
|
266
|
+
while (block_size * SPAMSUM_HEX_LENGTH < total_chars) {
|
267
|
+
block_size = block_size * 2;
|
268
|
+
}
|
269
|
+
} else {
|
270
|
+
block_size = bsize;
|
271
|
+
}
|
272
|
+
|
273
|
+
ret = malloc(SPAMSUM_HEX_LENGTH + SPAMSUM_HEX_LENGTH/2 + 20);
|
274
|
+
if (!ret) return NULL;
|
275
|
+
|
276
|
+
again:
|
277
|
+
/* the first part of the spamsum signature is the blocksize */
|
278
|
+
snprintf(ret, 12, "%u:", block_size);
|
279
|
+
p = ret + strlen(ret);
|
280
|
+
|
281
|
+
memset(p, 0, SPAMSUM_HEX_LENGTH+1);
|
282
|
+
memset(ret2, 0, sizeof(ret2));
|
283
|
+
|
284
|
+
k = j = 0;
|
285
|
+
h3 = h2 = HASH_INIT;
|
286
|
+
h = roll_reset();
|
287
|
+
|
288
|
+
for (i=0; i<length; i++) {
|
289
|
+
if ((flags & FLAG_IGNORE_WHITESPACE) &&
|
290
|
+
isspace(in[i])) continue;
|
291
|
+
|
292
|
+
/*
|
293
|
+
at each character we update the rolling hash and
|
294
|
+
the normal hash. When the rolling hash hits the
|
295
|
+
reset value then we emit the normal hash as a
|
296
|
+
element of the signature and reset both hashes
|
297
|
+
*/
|
298
|
+
h = roll_hash(in[i]);
|
299
|
+
h2 = sum_hash(in[i], h2);
|
300
|
+
h3 = sum_hash(in[i], h3);
|
301
|
+
|
302
|
+
if (h % block_size == (block_size-1)) {
|
303
|
+
/* we have hit a reset point. We now emit a
|
304
|
+
hash which is based on all chacaters in the
|
305
|
+
piece of the message between the last reset
|
306
|
+
point and this one */
|
307
|
+
snprintf(&p[j], 2, "%02x", h2);
|
308
|
+
if (j < SPAMSUM_HEX_LENGTH-1) {
|
309
|
+
/* we can have a problem with the tail
|
310
|
+
overflowing. The easiest way to
|
311
|
+
cope with this is to only reset the
|
312
|
+
second hash if we have room for
|
313
|
+
more characters in our
|
314
|
+
signature. This has the effect of
|
315
|
+
combining the last few pieces of
|
316
|
+
the message into a single piece */
|
317
|
+
h2 = HASH_INIT;
|
318
|
+
j++;
|
319
|
+
}
|
320
|
+
}
|
321
|
+
|
322
|
+
/* this produces a second signature with a block size
|
323
|
+
of block_size*2. By producing dual signatures in
|
324
|
+
this way the effect of small changes in the message
|
325
|
+
size near a block size boundary is greatly reduced. */
|
326
|
+
if (h % (block_size*2) == ((block_size*2)-1)) {
|
327
|
+
snprintf(&ret2[k], 2, "%02x", h3);
|
328
|
+
if (k < SPAMSUM_HEX_LENGTH/2-1) {
|
329
|
+
h3 = HASH_INIT;
|
330
|
+
k++;
|
331
|
+
}
|
332
|
+
}
|
333
|
+
}
|
334
|
+
|
335
|
+
/* if we have anything left then add it to the end. This
|
336
|
+
ensures that the last part of the message is always
|
337
|
+
considered */
|
338
|
+
if (h != 0) {
|
339
|
+
snprintf(&p[j], 2, "%02x", h2);
|
340
|
+
snprintf(&ret2[k], 2, "%02x", h3);
|
341
|
+
}
|
342
|
+
|
343
|
+
strcat(p+j, ":");
|
344
|
+
strcat(p+j, ret2);
|
345
|
+
|
346
|
+
/* our blocksize guess may have been way off - repeat if necessary */
|
347
|
+
if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_HEX_LENGTH/2) {
|
348
|
+
block_size = block_size / 2;
|
349
|
+
goto again;
|
350
|
+
}
|
351
|
+
|
352
|
+
return ret;
|
353
|
+
}
|
354
|
+
|
229
355
|
|
230
356
|
/*
|
231
357
|
we only accept a match if we have at least one common substring in
|
data/ext/spamsum.i
CHANGED
@@ -13,4 +13,5 @@
|
|
13
13
|
|
14
14
|
int edit_distn(char *s1, int s1_len, char *s2, int s2_len);
|
15
15
|
char *spamsum(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
|
16
|
+
/* spamsum_hex() returns a malloc()ed string: have the generated wrapper
   release it once the value has been converted for the target language */
%newobject spamsum_hex;
char *spamsum_hex(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
|
16
17
|
unsigned int spamsum_match(char *s1, char *s2);
|
data/ext/spamsum_wrap.c
CHANGED
@@ -2087,6 +2087,61 @@ fail:
|
|
2087
2087
|
}
|
2088
2088
|
|
2089
2089
|
|
2090
|
+
SWIGINTERN VALUE
|
2091
|
+
_wrap_spamsum_hex(int argc, VALUE *argv, VALUE self) {
|
2092
|
+
char *arg1 = (char *) 0 ;
|
2093
|
+
unsigned int arg2 ;
|
2094
|
+
unsigned int arg3 = (unsigned int) 0 ;
|
2095
|
+
unsigned int arg4 = (unsigned int) 0 ;
|
2096
|
+
char *result = 0 ;
|
2097
|
+
int res1 ;
|
2098
|
+
char *buf1 = 0 ;
|
2099
|
+
int alloc1 = 0 ;
|
2100
|
+
unsigned int val2 ;
|
2101
|
+
int ecode2 = 0 ;
|
2102
|
+
unsigned int val3 ;
|
2103
|
+
int ecode3 = 0 ;
|
2104
|
+
unsigned int val4 ;
|
2105
|
+
int ecode4 = 0 ;
|
2106
|
+
VALUE vresult = Qnil;
|
2107
|
+
|
2108
|
+
if ((argc < 2) || (argc > 4)) {
|
2109
|
+
rb_raise(rb_eArgError, "wrong # of arguments(%d for 2)",argc); SWIG_fail;
|
2110
|
+
}
|
2111
|
+
res1 = SWIG_AsCharPtrAndSize(argv[0], &buf1, NULL, &alloc1);
|
2112
|
+
if (!SWIG_IsOK(res1)) {
|
2113
|
+
SWIG_exception_fail(SWIG_ArgError(res1), Ruby_Format_TypeError( "", "char *","spamsum_hex", 1, argv[0] ));
|
2114
|
+
}
|
2115
|
+
arg1 = (char *)(buf1);
|
2116
|
+
ecode2 = SWIG_AsVal_unsigned_SS_int(argv[1], &val2);
|
2117
|
+
if (!SWIG_IsOK(ecode2)) {
|
2118
|
+
SWIG_exception_fail(SWIG_ArgError(ecode2), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 2, argv[1] ));
|
2119
|
+
}
|
2120
|
+
arg2 = (unsigned int)(val2);
|
2121
|
+
if (argc > 2) {
|
2122
|
+
ecode3 = SWIG_AsVal_unsigned_SS_int(argv[2], &val3);
|
2123
|
+
if (!SWIG_IsOK(ecode3)) {
|
2124
|
+
SWIG_exception_fail(SWIG_ArgError(ecode3), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 3, argv[2] ));
|
2125
|
+
}
|
2126
|
+
arg3 = (unsigned int)(val3);
|
2127
|
+
}
|
2128
|
+
if (argc > 3) {
|
2129
|
+
ecode4 = SWIG_AsVal_unsigned_SS_int(argv[3], &val4);
|
2130
|
+
if (!SWIG_IsOK(ecode4)) {
|
2131
|
+
SWIG_exception_fail(SWIG_ArgError(ecode4), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 4, argv[3] ));
|
2132
|
+
}
|
2133
|
+
arg4 = (unsigned int)(val4);
|
2134
|
+
}
|
2135
|
+
result = (char *)spamsum_hex(arg1,arg2,arg3,arg4);
|
2136
|
+
vresult = SWIG_FromCharPtr((const char *)result);
|
2137
|
+
if (alloc1 == SWIG_NEWOBJ) free((char*)buf1);
|
2138
|
+
return vresult;
|
2139
|
+
fail:
|
2140
|
+
if (alloc1 == SWIG_NEWOBJ) free((char*)buf1);
|
2141
|
+
return Qnil;
|
2142
|
+
}
|
2143
|
+
|
2144
|
+
|
2090
2145
|
SWIGINTERN VALUE
|
2091
2146
|
_wrap_spamsum_match(int argc, VALUE *argv, VALUE self) {
|
2092
2147
|
char *arg1 = (char *) 0 ;
|
@@ -2400,6 +2455,7 @@ SWIGEXPORT void Init_spamsum_swig(void) {
|
|
2400
2455
|
SWIG_RubyInitializeTrackings();
|
2401
2456
|
rb_define_module_function(mSpamsum_swig, "edit_distn", _wrap_edit_distn, -1);
|
2402
2457
|
rb_define_module_function(mSpamsum_swig, "spamsum", _wrap_spamsum, -1);
|
2458
|
+
rb_define_module_function(mSpamsum_swig, "spamsum_hex", _wrap_spamsum_hex, -1);
|
2403
2459
|
rb_define_module_function(mSpamsum_swig, "spamsum_match", _wrap_spamsum_match, -1);
|
2404
2460
|
}
|
2405
2461
|
|
data/spamsum.gemspec
CHANGED
data/test.rb
CHANGED
@@ -3,31 +3,36 @@ s1 = "And she turned to me and took me by the hand and said, I've lost control a
|
|
3
3
|
s2 = "And she screamed out kicking on her side and said, I've lost control again."
|
4
4
|
s3 = "Control"
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
puts "Spamsum.distance(s1, s3): #{Spamsum.distance(s1, s3)}"
|
11
|
-
puts "-"*40
|
6
|
+
# XXX: seems to be a min length issue for compares, replace with other dist func?
|
7
|
+
b1 = "\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb"
|
8
|
+
b2 = "\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc"
|
9
|
+
b3 = "\x00\xaa\xbb\xcc\xdd\x66\x77\x88\x99\x00\xaa"
|
12
10
|
|
13
|
-
(1..3).each { |x|
|
14
|
-
eval("s = s#{x}")
|
15
11
|
|
16
|
-
|
17
|
-
puts "
|
18
|
-
|
19
|
-
}
|
12
|
+
%w(s b).each { |t|
|
13
|
+
puts "-"*40
|
14
|
+
(1..3).each { |x|
|
15
|
+
puts "#{t}#{x}: #{eval("#{t}#{x}")}" if t == 's'
|
16
|
+
puts "Spamsum.distance(#{t}1, #{t}#{x}): #{Spamsum.distance(eval("#{t}1"), eval("#{t}#{x}"))}"
|
17
|
+
eval("s = #{t}#{x}")
|
18
|
+
puts "-"*20
|
20
19
|
|
21
|
-
|
20
|
+
eval("$#{t}sum#{x} = Spamsum.sum(s)")
|
21
|
+
puts "Spamsum.sum(#{t}#{x}): #{eval("$#{t}sum#{x}")}"
|
22
|
+
puts "Spamsum.match(#{t}1, #{t}#{x}): #{Spamsum.match(eval("$#{t}sum1"), eval("$#{t}sum#{x}"))}"
|
23
|
+
puts "-"*20
|
22
24
|
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
(1..3).each { |x|
|
28
|
-
eval("s = b#{x}")
|
25
|
+
eval("$#{t}hsum#{x} = Spamsum.sum_hex(s)")
|
26
|
+
puts "Spamsum.match(#{t}1, #{t}#{x}): #{Spamsum.match(eval("$#{t}hsum1"), eval("$#{t}hsum#{x}"))}"
|
27
|
+
}
|
28
|
+
}
|
29
29
|
|
30
|
-
|
31
|
-
|
32
|
-
|
30
|
+
puts "-"*40
|
31
|
+
%w(s b).each { |t|
|
32
|
+
(1..3).each { |x|
|
33
|
+
puts "Spamsum.sum_hex(#{t}#{x}): #{eval("$#{t}hsum#{x}")}"
|
34
|
+
}
|
33
35
|
}
|
36
|
+
|
37
|
+
puts "Spamsum.sum_hex(s1*1000): #{Spamsum.sum_hex(s1*1000)}"
|
38
|
+
# Hash a binary fixture. The block form closes the file handle, and "rb"
# reads the bytes without newline or encoding translation.
puts "Spamsum.sum_hex(random.bin): #{Spamsum.sum_hex(File.open('random.bin', 'rb') { |f| f.read })}"
|