jwilkins-spamsum 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/ext/spamsum.c +126 -0
- data/ext/spamsum.i +1 -0
- data/ext/spamsum_wrap.c +56 -0
- data/spamsum.gemspec +1 -1
- data/test.rb +27 -22
- metadata +1 -1
data/ext/spamsum.c
CHANGED
@@ -29,6 +29,7 @@
|
|
29
29
|
|
30
30
|
/* the output is a string of length 64 in base64 */
|
31
31
|
#define SPAMSUM_LENGTH 64
|
32
|
+
#define SPAMSUM_HEX_LENGTH 128
|
32
33
|
|
33
34
|
#define MIN_BLOCKSIZE 3
|
34
35
|
#define HASH_PRIME 0x01000193
|
@@ -226,6 +227,131 @@ again:
|
|
226
227
|
return ret;
|
227
228
|
}
|
228
229
|
|
230
|
+
/*
|
231
|
+
take a message of length 'length' and return a string representing a hash of that message,
|
232
|
+
prefixed by the selected blocksize
|
233
|
+
*/
|
234
|
+
char *spamsum_hex(const uchar *in, u32 length, u32 flags, u32 bsize)
|
235
|
+
{
|
236
|
+
char *ret, *p;
|
237
|
+
u32 total_chars;
|
238
|
+
u32 h, h2, h3;
|
239
|
+
u32 j, n, i, k;
|
240
|
+
u32 block_size;
|
241
|
+
uchar ret2[SPAMSUM_HEX_LENGTH/2 + 1];
|
242
|
+
|
243
|
+
/* if we are ignoring email headers then skip past them now */
|
244
|
+
if (flags & FLAG_IGNORE_HEADERS) {
|
245
|
+
const uchar *s = strstr(in, "\n\n");
|
246
|
+
if (s) {
|
247
|
+
length -= (s+2 - in);
|
248
|
+
in = s+2;
|
249
|
+
}
|
250
|
+
}
|
251
|
+
|
252
|
+
if (flags & FLAG_IGNORE_WHITESPACE) {
|
253
|
+
/* count the non-ignored chars */
|
254
|
+
for (n=0, i=0; i<length; i++) {
|
255
|
+
if (isspace(in[i])) continue;
|
256
|
+
n++;
|
257
|
+
}
|
258
|
+
total_chars = n;
|
259
|
+
} else {
|
260
|
+
total_chars = length;
|
261
|
+
}
|
262
|
+
|
263
|
+
if (bsize == 0) {
|
264
|
+
/* guess a reasonable block size */
|
265
|
+
block_size = MIN_BLOCKSIZE;
|
266
|
+
while (block_size * SPAMSUM_HEX_LENGTH < total_chars) {
|
267
|
+
block_size = block_size * 2;
|
268
|
+
}
|
269
|
+
} else {
|
270
|
+
block_size = bsize;
|
271
|
+
}
|
272
|
+
|
273
|
+
ret = malloc(SPAMSUM_HEX_LENGTH + SPAMSUM_HEX_LENGTH/2 + 20);
|
274
|
+
if (!ret) return NULL;
|
275
|
+
|
276
|
+
again:
|
277
|
+
/* the first part of the spamsum signature is the blocksize */
|
278
|
+
snprintf(ret, 12, "%u:", block_size);
|
279
|
+
p = ret + strlen(ret);
|
280
|
+
|
281
|
+
memset(p, 0, SPAMSUM_HEX_LENGTH+1);
|
282
|
+
memset(ret2, 0, sizeof(ret2));
|
283
|
+
|
284
|
+
k = j = 0;
|
285
|
+
h3 = h2 = HASH_INIT;
|
286
|
+
h = roll_reset();
|
287
|
+
|
288
|
+
for (i=0; i<length; i++) {
|
289
|
+
if ((flags & FLAG_IGNORE_WHITESPACE) &&
|
290
|
+
isspace(in[i])) continue;
|
291
|
+
|
292
|
+
/*
|
293
|
+
at each character we update the rolling hash and
|
294
|
+
the normal hash. When the rolling hash hits the
|
295
|
+
reset value then we emit the normal hash as a
|
296
|
+
element of the signature and reset both hashes
|
297
|
+
*/
|
298
|
+
h = roll_hash(in[i]);
|
299
|
+
h2 = sum_hash(in[i], h2);
|
300
|
+
h3 = sum_hash(in[i], h3);
|
301
|
+
|
302
|
+
if (h % block_size == (block_size-1)) {
|
303
|
+
/* we have hit a reset point. We now emit a
|
304
|
+
hash which is based on all chacaters in the
|
305
|
+
piece of the message between the last reset
|
306
|
+
point and this one */
|
307
|
+
snprintf(&p[j], 2, "%02x", h2);
|
308
|
+
if (j < SPAMSUM_HEX_LENGTH-1) {
|
309
|
+
/* we can have a problem with the tail
|
310
|
+
overflowing. The easiest way to
|
311
|
+
cope with this is to only reset the
|
312
|
+
second hash if we have room for
|
313
|
+
more characters in our
|
314
|
+
signature. This has the effect of
|
315
|
+
combining the last few pieces of
|
316
|
+
the message into a single piece */
|
317
|
+
h2 = HASH_INIT;
|
318
|
+
j++;
|
319
|
+
}
|
320
|
+
}
|
321
|
+
|
322
|
+
/* this produces a second signature with a block size
|
323
|
+
of block_size*2. By producing dual signatures in
|
324
|
+
this way the effect of small changes in the message
|
325
|
+
size near a block size boundary is greatly reduced. */
|
326
|
+
if (h % (block_size*2) == ((block_size*2)-1)) {
|
327
|
+
snprintf(&ret2[k], 2, "%02x", h3);
|
328
|
+
if (k < SPAMSUM_HEX_LENGTH/2-1) {
|
329
|
+
h3 = HASH_INIT;
|
330
|
+
k++;
|
331
|
+
}
|
332
|
+
}
|
333
|
+
}
|
334
|
+
|
335
|
+
/* if we have anything left then add it to the end. This
|
336
|
+
ensures that the last part of the message is always
|
337
|
+
considered */
|
338
|
+
if (h != 0) {
|
339
|
+
snprintf(&p[j], 2, "%02x", h2);
|
340
|
+
snprintf(&ret2[k], 2, "%02x", h3);
|
341
|
+
}
|
342
|
+
|
343
|
+
strcat(p+j, ":");
|
344
|
+
strcat(p+j, ret2);
|
345
|
+
|
346
|
+
/* our blocksize guess may have been way off - repeat if necessary */
|
347
|
+
if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_HEX_LENGTH/2) {
|
348
|
+
block_size = block_size / 2;
|
349
|
+
goto again;
|
350
|
+
}
|
351
|
+
|
352
|
+
return ret;
|
353
|
+
}
|
354
|
+
|
229
355
|
|
230
356
|
/*
|
231
357
|
we only accept a match if we have at least one common substring in
|
data/ext/spamsum.i
CHANGED
@@ -13,4 +13,5 @@
|
|
13
13
|
|
14
14
|
int edit_distn(char *s1, int s1_len, char *s2, int s2_len);
|
15
15
|
char *spamsum(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
|
16
|
+
char *spamsum_hex(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
|
16
17
|
unsigned int spamsum_match(char *s1, char *s2);
|
data/ext/spamsum_wrap.c
CHANGED
@@ -2087,6 +2087,61 @@ fail:
|
|
2087
2087
|
}
|
2088
2088
|
|
2089
2089
|
|
2090
|
+
SWIGINTERN VALUE
|
2091
|
+
_wrap_spamsum_hex(int argc, VALUE *argv, VALUE self) {
|
2092
|
+
char *arg1 = (char *) 0 ;
|
2093
|
+
unsigned int arg2 ;
|
2094
|
+
unsigned int arg3 = (unsigned int) 0 ;
|
2095
|
+
unsigned int arg4 = (unsigned int) 0 ;
|
2096
|
+
char *result = 0 ;
|
2097
|
+
int res1 ;
|
2098
|
+
char *buf1 = 0 ;
|
2099
|
+
int alloc1 = 0 ;
|
2100
|
+
unsigned int val2 ;
|
2101
|
+
int ecode2 = 0 ;
|
2102
|
+
unsigned int val3 ;
|
2103
|
+
int ecode3 = 0 ;
|
2104
|
+
unsigned int val4 ;
|
2105
|
+
int ecode4 = 0 ;
|
2106
|
+
VALUE vresult = Qnil;
|
2107
|
+
|
2108
|
+
if ((argc < 2) || (argc > 4)) {
|
2109
|
+
rb_raise(rb_eArgError, "wrong # of arguments(%d for 2)",argc); SWIG_fail;
|
2110
|
+
}
|
2111
|
+
res1 = SWIG_AsCharPtrAndSize(argv[0], &buf1, NULL, &alloc1);
|
2112
|
+
if (!SWIG_IsOK(res1)) {
|
2113
|
+
SWIG_exception_fail(SWIG_ArgError(res1), Ruby_Format_TypeError( "", "char *","spamsum_hex", 1, argv[0] ));
|
2114
|
+
}
|
2115
|
+
arg1 = (char *)(buf1);
|
2116
|
+
ecode2 = SWIG_AsVal_unsigned_SS_int(argv[1], &val2);
|
2117
|
+
if (!SWIG_IsOK(ecode2)) {
|
2118
|
+
SWIG_exception_fail(SWIG_ArgError(ecode2), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 2, argv[1] ));
|
2119
|
+
}
|
2120
|
+
arg2 = (unsigned int)(val2);
|
2121
|
+
if (argc > 2) {
|
2122
|
+
ecode3 = SWIG_AsVal_unsigned_SS_int(argv[2], &val3);
|
2123
|
+
if (!SWIG_IsOK(ecode3)) {
|
2124
|
+
SWIG_exception_fail(SWIG_ArgError(ecode3), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 3, argv[2] ));
|
2125
|
+
}
|
2126
|
+
arg3 = (unsigned int)(val3);
|
2127
|
+
}
|
2128
|
+
if (argc > 3) {
|
2129
|
+
ecode4 = SWIG_AsVal_unsigned_SS_int(argv[3], &val4);
|
2130
|
+
if (!SWIG_IsOK(ecode4)) {
|
2131
|
+
SWIG_exception_fail(SWIG_ArgError(ecode4), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 4, argv[3] ));
|
2132
|
+
}
|
2133
|
+
arg4 = (unsigned int)(val4);
|
2134
|
+
}
|
2135
|
+
result = (char *)spamsum_hex(arg1,arg2,arg3,arg4);
|
2136
|
+
vresult = SWIG_FromCharPtr((const char *)result);
|
2137
|
+
if (alloc1 == SWIG_NEWOBJ) free((char*)buf1);
|
2138
|
+
return vresult;
|
2139
|
+
fail:
|
2140
|
+
if (alloc1 == SWIG_NEWOBJ) free((char*)buf1);
|
2141
|
+
return Qnil;
|
2142
|
+
}
|
2143
|
+
|
2144
|
+
|
2090
2145
|
SWIGINTERN VALUE
|
2091
2146
|
_wrap_spamsum_match(int argc, VALUE *argv, VALUE self) {
|
2092
2147
|
char *arg1 = (char *) 0 ;
|
@@ -2400,6 +2455,7 @@ SWIGEXPORT void Init_spamsum_swig(void) {
|
|
2400
2455
|
SWIG_RubyInitializeTrackings();
|
2401
2456
|
rb_define_module_function(mSpamsum_swig, "edit_distn", _wrap_edit_distn, -1);
|
2402
2457
|
rb_define_module_function(mSpamsum_swig, "spamsum", _wrap_spamsum, -1);
|
2458
|
+
rb_define_module_function(mSpamsum_swig, "spamsum_hex", _wrap_spamsum_hex, -1);
|
2403
2459
|
rb_define_module_function(mSpamsum_swig, "spamsum_match", _wrap_spamsum_match, -1);
|
2404
2460
|
}
|
2405
2461
|
|
data/spamsum.gemspec
CHANGED
data/test.rb
CHANGED
@@ -3,31 +3,36 @@ s1 = "And she turned to me and took me by the hand and said, I've lost control a
|
|
3
3
|
s2 = "And she screamed out kicking on her side and said, I've lost control again."
|
4
4
|
s3 = "Control"
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
puts "Spamsum.distance(s1, s3): #{Spamsum.distance(s1, s3)}"
|
11
|
-
puts "-"*40
|
6
|
+
# XXX: seems to be a min length issue for compares, replace with other dist func?
|
7
|
+
b1 = "\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb"
|
8
|
+
b2 = "\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc"
|
9
|
+
b3 = "\x00\xaa\xbb\xcc\xdd\x66\x77\x88\x99\x00\xaa"
|
12
10
|
|
13
|
-
(1..3).each { |x|
|
14
|
-
eval("s = s#{x}")
|
15
11
|
|
16
|
-
|
17
|
-
puts "
|
18
|
-
|
19
|
-
}
|
12
|
+
%w(s b).each { |t|
|
13
|
+
puts "-"*40
|
14
|
+
(1..3).each { |x|
|
15
|
+
puts "#{t}#{x}: #{eval("#{t}#{x}")}" if t == 's'
|
16
|
+
puts "Spamsum.distance(#{t}1, #{t}#{x}): #{Spamsum.distance(eval("#{t}1"), eval("#{t}#{x}"))}"
|
17
|
+
eval("s = #{t}#{x}")
|
18
|
+
puts "-"*20
|
20
19
|
|
21
|
-
|
20
|
+
eval("$#{t}sum#{x} = Spamsum.sum(s)")
|
21
|
+
puts "Spamsum.sum(#{t}#{x}): #{eval("$#{t}sum#{x}")}"
|
22
|
+
puts "Spamsum.match(#{t}1, #{t}#{x}): #{Spamsum.match(eval("$#{t}sum1"), eval("$#{t}sum#{x}"))}"
|
23
|
+
puts "-"*20
|
22
24
|
|
23
|
-
#
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
(1..3).each { |x|
|
28
|
-
eval("s = b#{x}")
|
25
|
+
eval("$#{t}hsum#{x} = Spamsum.sum_hex(s)")
|
26
|
+
puts "Spamsum.match(#{t}1, #{t}#{x}): #{Spamsum.match(eval("$#{t}hsum1"), eval("$#{t}hsum#{x}"))}"
|
27
|
+
}
|
28
|
+
}
|
29
29
|
|
30
|
-
|
31
|
-
|
32
|
-
|
30
|
+
puts "-"*40
|
31
|
+
%w(s b).each { |t|
|
32
|
+
(1..3).each { |x|
|
33
|
+
puts "Spamsum.sum_hex(#{t}#{x}): #{eval("$#{t}hsum#{x}")}"
|
34
|
+
}
|
33
35
|
}
|
36
|
+
|
37
|
+
# Hash a long repeated string, then the contents of the random.bin fixture.
puts "Spamsum.sum_hex(s1*1000): #{Spamsum.sum_hex(s1*1000)}"
# The fixture is read in binary mode inside a block so the bytes are not
# altered by text-mode handling and the file handle is closed again. The
# label names the file that is actually being hashed.
puts "Spamsum.sum_hex(random.bin): #{Spamsum.sum_hex(File.open('random.bin', 'rb') { |f| f.read })}"
|