jwilkins-spamsum 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
data/ext/spamsum.c CHANGED
@@ -29,6 +29,7 @@
29
29
 
30
30
  /* the output is a string of length 64 in base64 */
31
31
  #define SPAMSUM_LENGTH 64
32
+ #define SPAMSUM_HEX_LENGTH 128
32
33
 
33
34
  #define MIN_BLOCKSIZE 3
34
35
  #define HASH_PRIME 0x01000193
@@ -226,6 +227,131 @@ again:
226
227
  return ret;
227
228
  }
228
229
 
230
+ /*
231
+ take a message of length 'length' and return a string representing a hash of that message,
232
+ prefixed by the selected blocksize
233
+ */
234
+ char *spamsum_hex(const uchar *in, u32 length, u32 flags, u32 bsize)
235
+ {
236
+ char *ret, *p;
237
+ u32 total_chars;
238
+ u32 h, h2, h3;
239
+ u32 j, n, i, k;
240
+ u32 block_size;
241
+ uchar ret2[SPAMSUM_HEX_LENGTH/2 + 1];
242
+
243
+ /* if we are ignoring email headers then skip past them now */
244
+ if (flags & FLAG_IGNORE_HEADERS) {
245
+ const uchar *s = strstr(in, "\n\n");
246
+ if (s) {
247
+ length -= (s+2 - in);
248
+ in = s+2;
249
+ }
250
+ }
251
+
252
+ if (flags & FLAG_IGNORE_WHITESPACE) {
253
+ /* count the non-ignored chars */
254
+ for (n=0, i=0; i<length; i++) {
255
+ if (isspace(in[i])) continue;
256
+ n++;
257
+ }
258
+ total_chars = n;
259
+ } else {
260
+ total_chars = length;
261
+ }
262
+
263
+ if (bsize == 0) {
264
+ /* guess a reasonable block size */
265
+ block_size = MIN_BLOCKSIZE;
266
+ while (block_size * SPAMSUM_HEX_LENGTH < total_chars) {
267
+ block_size = block_size * 2;
268
+ }
269
+ } else {
270
+ block_size = bsize;
271
+ }
272
+
273
+ ret = malloc(SPAMSUM_HEX_LENGTH + SPAMSUM_HEX_LENGTH/2 + 20);
274
+ if (!ret) return NULL;
275
+
276
+ again:
277
+ /* the first part of the spamsum signature is the blocksize */
278
+ snprintf(ret, 12, "%u:", block_size);
279
+ p = ret + strlen(ret);
280
+
281
+ memset(p, 0, SPAMSUM_HEX_LENGTH+1);
282
+ memset(ret2, 0, sizeof(ret2));
283
+
284
+ k = j = 0;
285
+ h3 = h2 = HASH_INIT;
286
+ h = roll_reset();
287
+
288
+ for (i=0; i<length; i++) {
289
+ if ((flags & FLAG_IGNORE_WHITESPACE) &&
290
+ isspace(in[i])) continue;
291
+
292
+ /*
293
+ at each character we update the rolling hash and
294
+ the normal hash. When the rolling hash hits the
295
+ reset value then we emit the normal hash as a
296
+ element of the signature and reset both hashes
297
+ */
298
+ h = roll_hash(in[i]);
299
+ h2 = sum_hash(in[i], h2);
300
+ h3 = sum_hash(in[i], h3);
301
+
302
+ if (h % block_size == (block_size-1)) {
303
+ /* we have hit a reset point. We now emit a
304
+ hash which is based on all characters in the
305
+ piece of the message between the last reset
306
+ point and this one */
307
+ snprintf(&p[j], 2, "%02x", h2);
308
+ if (j < SPAMSUM_HEX_LENGTH-1) {
309
+ /* we can have a problem with the tail
310
+ overflowing. The easiest way to
311
+ cope with this is to only reset the
312
+ second hash if we have room for
313
+ more characters in our
314
+ signature. This has the effect of
315
+ combining the last few pieces of
316
+ the message into a single piece */
317
+ h2 = HASH_INIT;
318
+ j++;
319
+ }
320
+ }
321
+
322
+ /* this produces a second signature with a block size
323
+ of block_size*2. By producing dual signatures in
324
+ this way the effect of small changes in the message
325
+ size near a block size boundary is greatly reduced. */
326
+ if (h % (block_size*2) == ((block_size*2)-1)) {
327
+ snprintf(&ret2[k], 2, "%02x", h3);
328
+ if (k < SPAMSUM_HEX_LENGTH/2-1) {
329
+ h3 = HASH_INIT;
330
+ k++;
331
+ }
332
+ }
333
+ }
334
+
335
+ /* if we have anything left then add it to the end. This
336
+ ensures that the last part of the message is always
337
+ considered */
338
+ if (h != 0) {
339
+ snprintf(&p[j], 2, "%02x", h2);
340
+ snprintf(&ret2[k], 2, "%02x", h3);
341
+ }
342
+
343
+ strcat(p+j, ":");
344
+ strcat(p+j, ret2);
345
+
346
+ /* our blocksize guess may have been way off - repeat if necessary */
347
+ if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_HEX_LENGTH/2) {
348
+ block_size = block_size / 2;
349
+ goto again;
350
+ }
351
+
352
+ return ret;
353
+ }
354
+
229
355
 
230
356
  /*
231
357
  we only accept a match if we have at least one common substring in
data/ext/spamsum.i CHANGED
@@ -13,4 +13,5 @@
13
13
 
14
14
  int edit_distn(char *s1, int s1_len, char *s2, int s2_len);
15
15
  char *spamsum(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
16
+ char *spamsum_hex(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
16
17
  unsigned int spamsum_match(char *s1, char *s2);
data/ext/spamsum_wrap.c CHANGED
@@ -2087,6 +2087,61 @@ fail:
2087
2087
  }
2088
2088
 
2089
2089
 
2090
+ SWIGINTERN VALUE
2091
+ _wrap_spamsum_hex(int argc, VALUE *argv, VALUE self) {
2092
+ char *arg1 = (char *) 0 ;
2093
+ unsigned int arg2 ;
2094
+ unsigned int arg3 = (unsigned int) 0 ;
2095
+ unsigned int arg4 = (unsigned int) 0 ;
2096
+ char *result = 0 ;
2097
+ int res1 ;
2098
+ char *buf1 = 0 ;
2099
+ int alloc1 = 0 ;
2100
+ unsigned int val2 ;
2101
+ int ecode2 = 0 ;
2102
+ unsigned int val3 ;
2103
+ int ecode3 = 0 ;
2104
+ unsigned int val4 ;
2105
+ int ecode4 = 0 ;
2106
+ VALUE vresult = Qnil;
2107
+
2108
+ if ((argc < 2) || (argc > 4)) {
2109
+ rb_raise(rb_eArgError, "wrong # of arguments(%d for 2)",argc); SWIG_fail;
2110
+ }
2111
+ res1 = SWIG_AsCharPtrAndSize(argv[0], &buf1, NULL, &alloc1);
2112
+ if (!SWIG_IsOK(res1)) {
2113
+ SWIG_exception_fail(SWIG_ArgError(res1), Ruby_Format_TypeError( "", "char *","spamsum_hex", 1, argv[0] ));
2114
+ }
2115
+ arg1 = (char *)(buf1);
2116
+ ecode2 = SWIG_AsVal_unsigned_SS_int(argv[1], &val2);
2117
+ if (!SWIG_IsOK(ecode2)) {
2118
+ SWIG_exception_fail(SWIG_ArgError(ecode2), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 2, argv[1] ));
2119
+ }
2120
+ arg2 = (unsigned int)(val2);
2121
+ if (argc > 2) {
2122
+ ecode3 = SWIG_AsVal_unsigned_SS_int(argv[2], &val3);
2123
+ if (!SWIG_IsOK(ecode3)) {
2124
+ SWIG_exception_fail(SWIG_ArgError(ecode3), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 3, argv[2] ));
2125
+ }
2126
+ arg3 = (unsigned int)(val3);
2127
+ }
2128
+ if (argc > 3) {
2129
+ ecode4 = SWIG_AsVal_unsigned_SS_int(argv[3], &val4);
2130
+ if (!SWIG_IsOK(ecode4)) {
2131
+ SWIG_exception_fail(SWIG_ArgError(ecode4), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 4, argv[3] ));
2132
+ }
2133
+ arg4 = (unsigned int)(val4);
2134
+ }
2135
+ result = (char *)spamsum_hex(arg1,arg2,arg3,arg4);
2136
+ vresult = SWIG_FromCharPtr((const char *)result);
2137
+ if (alloc1 == SWIG_NEWOBJ) free((char*)buf1);
2138
+ return vresult;
2139
+ fail:
2140
+ if (alloc1 == SWIG_NEWOBJ) free((char*)buf1);
2141
+ return Qnil;
2142
+ }
2143
+
2144
+
2090
2145
  SWIGINTERN VALUE
2091
2146
  _wrap_spamsum_match(int argc, VALUE *argv, VALUE self) {
2092
2147
  char *arg1 = (char *) 0 ;
@@ -2400,6 +2455,7 @@ SWIGEXPORT void Init_spamsum_swig(void) {
2400
2455
  SWIG_RubyInitializeTrackings();
2401
2456
  rb_define_module_function(mSpamsum_swig, "edit_distn", _wrap_edit_distn, -1);
2402
2457
  rb_define_module_function(mSpamsum_swig, "spamsum", _wrap_spamsum, -1);
2458
+ rb_define_module_function(mSpamsum_swig, "spamsum_hex", _wrap_spamsum_hex, -1);
2403
2459
  rb_define_module_function(mSpamsum_swig, "spamsum_match", _wrap_spamsum_match, -1);
2404
2460
  }
2405
2461
 
data/spamsum.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{spamsum}
5
- s.version = "0.1.1"
5
+ s.version = "0.1.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jonathan Wilkins"]
data/test.rb CHANGED
@@ -3,31 +3,36 @@ s1 = "And she turned to me and took me by the hand and said, I've lost control a
3
3
  s2 = "And she screamed out kicking on her side and said, I've lost control again."
4
4
  s3 = "Control"
5
5
 
6
- puts "s1: #{s1}"
7
- puts "s2: #{s2}"
8
- puts "s3: #{s3}"
9
- puts "Spamsum.distance(s1, s2): #{Spamsum.distance(s1, s2)}"
10
- puts "Spamsum.distance(s1, s3): #{Spamsum.distance(s1, s3)}"
11
- puts "-"*40
6
+ # XXX: seems to be a min length issue for compares, replace with other dist func?
7
+ b1 = "\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb"
8
+ b2 = "\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc"
9
+ b3 = "\x00\xaa\xbb\xcc\xdd\x66\x77\x88\x99\x00\xaa"
12
10
 
13
- (1..3).each { |x|
14
- eval("s = s#{x}")
15
11
 
16
- eval("$sum#{x} = Spamsum.sum(s, s.length)")
17
- puts "Spamsum.sum(s#{x}): #{eval("$sum#{x}")}"
18
- puts "Spamsum.match(s1, s#{x}): #{Spamsum.match($sum1, eval("$sum#{x}"))}"
19
- }
12
+ %w(s b).each { |t|
13
+ puts "-"*40
14
+ (1..3).each { |x|
15
+ puts "#{t}#{x}: #{eval("#{t}#{x}")}" if t == 's'
16
+ puts "Spamsum.distance(#{t}1, #{t}#{x}): #{Spamsum.distance(eval("#{t}1"), eval("#{t}#{x}"))}"
17
+ eval("s = #{t}#{x}")
18
+ puts "-"*20
20
19
 
21
- puts "-"*40
20
+ eval("$#{t}sum#{x} = Spamsum.sum(s)")
21
+ puts "Spamsum.sum(#{t}#{x}): #{eval("$#{t}sum#{x}")}"
22
+ puts "Spamsum.match(#{t}1, #{t}#{x}): #{Spamsum.match(eval("$#{t}sum1"), eval("$#{t}sum#{x}"))}"
23
+ puts "-"*20
22
24
 
23
- # XXX: seems to be a min length issue for compares, replace with other dist func?
24
- b1 = "\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb"
25
- b2 = "\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc"
26
- b3 = "\x00\xaa\xbb\xcc\xdd\x66\x77\x88\x99\x00\xaa"
27
- (1..3).each { |x|
28
- eval("s = b#{x}")
25
+ eval("$#{t}hsum#{x} = Spamsum.sum_hex(s)")
26
+ puts "Spamsum.match(#{t}1, #{t}#{x}): #{Spamsum.match(eval("$#{t}hsum1"), eval("$#{t}hsum#{x}"))}"
27
+ }
28
+ }
29
29
 
30
- eval("$bsum#{x} = Spamsum.sum(s, s.length)")
31
- puts "Spamsum.sum(b#{x}): #{eval("$bsum#{x}")}"
32
- puts "Spamsum.match(b1, b#{x}): #{Spamsum.match($bsum1, eval("$bsum#{x}"))}"
30
+ puts "-"*40
31
+ %w(s b).each { |t|
32
+ (1..3).each { |x|
33
+ puts "Spamsum.sum_hex(#{t}#{x}): #{eval("$#{t}hsum#{x}")}"
34
+ }
33
35
  }
36
+
37
+ puts "Spamsum.sum_hex(s1*1000): #{Spamsum.sum_hex(s1*1000)}"
38
+ puts "Spamsum.sum_hex(s1*1000): #{Spamsum.sum_hex(open('random.bin').read)}"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jwilkins-spamsum
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Wilkins