jwilkins-spamsum 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/spamsum.c CHANGED
@@ -29,6 +29,7 @@
29
29
 
30
30
  /* the output is a string of length 64 in base64 */
31
31
  #define SPAMSUM_LENGTH 64
32
+ #define SPAMSUM_HEX_LENGTH 128
32
33
 
33
34
  #define MIN_BLOCKSIZE 3
34
35
  #define HASH_PRIME 0x01000193
@@ -226,6 +227,131 @@ again:
226
227
  return ret;
227
228
  }
228
229
 
230
+ /*
231
+ Compute a spamsum-style signature over the 'length' bytes at 'in' and return it as a malloc'd, NUL-terminated string "<blocksize>:<sig1>:<sig2>" (NULL if malloc fails), where sig2 is built with a block size of blocksize*2.
232
+ flags: FLAG_IGNORE_HEADERS skips everything up to and including the first "\n\n"; FLAG_IGNORE_WHITESPACE skips bytes for which isspace() is true. bsize: 0 picks a block size automatically, any other value is used as the block size. This function never frees the returned buffer.
233
+ */
234
+ char *spamsum_hex(const uchar *in, u32 length, u32 flags, u32 bsize)
235
+ {
236
+ char *ret, *p;
237
+ u32 total_chars;
238
+ u32 h, h2, h3;
239
+ u32 j, n, i, k;
240
+ u32 block_size;
241
+ uchar ret2[SPAMSUM_HEX_LENGTH/2 + 1]; /* scratch buffer for the second (block_size*2) signature plus its NUL */
242
+
243
+ /* if we are ignoring email headers then skip past them now */
244
+ if (flags & FLAG_IGNORE_HEADERS) {
245
+ const uchar *s = strstr(in, "\n\n"); /* NOTE(review): strstr() stops at the first NUL and is not bounded by 'length'; this assumes 'in' is NUL-terminated and that the match lies inside the first 'length' bytes - confirm against callers */
246
+ if (s) {
247
+ length -= (s+2 - in); /* drop the skipped header bytes (including the blank line) from the length */
248
+ in = s+2;
249
+ }
250
+ }
251
+
252
+ if (flags & FLAG_IGNORE_WHITESPACE) {
253
+ /* count the non-ignored chars */
254
+ for (n=0, i=0; i<length; i++) {
255
+ if (isspace(in[i])) continue;
256
+ n++;
257
+ }
258
+ total_chars = n;
259
+ } else {
260
+ total_chars = length;
261
+ }
262
+
263
+ if (bsize == 0) {
264
+ /* guess a reasonable block size */
265
+ block_size = MIN_BLOCKSIZE;
266
+ while (block_size * SPAMSUM_HEX_LENGTH < total_chars) { /* keep doubling until block_size * SPAMSUM_HEX_LENGTH reaches total_chars */
267
+ block_size = block_size * 2;
268
+ }
269
+ } else {
270
+ block_size = bsize; /* caller-supplied block size is used unchanged; NOTE(review): a value large enough for block_size*2 to wrap to 0 would make the modulo below divide by zero */
271
+ }
272
+
273
+ ret = malloc(SPAMSUM_HEX_LENGTH + SPAMSUM_HEX_LENGTH/2 + 20); /* room for both signatures plus 20 bytes for the "blocksize:" prefix, the ':' separator and the NUL */
274
+ if (!ret) return NULL;
275
+
276
+ again:
277
+ /* the first part of the spamsum signature is the blocksize */
278
+ snprintf(ret, 12, "%u:", block_size);
279
+ p = ret + strlen(ret); /* p marks where the first signature starts, right after the prefix */
280
+
281
+ memset(p, 0, SPAMSUM_HEX_LENGTH+1); /* zero the first-signature area so it is NUL-terminated whatever gets written */
282
+ memset(ret2, 0, sizeof(ret2));
283
+
284
+ k = j = 0;
285
+ h3 = h2 = HASH_INIT;
286
+ h = roll_reset(); /* presumably resets the rolling-hash state; helper is defined outside this block */
287
+
288
+ for (i=0; i<length; i++) {
289
+ if ((flags & FLAG_IGNORE_WHITESPACE) &&
290
+ isspace(in[i])) continue;
291
+
292
+ /*
293
+ at each character we update the rolling hash and
294
+ the normal hash. When the rolling hash hits the
295
+ reset value then we emit the normal hash as a
296
+ element of the signature and reset both hashes
297
+ */
298
+ h = roll_hash(in[i]);
299
+ h2 = sum_hash(in[i], h2);
300
+ h3 = sum_hash(in[i], h3);
301
+
302
+ if (h % block_size == (block_size-1)) {
303
+ /* we have hit a reset point. We now emit a
304
+ hash which is based on all characters in the
305
+ piece of the message between the last reset
306
+ point and this one */
307
+ snprintf(&p[j], 2, "%02x", h2); /* NOTE(review): a size of 2 leaves room for one character plus the NUL, so only the first hex digit of h2 is stored and j advances by one per piece - confirm whether two digits per piece were intended */
308
+ if (j < SPAMSUM_HEX_LENGTH-1) {
309
+ /* we can have a problem with the tail
310
+ overflowing. The easiest way to
311
+ cope with this is to only reset the
312
+ second hash if we have room for
313
+ more characters in our
314
+ signature. This has the effect of
315
+ combining the last few pieces of
316
+ the message into a single piece */
317
+ h2 = HASH_INIT;
318
+ j++;
319
+ }
320
+ }
321
+
322
+ /* this produces a second signature with a block size
323
+ of block_size*2. By producing dual signatures in
324
+ this way the effect of small changes in the message
325
+ size near a block size boundary is greatly reduced. */
326
+ if (h % (block_size*2) == ((block_size*2)-1)) {
327
+ snprintf(&ret2[k], 2, "%02x", h3); /* same one-character truncation as for the first signature; NOTE(review): ret2 is a uchar array handed to a char* parameter - confirm uchar's definition */
328
+ if (k < SPAMSUM_HEX_LENGTH/2-1) {
329
+ h3 = HASH_INIT;
330
+ k++;
331
+ }
332
+ }
333
+ }
334
+
335
+ /* if we have anything left then add it to the end. This
336
+ ensures that the last part of the message is always
337
+ considered */
338
+ if (h != 0) {
339
+ snprintf(&p[j], 2, "%02x", h2);
340
+ snprintf(&ret2[k], 2, "%02x", h3);
341
+ }
342
+
343
+ strcat(p+j, ":"); /* append the separator after whatever text starts at p+j */
344
+ strcat(p+j, ret2); /* then append the second signature */
345
+
346
+ /* our blocksize guess may have been way off - repeat if necessary */
347
+ if (bsize == 0 && block_size > MIN_BLOCKSIZE && j < SPAMSUM_HEX_LENGTH/2) { /* only retried for an auto-selected block size; ret is reused and rewritten from the prefix onward */
348
+ block_size = block_size / 2;
349
+ goto again;
350
+ }
351
+
352
+ return ret;
353
+ }
354
+
229
355
 
230
356
  /*
231
357
  we only accept a match if we have at least one common substring in
data/ext/spamsum.i CHANGED
@@ -13,4 +13,5 @@
13
13
 
14
14
  int edit_distn(char *s1, int s1_len, char *s2, int s2_len);
15
15
  char *spamsum(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
16
+ char *spamsum_hex(char *str, unsigned int len, unsigned int flags=0, unsigned int bsize=0);
16
17
  unsigned int spamsum_match(char *s1, char *s2);
data/ext/spamsum_wrap.c CHANGED
@@ -2087,6 +2087,61 @@ fail:
2087
2087
  }
2088
2088
 
2089
2089
 
2090
+ SWIGINTERN VALUE /* Ruby-callable wrapper: converts argv to (char *, unsigned, unsigned, unsigned), calls spamsum_hex() and returns its result converted by SWIG_FromCharPtr() */
2091
+ _wrap_spamsum_hex(int argc, VALUE *argv, VALUE self) {
2092
+ char *arg1 = (char *) 0 ;
2093
+ unsigned int arg2 ;
2094
+ unsigned int arg3 = (unsigned int) 0 ;
2095
+ unsigned int arg4 = (unsigned int) 0 ;
2096
+ char *result = 0 ;
2097
+ int res1 ;
2098
+ char *buf1 = 0 ;
2099
+ int alloc1 = 0 ;
2100
+ unsigned int val2 ;
2101
+ int ecode2 = 0 ;
2102
+ unsigned int val3 ;
2103
+ int ecode3 = 0 ;
2104
+ unsigned int val4 ;
2105
+ int ecode4 = 0 ;
2106
+ VALUE vresult = Qnil;
2107
+
2108
+ if ((argc < 2) || (argc > 4)) { /* two required arguments (string, length) plus up to two optional ones (flags, bsize) */
2109
+ rb_raise(rb_eArgError, "wrong # of arguments(%d for 2)",argc); SWIG_fail;
2110
+ }
2111
+ res1 = SWIG_AsCharPtrAndSize(argv[0], &buf1, NULL, &alloc1); /* the size out-parameter is NULL, so the length passed on comes only from argv[1] */
2112
+ if (!SWIG_IsOK(res1)) {
2113
+ SWIG_exception_fail(SWIG_ArgError(res1), Ruby_Format_TypeError( "", "char *","spamsum_hex", 1, argv[0] ));
2114
+ }
2115
+ arg1 = (char *)(buf1);
2116
+ ecode2 = SWIG_AsVal_unsigned_SS_int(argv[1], &val2);
2117
+ if (!SWIG_IsOK(ecode2)) {
2118
+ SWIG_exception_fail(SWIG_ArgError(ecode2), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 2, argv[1] ));
2119
+ }
2120
+ arg2 = (unsigned int)(val2);
2121
+ if (argc > 2) { /* optional flags; arg3 stays 0 when omitted */
2122
+ ecode3 = SWIG_AsVal_unsigned_SS_int(argv[2], &val3);
2123
+ if (!SWIG_IsOK(ecode3)) {
2124
+ SWIG_exception_fail(SWIG_ArgError(ecode3), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 3, argv[2] ));
2125
+ }
2126
+ arg3 = (unsigned int)(val3);
2127
+ }
2128
+ if (argc > 3) { /* optional block size; arg4 stays 0 when omitted */
2129
+ ecode4 = SWIG_AsVal_unsigned_SS_int(argv[3], &val4);
2130
+ if (!SWIG_IsOK(ecode4)) {
2131
+ SWIG_exception_fail(SWIG_ArgError(ecode4), Ruby_Format_TypeError( "", "unsigned int","spamsum_hex", 4, argv[3] ));
2132
+ }
2133
+ arg4 = (unsigned int)(val4);
2134
+ }
2135
+ result = (char *)spamsum_hex(arg1,arg2,arg3,arg4);
2136
+ vresult = SWIG_FromCharPtr((const char *)result); /* NOTE(review): 'result' is never freed on any path in this wrapper; if SWIG_FromCharPtr copies the string (confirm), the buffer returned by spamsum_hex() leaks on every call */
2137
+ if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); /* release the temporary input copy only when the conversion allocated one */
2138
+ return vresult;
2139
+ fail:
2140
+ if (alloc1 == SWIG_NEWOBJ) free((char*)buf1); /* same cleanup on the error path */
2141
+ return Qnil;
2142
+ }
2143
+
2144
+
2090
2145
  SWIGINTERN VALUE
2091
2146
  _wrap_spamsum_match(int argc, VALUE *argv, VALUE self) {
2092
2147
  char *arg1 = (char *) 0 ;
@@ -2400,6 +2455,7 @@ SWIGEXPORT void Init_spamsum_swig(void) {
2400
2455
  SWIG_RubyInitializeTrackings();
2401
2456
  rb_define_module_function(mSpamsum_swig, "edit_distn", _wrap_edit_distn, -1);
2402
2457
  rb_define_module_function(mSpamsum_swig, "spamsum", _wrap_spamsum, -1);
2458
+ rb_define_module_function(mSpamsum_swig, "spamsum_hex", _wrap_spamsum_hex, -1);
2403
2459
  rb_define_module_function(mSpamsum_swig, "spamsum_match", _wrap_spamsum_match, -1);
2404
2460
  }
2405
2461
 
data/spamsum.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{spamsum}
5
- s.version = "0.1.1"
5
+ s.version = "0.1.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jonathan Wilkins"]
data/test.rb CHANGED
@@ -3,31 +3,36 @@ s1 = "And she turned to me and took me by the hand and said, I've lost control a
3
3
  s2 = "And she screamed out kicking on her side and said, I've lost control again."
4
4
  s3 = "Control"
5
5
 
6
- puts "s1: #{s1}"
7
- puts "s2: #{s2}"
8
- puts "s3: #{s3}"
9
- puts "Spamsum.distance(s1, s2): #{Spamsum.distance(s1, s2)}"
10
- puts "Spamsum.distance(s1, s3): #{Spamsum.distance(s1, s3)}"
11
- puts "-"*40
6
+ # XXX: seems to be a min length issue for compares, replace with other dist func?
7
+ b1 = "\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb"
8
+ b2 = "\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc"
9
+ b3 = "\x00\xaa\xbb\xcc\xdd\x66\x77\x88\x99\x00\xaa"
12
10
 
13
- (1..3).each { |x|
14
- eval("s = s#{x}")
15
11
 
16
- eval("$sum#{x} = Spamsum.sum(s, s.length)")
17
- puts "Spamsum.sum(s#{x}): #{eval("$sum#{x}")}"
18
- puts "Spamsum.match(s1, s#{x}): #{Spamsum.match($sum1, eval("$sum#{x}"))}"
19
- }
12
+ %w(s b).each { |t|
13
+ puts "-"*40
14
+ (1..3).each { |x|
15
+ puts "#{t}#{x}: #{eval("#{t}#{x}")}" if t == 's'
16
+ puts "Spamsum.distance(#{t}1, #{t}#{x}): #{Spamsum.distance(eval("#{t}1"), eval("#{t}#{x}"))}"
17
+ eval("s = #{t}#{x}")
18
+ puts "-"*20
20
19
 
21
- puts "-"*40
20
+ eval("$#{t}sum#{x} = Spamsum.sum(s)")
21
+ puts "Spamsum.sum(#{t}#{x}): #{eval("$#{t}sum#{x}")}"
22
+ puts "Spamsum.match(#{t}1, #{t}#{x}): #{Spamsum.match(eval("$#{t}sum1"), eval("$#{t}sum#{x}"))}"
23
+ puts "-"*20
22
24
 
23
- # XXX: seems to be a min length issue for compares, replace with other dist func?
24
- b1 = "\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb"
25
- b2 = "\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc\x22\x33\x44\x55\x66\x77\x88\x99\xaa\xbb\xcc"
26
- b3 = "\x00\xaa\xbb\xcc\xdd\x66\x77\x88\x99\x00\xaa"
27
- (1..3).each { |x|
28
- eval("s = b#{x}")
25
+ eval("$#{t}hsum#{x} = Spamsum.sum_hex(s)")
26
+ puts "Spamsum.match(#{t}1, #{t}#{x}): #{Spamsum.match(eval("$#{t}hsum1"), eval("$#{t}hsum#{x}"))}"
27
+ }
28
+ }
29
29
 
30
- eval("$bsum#{x} = Spamsum.sum(s, s.length)")
31
- puts "Spamsum.sum(b#{x}): #{eval("$bsum#{x}")}"
32
- puts "Spamsum.match(b1, b#{x}): #{Spamsum.match($bsum1, eval("$bsum#{x}"))}"
30
+ puts "-"*40
31
+ %w(s b).each { |t|
32
+ (1..3).each { |x|
33
+ puts "Spamsum.sum_hex(#{t}#{x}): #{eval("$#{t}hsum#{x}")}"
34
+ }
33
35
  }
36
+
37
+ puts "Spamsum.sum_hex(s1*1000): #{Spamsum.sum_hex(s1*1000)}"
38
+ puts "Spamsum.sum_hex(s1*1000): #{Spamsum.sum_hex(open('random.bin').read)}"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: jwilkins-spamsum
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jonathan Wilkins