Nilsimsa 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. data/extconf.rb +4 -0
  2. data/nilsimsa.c +321 -0
  3. data/nilsimsa.rb +166 -0
  4. metadata +48 -0
data/extconf.rb ADDED
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby
# Build configuration: writes the Makefile used to compile the
# nilsimsa_native C extension.
require "mkmf"

create_makefile("nilsimsa_native")
data/nilsimsa.c ADDED
@@ -0,0 +1,321 @@
1
+ /*
2
+ Source: Digest-Nilsimsa-0.06; _nilsimsa.c, nilsimsa.h
3
+ Changes: 2005-04-14 Stephen Lewis <slewis@orcon.net.nz>
4
+ - stripped superfluous code
5
+ - some cleanups, reformatting
6
+ - refactored to provide more appropriate ruby interface
7
+
8
+ NOTE - I haven't gotten around to fixing the previous comment
9
+ headers below this
10
+ */
11
+
12
+ /**
13
+ ** chad's modifications for perl xs - Digest::Nilsimsa
14
+ **
15
+ ** main() - removed (too many warnings)
16
+ ** accbuf - added, practically identical to accfile()
17
+ ** dprint() - added (prints debug msgs to debug.txt)
18
+ **
19
+ ** $Id: _nilsimsa.c,v 1.1 2002/05/20 22:29:07 chad Exp $
20
+ **/
21
+ /***************************************************************************
22
+ main.c - nilsimsa
23
+ -------------------
24
+ begin : Fri Mar 16 01:41:08 EST 2001
25
+ copyright : (C) 2001-2002 by cmeclax
26
+ email : cmeclax@ixazon.dynip.com
27
+ ***************************************************************************/
28
+
29
+ /***************************************************************************
30
+ * *
31
+ * This program is free software; you can redistribute it and/or modify *
32
+ * it under the terms of the GNU General Public License as published by *
33
+ * the Free Software Foundation; either version 2 of the License, or *
34
+ * (at your option) any later version. *
35
+ * *
36
+ ***************************************************************************/
37
+
38
+
39
+
40
+ /* NOTE - this should really use the support code from Digest, but
41
+ would require a ruby source tree to build in that case ( the
42
+ required headers don't seem to be generally installed :/ )
43
+ */
44
+
45
+ #include <ruby.h>
46
+ #include <assert.h>
47
+
48
+ #define NSR_CODE_LEN 32
49
+ #define RB_NSR_IVAR "cdata" /* not prefixing @ makes it inaccessible from ruby */
50
+
51
+ #define tran3(a,b,c,n) (((tran[((a)+(n))&255]^tran[(b)]*((n)+(n)+1))+tran[(c)^tran[n]])&255)
52
+
53
+
54
+ struct nsrecord {
55
+ int acc[256]; /* counts each trigram's hash */
56
+ char code[NSR_CODE_LEN]; /* the nilsimsa code as a bit vector */
57
+
58
+ unsigned int chcount; /* number of characters processed so far */
59
+ int lastch[4]; /* last 4 characters processed */
60
+ };
61
+
62
+
63
+ void nsr_init( struct nsrecord *a );
64
+ void nsr_digest( struct nsrecord *a );
65
+ void nsr_update(struct nsrecord *, char *buf,unsigned long len);
66
+ int nilsimsa(char *a,char *b);
67
+ /*void nsr_free( struct nsrecord *r );*/
68
+ static void filltran(void);
69
+ static void fillpopcount(void);
70
+
71
+
72
+ static struct nsrecord *get_nsr( VALUE obj );
73
+ VALUE rbns_init(int argc, VALUE *argv, VALUE self);
74
+ VALUE rbns_update(VALUE self, VALUE data);
75
+ VALUE rbns_nilsimsa(VALUE self, VALUE other);
76
+ VALUE rbns_digest(VALUE self);
77
+ void Init_nilsimsa_native(void);
78
+
79
+
80
+
81
+ unsigned char tran[256], popcount[256];
82
+
83
+
84
+
85
+
86
+
87
+ /* formerly clear() */
88
+ void
89
+ nsr_init( struct nsrecord *a ) {
90
+ assert( a );
91
+
92
+ memset(a->acc,0,sizeof(a->acc));
93
+ memset(a->code,0,sizeof(a->code));
94
+
95
+ a->chcount = 0;
96
+ {
97
+ int i;
98
+ for (i=0; i<4; i++) {
99
+ a->lastch[i] = -1;
100
+ }
101
+ }
102
+ }
103
+
104
+ static void
105
+ filltran(void) {
106
+ int i,j,k;
107
+ for (i=j=0;i<256;i++) {
108
+ j=(j*53+1)&255;
109
+ j+=j;
110
+ if (j>255) {
111
+ j-=255;
112
+ }
113
+ for (k=0;k<i;k++) {
114
+ if (j==tran[k]) {
115
+ j=(j+1)&255;
116
+ k=0;
117
+ }
118
+ }
119
+ tran[i]=j;
120
+ }
121
+ }
122
+
123
+ static void
124
+ fillpopcount(void) {
125
+ int i,j;
126
+ memset(popcount,0,sizeof(popcount));
127
+
128
+ for (i=0;i<256;i++) {
129
+ for (j=0;j<8;j++) {
130
+ popcount[i]+=1&(i>>j);
131
+ }
132
+ }
133
+ }
134
+
135
+
136
+ /* formerly accfile() */
137
+ void
138
+ nsr_update(struct nsrecord *a, char *buf, unsigned long len) {
139
+ unsigned int idx;
140
+ int *lastch=a->lastch; // convenience
141
+
142
+ assert( a );
143
+
144
+ for(idx=0; idx<len; idx++) {
145
+ unsigned char ch = (unsigned char) buf[idx];
146
+ a->chcount++;
147
+ if (lastch[1]>=0)
148
+ a->acc[tran3(ch,lastch[0],lastch[1],0)]++;
149
+ if (lastch[2]>=0) {
150
+ a->acc[tran3(ch,lastch[0],lastch[2],1)]++;
151
+ a->acc[tran3(ch,lastch[1],lastch[2],2)]++;
152
+ }
153
+ if (lastch[3]>=0) {
154
+ a->acc[tran3(ch,lastch[0],lastch[3],3)]++;
155
+ a->acc[tran3(ch,lastch[1],lastch[3],4)]++;
156
+ a->acc[tran3(ch,lastch[2],lastch[3],5)]++;
157
+ a->acc[tran3(lastch[3],lastch[0],ch,6)]++;
158
+ a->acc[tran3(lastch[3],lastch[2],ch,7)]++;
159
+ }
160
+ lastch[3]=lastch[2];
161
+ lastch[2]=lastch[1];
162
+ lastch[1]=lastch[0];
163
+ lastch[0]=ch;
164
+ }
165
+ }
166
+
167
+ /* formerly makecode() */
168
+ void
169
+ nsr_digest(struct nsrecord *a) {
170
+ int i;
171
+ int total=0; /* total number of trigrams counted */
172
+ int threshold=0; /* mean of all numbers in acc */
173
+
174
+ assert( a );
175
+
176
+ switch (a->chcount) {
177
+ case 0:
178
+ case 1:
179
+ case 2:
180
+ break;
181
+ case 3:
182
+ total = 1;
183
+ break;
184
+ case 4:
185
+ total = 4;
186
+ break;
187
+ default:
188
+ total = (8 * a->chcount) - 28;
189
+ break;
190
+ }
191
+
192
+ threshold=total/256; /* round down because criterion is >threshold */
193
+
194
+ memset(a->code,0,sizeof(a->code));
195
+ for (i=0;i<256;i++) {
196
+ a->code[i>>3]+=((a->acc[i]>threshold)<<(i&7));
197
+ }
198
+ }
199
+
200
+ /* NOTE - assumes both of length 32 */
201
+ int
202
+ nilsimsa(char *a,char *b) {
203
+ int i,bits=0;
204
+ assert( a );
205
+ assert( b );
206
+
207
+ for (i=0;i<NSR_CODE_LEN;i++) {
208
+ bits+=popcount[255&(a[i]^b[i])];
209
+ }
210
+
211
+ return 128-bits;
212
+ }
213
+
214
+ /*
215
+ void
216
+ nsr_free( struct nsrecord *r ) {
217
+ if (r) {
218
+ free( r );
219
+ }
220
+ }
221
+ */
222
+
223
+
224
+ /*
225
+ *
226
+ * begin ruby wrapper functions
227
+ *
228
+ */
229
+
230
+ static struct nsrecord *
231
+ get_nsr( VALUE obj ) {
232
+ VALUE wrapped;
233
+ struct nsrecord *ret;
234
+
235
+ if (!RTEST( rb_funcall( obj, rb_intern( "kind_of?" ), 1,
236
+ rb_eval_string("Nilsimsa")))) {
237
+ /* FIXME should raise exception */
238
+ return NULL;
239
+ }
240
+
241
+ wrapped = rb_iv_get( obj, RB_NSR_IVAR );
242
+ if (Qnil == wrapped) {
243
+ return NULL;
244
+ }
245
+
246
+ Data_Get_Struct( wrapped, struct nsrecord, ret );
247
+ return ret;
248
+ }
249
+
250
+ VALUE
251
+ rbns_init(int argc, VALUE *argv, VALUE self) {
252
+ VALUE wrapped_nsr;
253
+ struct nsrecord *r;
254
+ wrapped_nsr = Data_Make_Struct( rb_cObject, struct nsrecord,
255
+ NULL, -1, r );
256
+ rb_iv_set( self, RB_NSR_IVAR, wrapped_nsr );
257
+ nsr_init( r );
258
+
259
+ return rb_funcall2( self, rb_intern( "old_initialize" ), argc, argv );
260
+ }
261
+
262
+ VALUE
263
+ rbns_update(VALUE self, VALUE data) {
264
+ struct nsrecord *r;
265
+ char *chdata;
266
+ long chdata_len;
267
+ r = get_nsr( self );
268
+
269
+ Check_Type( data, T_STRING );
270
+ chdata = rb_str2cstr( data, &chdata_len );
271
+ nsr_update( r, chdata, chdata_len );
272
+ return data;
273
+ }
274
+
275
+ VALUE
276
+ rbns_nilsimsa(VALUE self, VALUE other) {
277
+ long len;
278
+ char *d1;
279
+ char *d2;
280
+
281
+ d1 = rb_str2cstr( rb_funcall( self, rb_intern( "digest" ), 0 ), &len );
282
+ if (len < NSR_CODE_LEN) {
283
+ return Qnil;
284
+ }
285
+
286
+ Check_Type( other, T_STRING );
287
+ d2 = rb_str2cstr( other, &len );
288
+ if (len < NSR_CODE_LEN) {
289
+ return Qnil;
290
+ }
291
+
292
+ return INT2NUM( nilsimsa( d1, d2 ) );
293
+ }
294
+
295
+ VALUE
296
+ rbns_digest(VALUE self) {
297
+ struct nsrecord *r=get_nsr( self );
298
+
299
+ nsr_digest( r );
300
+
301
+ /* reverse a newly created string of the digest */
302
+ return rb_funcall( rb_str_new( r->code, NSR_CODE_LEN ),
303
+ rb_intern( "reverse"), 0 );
304
+ }
305
+
306
+ void
307
+ Init_nilsimsa_native(void) {
308
+ VALUE rb_cNilsimsa;
309
+ /* initialize invariant data */
310
+ filltran();
311
+ fillpopcount();
312
+
313
+ /* this grafts itself over the top of an existing Nilsimsa class */
314
+ rb_cNilsimsa = rb_eval_string( "Nilsimsa" );
315
+ /* we'll call old_initialize from our new initialize */
316
+ rb_define_alias( rb_cNilsimsa, "old_initialize", "initialize" );
317
+ rb_define_method( rb_cNilsimsa, "initialize", rbns_init, -1 );
318
+ rb_define_method( rb_cNilsimsa, "update", rbns_update, 1 );
319
+ rb_define_method( rb_cNilsimsa, "nilsimsa", rbns_nilsimsa, 1 );
320
+ rb_define_method( rb_cNilsimsa, "digest", rbns_digest, 0 );
321
+ }
data/nilsimsa.rb ADDED
@@ -0,0 +1,166 @@
1
+ # Nilsimsa hash (build 20050414)
2
+ # Ruby port (C) 2005 Martin Pirker
3
+ # released under GNU GPL V2 license
4
+ #
5
+ # inspired by Digest::Nilsimsa-0.06 from Perl CPAN and
6
+ # the original C nilsimsa-0.2.4 implementation by cmeclax
7
+ # http://ixazon.dynip.com/~cmeclax/nilsimsa.html
8
+
9
+
10
# Pure-Ruby Nilsimsa hash.
#
# Feed data with #update (or #<<, #file, or constructor arguments), then read
# the 32-byte code with #digest / #hexdigest, or compare it with another
# digest via #nilsimsa.
#
# All table lookups go through byte arrays (TRAN_BYTES / POPC_BYTES) and the
# code is assembled with Array#pack, so the class does not rely on
# String#[] returning an integer or on the "when x:" syntax, both of which
# only exist in Ruby 1.8.
class Nilsimsa

  # Trigram hashing table, one byte per entry.
  TRAN =
    "\x02\xD6\x9E\x6F\xF9\x1D\x04\xAB\xD0\x22\x16\x1F\xD8\x73\xA1\xAC" +
    "\x3B\x70\x62\x96\x1E\x6E\x8F\x39\x9D\x05\x14\x4A\xA6\xBE\xAE\x0E" +
    "\xCF\xB9\x9C\x9A\xC7\x68\x13\xE1\x2D\xA4\xEB\x51\x8D\x64\x6B\x50" +
    "\x23\x80\x03\x41\xEC\xBB\x71\xCC\x7A\x86\x7F\x98\xF2\x36\x5E\xEE" +
    "\x8E\xCE\x4F\xB8\x32\xB6\x5F\x59\xDC\x1B\x31\x4C\x7B\xF0\x63\x01" +
    "\x6C\xBA\x07\xE8\x12\x77\x49\x3C\xDA\x46\xFE\x2F\x79\x1C\x9B\x30" +
    "\xE3\x00\x06\x7E\x2E\x0F\x38\x33\x21\xAD\xA5\x54\xCA\xA7\x29\xFC" +
    "\x5A\x47\x69\x7D\xC5\x95\xB5\xF4\x0B\x90\xA3\x81\x6D\x25\x55\x35" +
    "\xF5\x75\x74\x0A\x26\xBF\x19\x5C\x1A\xC6\xFF\x99\x5D\x84\xAA\x66" +
    "\x3E\xAF\x78\xB3\x20\x43\xC1\xED\x24\xEA\xE6\x3F\x18\xF3\xA0\x42" +
    "\x57\x08\x53\x60\xC3\xC0\x83\x40\x82\xD7\x09\xBD\x44\x2A\x67\xA8" +
    "\x93\xE0\xC2\x56\x9F\xD9\xDD\x85\x15\xB4\x8A\x27\x28\x92\x76\xDE" +
    "\xEF\xF8\xB2\xB7\xC9\x3D\x45\x94\x4B\x11\x0D\x65\xD5\x34\x8B\x91" +
    "\x0C\xFA\x87\xE9\x7C\x5B\xB1\x4D\xE5\xD4\xCB\x10\xA2\x17\x89\xBC" +
    "\xDB\xB0\xE2\x97\x88\x52\xF7\x48\xD3\x61\x2C\x3A\x2B\xD1\x8C\xFB" +
    "\xF1\xCD\xE4\x6A\xE7\xA9\xFD\xC4\x37\xC8\xD2\xF6\xDF\x58\x72\x4E"

  # Number of one-bits for every byte value, one byte per entry.
  POPC =
    "\x00\x01\x01\x02\x01\x02\x02\x03\x01\x02\x02\x03\x02\x03\x03\x04" +
    "\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" +
    "\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" +
    "\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" +
    "\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" +
    "\x04\x05\x05\x06\x05\x06\x06\x07\x05\x06\x06\x07\x06\x07\x07\x08"

  # The same tables as arrays of integers (0..255), used for all lookups.
  TRAN_BYTES = TRAN.unpack('C*')
  POPC_BYTES = POPC.unpack('C*')

  # Number of bytes in a digest.
  CODE_BYTES = 32

  # Creates an empty hash state and feeds every given string into it.
  def initialize(*data)
    @threshold = 0
    @count = 0
    @acc = Array.new(256, 0)
    # -1 marks a window slot that has not received a byte yet
    @lastch0 = @lastch1 = @lastch2 = @lastch3 = -1

    data.each { |d| update(d) }
  end

  # Hashes the trigram (a, b, c) with variant n into a bucket number 0..255.
  def tran3(a, b, c, n)
    (((TRAN_BYTES[(a + n) & 255] ^ (TRAN_BYTES[b] * (n + n + 1))) +
      TRAN_BYTES[c ^ TRAN_BYTES[n]]) & 255)
  end

  # Feeds the bytes of +data+ into the hash state.  Returns +data+.
  def update(data)
    data.each_byte do |ch|
      @count += 1
      if @lastch1 > -1
        @acc[tran3(ch, @lastch0, @lastch1, 0)] += 1
      end
      if @lastch2 > -1
        @acc[tran3(ch, @lastch0, @lastch2, 1)] += 1
        @acc[tran3(ch, @lastch1, @lastch2, 2)] += 1
      end
      if @lastch3 > -1
        @acc[tran3(ch, @lastch0, @lastch3, 3)] += 1
        @acc[tran3(ch, @lastch1, @lastch3, 4)] += 1
        @acc[tran3(ch, @lastch2, @lastch3, 5)] += 1
        @acc[tran3(@lastch3, @lastch0, ch, 6)] += 1
        @acc[tran3(@lastch3, @lastch2, ch, 7)] += 1
      end
      # slide the window: the oldest byte falls out
      @lastch3 = @lastch2
      @lastch2 = @lastch1
      @lastch1 = @lastch0
      @lastch0 = ch
    end
  end

  # Returns the 32-byte nilsimsa code for everything fed in so far.
  # The state is not consumed; more data may be added afterwards.
  def digest
    @total = case @count
             when 0..2 then 0
             when 3 then 1
             when 4 then 4
             else (8 * @count) - 28
             end
    # integer division rounds down, which suits the strict > test below
    @threshold = @total / 256

    bits = Array.new(CODE_BYTES, 0)
    @acc.each_with_index do |hits, i|
      bits[i >> 3] |= (1 << (i & 7)) if hits > @threshold
    end
    @code = bits.pack('C*')

    @code.reverse
  end

  # Returns the digest as a lowercase hexadecimal string.
  def hexdigest
    digest.unpack("H*")[0]
  end

  def to_s
    hexdigest
  end

  # Alias-style shortcut for #update.
  def <<(whatever)
    update(whatever)
  end

  # True when this object's digest equals the given digest string.
  def ==(otherdigest)
    digest == otherdigest
  end

  # Feeds the whole content of the named file into the hash state.
  def file(thisone)
    File.open(thisone, "rb") do |f|
      while (chunk = f.read(10480))
        update(chunk)
      end
    end
  end

  # Compares this object's digest with +otherdigest+ and returns 128 minus
  # the number of differing bits.  Returns nil when +otherdigest+ holds fewer
  # than 32 bytes, as the native implementation does.
  def nilsimsa(otherdigest)
    mine = digest.unpack('C*')
    theirs = otherdigest.unpack('C*')
    return nil if theirs.size < CODE_BYTES

    bits = 0
    (0...CODE_BYTES).each do |i|
      bits += POPC_BYTES[mine[i] ^ theirs[i]]
    end
    (128 - bits)
  end

end
135
+
136
# Load the compiled C core when it is available; without it the pure-Ruby
# implementation stays in place.
begin
  require 'nilsimsa_native'
  puts 'using compiled nilsimsa'
rescue LoadError
  # a missing native module is not an error
  puts 'using interpreted nilsimsa'
end
143
+
144
# Command-line entry point.
# With arguments: print the hex digest of every named file.
# Without arguments: run a small self-test against known digests.
if __FILE__ == $0 then
  if ARGV.size>0 then
    ARGV.each do |filename|
      # File.exist? is available on every Ruby version, whereas
      # FileTest.exists? no longer exists on current ones.
      if File.exist?(filename) then
        n = Nilsimsa::new
        n.file(filename)
        puts n.hexdigest+" #{filename}"
      else
        puts "error: can't find '#{filename}'"
      end
    end
  else
    puts "running selftest..."
    n1 = Nilsimsa::new; n1.update("abcdefgh")
    puts "abcdefgh: #{n1.hexdigest=='14c8118000000000030800000004042004189020001308014088003280000078'}"
    n2 = Nilsimsa::new( "abcd","efgh")
    puts "abcd efgh: #{n2.hexdigest=='14c8118000000000030800000004042004189020001308014088003280000078'}"
    puts "digest: #{n1 == n2.digest}"
    n1.update("ijk")
    puts "ijk: #{n1.hexdigest=='14c811840010000c0328200108040630041890200217582d4098103280000078'}"
    puts "nilsimsa: #{n1.nilsimsa(n2.digest)==109}"
  end
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: Nilsimsa
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2007-10-29 00:00:00 -07:00
8
+ summary: Computes Nilsimsa values. Nilsimsa is a distance based hash
9
+ require_paths:
10
+ - .
11
+ email: jwilkins[at]nospam[dot]bitland[dot]net
12
+ homepage:
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: nilsimsa.rb
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Jonathan Wilkins
31
+ files:
32
+ - nilsimsa.c
33
+ - nilsimsa.rb
34
+ - extconf.rb
35
+ test_files: []
36
+
37
+ rdoc_options: []
38
+
39
+ extra_rdoc_files: []
40
+
41
+ executables: []
42
+
43
+ extensions: []
44
+
45
+ requirements: []
46
+
47
+ dependencies: []
48
+