Nilsimsa 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. data/extconf.rb +4 -0
  2. data/nilsimsa.c +321 -0
  3. data/nilsimsa.rb +166 -0
  4. metadata +48 -0
data/extconf.rb ADDED
@@ -0,0 +1,4 @@
#!/usr/bin/env ruby
# Build script: asks mkmf to write a Makefile for the 'nilsimsa_native'
# extension.
require "mkmf"

create_makefile("nilsimsa_native")
data/nilsimsa.c ADDED
@@ -0,0 +1,321 @@
1
+ /*
2
+ Source: Digest-Nilsimsa-0.06; _nilsimsa.c, nilsimsa.h
3
+ Changes: 2005-04-14 Stephen Lewis <slewis@orcon.net.nz>
4
+ - stripped superfluous code
5
+ - some cleanups, reformatting
6
+ - refactored to provide more appropriate ruby interface
7
+
8
+ NOTE - I haven't gotten around to fixing the previous comment
9
+ headers below this
10
+ */
11
+
12
+ /**
13
+ ** chad's modifications for perl xs - Digest::Nilsimsa
14
+ **
15
+ ** main() - removed (too many warnings)
16
+ ** accbuf - added, practically identical to accfile()
17
+ ** dprint() - added (prints debug msgs to debug.txt)
18
+ **
19
+ ** $Id: _nilsimsa.c,v 1.1 2002/05/20 22:29:07 chad Exp $
20
+ **/
21
+ /***************************************************************************
22
+ main.c - nilsimsa
23
+ -------------------
24
+ begin : Fri Mar 16 01:41:08 EST 2001
25
+ copyright : (C) 2001-2002 by cmeclax
26
+ email : cmeclax@ixazon.dynip.com
27
+ ***************************************************************************/
28
+
29
+ /***************************************************************************
30
+ * *
31
+ * This program is free software; you can redistribute it and/or modify *
32
+ * it under the terms of the GNU General Public License as published by *
33
+ * the Free Software Foundation; either version 2 of the License, or *
34
+ * (at your option) any later version. *
35
+ * *
36
+ ***************************************************************************/
37
+
38
+
39
+
40
+ /* NOTE - this should really use the support code from Digest, but
41
+ would require a ruby source tree to build in that case ( the
42
+ required headers don't seem to be generally installed :/ )
43
+ */
44
+
45
+ #include <ruby.h>
46
+ #include <assert.h>
47
+
48
+ #define NSR_CODE_LEN 32
49
+ #define RB_NSR_IVAR "cdata" /* not prefixing @ makes innaccessible from ruby */
50
+
51
+ #define tran3(a,b,c,n) (((tran[((a)+(n))&255]^tran[(b)]*((n)+(n)+1))+tran[(c)^tran[n]])&255)
52
+
53
+
54
+ struct nsrecord {
55
+ int acc[256]; /* counts each trigram's hash */
56
+ char code[NSR_CODE_LEN]; /* the nilsimsa code as a bit vector */
57
+
58
+ unsigned int chcount; /* number of characters processed so far */
59
+ int lastch[4]; /* last 4 characters processed */
60
+ };
61
+
62
+
63
+ void nsr_init( struct nsrecord *a );
64
+ void nsr_digest( struct nsrecord *a );
65
+ void nsr_update(struct nsrecord *, char *buf,unsigned long len);
66
+ int nilsimsa(char *a,char *b);
67
+ /*void nsr_free( struct nsrecord *r );*/
68
+ static void filltran(void);
69
+ static void fillpopcount(void);
70
+
71
+
72
+ static struct nsrecord *get_nsr( VALUE obj );
73
+ VALUE rbns_init(int argc, VALUE *argv, VALUE self);
74
+ VALUE rbns_update(VALUE self, VALUE data);
75
+ VALUE rbns_nilsimsa(VALUE self, VALUE other);
76
+ VALUE rbns_digest(VALUE self);
77
+ void Init_nilsimsa_native(void);
78
+
79
+
80
+
81
+ unsigned char tran[256], popcount[256];
82
+
83
+
84
+
85
+
86
+
87
+ /* formerly clear() */
88
+ void
89
+ nsr_init( struct nsrecord *a ) {
90
+ assert( a );
91
+
92
+ memset(a->acc,0,sizeof(a->acc));
93
+ memset(a->code,0,sizeof(a->code));
94
+
95
+ a->chcount = 0;
96
+ {
97
+ int i;
98
+ for (i=0; i<4; i++) {
99
+ a->lastch[i] = -1;
100
+ }
101
+ }
102
+ }
103
+
104
+ static void
105
+ filltran(void) {
106
+ int i,j,k;
107
+ for (i=j=0;i<256;i++) {
108
+ j=(j*53+1)&255;
109
+ j+=j;
110
+ if (j>255) {
111
+ j-=255;
112
+ }
113
+ for (k=0;k<i;k++) {
114
+ if (j==tran[k]) {
115
+ j=(j+1)&255;
116
+ k=0;
117
+ }
118
+ }
119
+ tran[i]=j;
120
+ }
121
+ }
122
+
123
+ static void
124
+ fillpopcount(void) {
125
+ int i,j;
126
+ memset(popcount,0,sizeof(popcount));
127
+
128
+ for (i=0;i<256;i++) {
129
+ for (j=0;j<8;j++) {
130
+ popcount[i]+=1&(i>>j);
131
+ }
132
+ }
133
+ }
134
+
135
+
136
+ /* formerly accfile() */
137
+ void
138
+ nsr_update(struct nsrecord *a, char *buf, unsigned long len) {
139
+ unsigned int idx;
140
+ int *lastch=a->lastch; // convenience
141
+
142
+ assert( a );
143
+
144
+ for(idx=0; idx<len; idx++) {
145
+ unsigned char ch = (unsigned char) buf[idx];
146
+ a->chcount++;
147
+ if (lastch[1]>=0)
148
+ a->acc[tran3(ch,lastch[0],lastch[1],0)]++;
149
+ if (lastch[2]>=0) {
150
+ a->acc[tran3(ch,lastch[0],lastch[2],1)]++;
151
+ a->acc[tran3(ch,lastch[1],lastch[2],2)]++;
152
+ }
153
+ if (lastch[3]>=0) {
154
+ a->acc[tran3(ch,lastch[0],lastch[3],3)]++;
155
+ a->acc[tran3(ch,lastch[1],lastch[3],4)]++;
156
+ a->acc[tran3(ch,lastch[2],lastch[3],5)]++;
157
+ a->acc[tran3(lastch[3],lastch[0],ch,6)]++;
158
+ a->acc[tran3(lastch[3],lastch[2],ch,7)]++;
159
+ }
160
+ lastch[3]=lastch[2];
161
+ lastch[2]=lastch[1];
162
+ lastch[1]=lastch[0];
163
+ lastch[0]=ch;
164
+ }
165
+ }
166
+
167
+ /* formerly makecode() */
168
+ void
169
+ nsr_digest(struct nsrecord *a) {
170
+ int i;
171
+ int total=0; /* total number of trigrams counted */
172
+ int threshold=0; /* mean of all numbers in acc */
173
+
174
+ assert( a );
175
+
176
+ switch (a->chcount) {
177
+ case 0:
178
+ case 1:
179
+ case 2:
180
+ break;
181
+ case 3:
182
+ total = 1;
183
+ break;
184
+ case 4:
185
+ total = 4;
186
+ break;
187
+ default:
188
+ total = (8 * a->chcount) - 28;
189
+ break;
190
+ }
191
+
192
+ threshold=total/256; /* round down because criterion is >threshold */
193
+
194
+ memset(a->code,0,sizeof(a->code));
195
+ for (i=0;i<256;i++) {
196
+ a->code[i>>3]+=((a->acc[i]>threshold)<<(i&7));
197
+ }
198
+ }
199
+
200
+ /* NOTE - assumes both of length 32 */
201
+ int
202
+ nilsimsa(char *a,char *b) {
203
+ int i,bits=0;
204
+ assert( a );
205
+ assert( b );
206
+
207
+ for (i=0;i<NSR_CODE_LEN;i++) {
208
+ bits+=popcount[255&(a[i]^b[i])];
209
+ }
210
+
211
+ return 128-bits;
212
+ }
213
+
214
+ /*
215
+ void
216
+ nsr_free( struct nsrecord *r ) {
217
+ if (r) {
218
+ free( r );
219
+ }
220
+ }
221
+ */
222
+
223
+
224
+ /*
225
+ *
226
+ * begin ruby wrapper functions
227
+ *
228
+ */
229
+
230
+ static struct nsrecord *
231
+ get_nsr( VALUE obj ) {
232
+ VALUE wrapped;
233
+ struct nsrecord *ret;
234
+
235
+ if (!RTEST( rb_funcall( obj, rb_intern( "kind_of?" ), 1,
236
+ rb_eval_string("Nilsimsa")))) {
237
+ /* FIXME should raise exception */
238
+ return NULL;
239
+ }
240
+
241
+ wrapped = rb_iv_get( obj, RB_NSR_IVAR );
242
+ if (Qnil == wrapped) {
243
+ return NULL;
244
+ }
245
+
246
+ Data_Get_Struct( wrapped, struct nsrecord, ret );
247
+ return ret;
248
+ }
249
+
250
+ VALUE
251
+ rbns_init(int argc, VALUE *argv, VALUE self) {
252
+ VALUE wrapped_nsr;
253
+ struct nsrecord *r;
254
+ wrapped_nsr = Data_Make_Struct( rb_cObject, struct nsrecord,
255
+ NULL, -1, r );
256
+ rb_iv_set( self, RB_NSR_IVAR, wrapped_nsr );
257
+ nsr_init( r );
258
+
259
+ return rb_funcall2( self, rb_intern( "old_initialize" ), argc, argv );
260
+ }
261
+
262
+ VALUE
263
+ rbns_update(VALUE self, VALUE data) {
264
+ struct nsrecord *r;
265
+ char *chdata;
266
+ long chdata_len;
267
+ r = get_nsr( self );
268
+
269
+ Check_Type( data, T_STRING );
270
+ chdata = rb_str2cstr( data, &chdata_len );
271
+ nsr_update( r, chdata, chdata_len );
272
+ return data;
273
+ }
274
+
275
+ VALUE
276
+ rbns_nilsimsa(VALUE self, VALUE other) {
277
+ long len;
278
+ char *d1;
279
+ char *d2;
280
+
281
+ d1 = rb_str2cstr( rb_funcall( self, rb_intern( "digest" ), 0 ), &len );
282
+ if (len < NSR_CODE_LEN) {
283
+ return Qnil;
284
+ }
285
+
286
+ Check_Type( other, T_STRING );
287
+ d2 = rb_str2cstr( other, &len );
288
+ if (len < NSR_CODE_LEN) {
289
+ return Qnil;
290
+ }
291
+
292
+ return INT2NUM( nilsimsa( d1, d2 ) );
293
+ }
294
+
295
+ VALUE
296
+ rbns_digest(VALUE self) {
297
+ struct nsrecord *r=get_nsr( self );
298
+
299
+ nsr_digest( r );
300
+
301
+ /* reverse a newly created string of the digest */
302
+ return rb_funcall( rb_str_new( r->code, NSR_CODE_LEN ),
303
+ rb_intern( "reverse"), 0 );
304
+ }
305
+
306
+ void
307
+ Init_nilsimsa_native(void) {
308
+ VALUE rb_cNilsimsa;
309
+ /* initialize invariant data */
310
+ filltran();
311
+ fillpopcount();
312
+
313
+ /* this grafts itself over the top of an existing Nilsimsa class */
314
+ rb_cNilsimsa = rb_eval_string( "Nilsimsa" );
315
+ /* we'll call old_initialize from our new initialize */
316
+ rb_define_alias( rb_cNilsimsa, "old_initialize", "initialize" );
317
+ rb_define_method( rb_cNilsimsa, "initialize", rbns_init, -1 );
318
+ rb_define_method( rb_cNilsimsa, "update", rbns_update, 1 );
319
+ rb_define_method( rb_cNilsimsa, "nilsimsa", rbns_nilsimsa, 1 );
320
+ rb_define_method( rb_cNilsimsa, "digest", rbns_digest, 0 );
321
+ }
data/nilsimsa.rb ADDED
@@ -0,0 +1,166 @@
1
+ # Nilsimsa hash (build 20050414)
2
+ # Ruby port (C) 2005 Martin Pirker
3
+ # released under GNU GPL V2 license
4
+ #
5
+ # inspired by Digest::Nilsimsa-0.06 from Perl CPAN and
6
+ # the original C nilsimsa-0.2.4 implementation by cmeclax
7
+ # http://ixazon.dynip.com/~cmeclax/nilsimsa.html
8
+
9
+
# Pure-Ruby implementation of the nilsimsa hash.
#
# Feed data with #update (or #<<, #file, or constructor arguments), then
# read the 32-byte code with #digest / #hexdigest and compare two codes
# with #nilsimsa.
#
# All byte access goes through Integer arrays (unpack/pack) rather than
# String#[], because String#[] returns an Integer only on Ruby 1.8.
class Nilsimsa

  # Lookup table used by tran3, as a 256-byte string.
  TRAN =
    "\x02\xD6\x9E\x6F\xF9\x1D\x04\xAB\xD0\x22\x16\x1F\xD8\x73\xA1\xAC" +
    "\x3B\x70\x62\x96\x1E\x6E\x8F\x39\x9D\x05\x14\x4A\xA6\xBE\xAE\x0E" +
    "\xCF\xB9\x9C\x9A\xC7\x68\x13\xE1\x2D\xA4\xEB\x51\x8D\x64\x6B\x50" +
    "\x23\x80\x03\x41\xEC\xBB\x71\xCC\x7A\x86\x7F\x98\xF2\x36\x5E\xEE" +
    "\x8E\xCE\x4F\xB8\x32\xB6\x5F\x59\xDC\x1B\x31\x4C\x7B\xF0\x63\x01" +
    "\x6C\xBA\x07\xE8\x12\x77\x49\x3C\xDA\x46\xFE\x2F\x79\x1C\x9B\x30" +
    "\xE3\x00\x06\x7E\x2E\x0F\x38\x33\x21\xAD\xA5\x54\xCA\xA7\x29\xFC" +
    "\x5A\x47\x69\x7D\xC5\x95\xB5\xF4\x0B\x90\xA3\x81\x6D\x25\x55\x35" +
    "\xF5\x75\x74\x0A\x26\xBF\x19\x5C\x1A\xC6\xFF\x99\x5D\x84\xAA\x66" +
    "\x3E\xAF\x78\xB3\x20\x43\xC1\xED\x24\xEA\xE6\x3F\x18\xF3\xA0\x42" +
    "\x57\x08\x53\x60\xC3\xC0\x83\x40\x82\xD7\x09\xBD\x44\x2A\x67\xA8" +
    "\x93\xE0\xC2\x56\x9F\xD9\xDD\x85\x15\xB4\x8A\x27\x28\x92\x76\xDE" +
    "\xEF\xF8\xB2\xB7\xC9\x3D\x45\x94\x4B\x11\x0D\x65\xD5\x34\x8B\x91" +
    "\x0C\xFA\x87\xE9\x7C\x5B\xB1\x4D\xE5\xD4\xCB\x10\xA2\x17\x89\xBC" +
    "\xDB\xB0\xE2\x97\x88\x52\xF7\x48\xD3\x61\x2C\x3A\x2B\xD1\x8C\xFB" +
    "\xF1\xCD\xE4\x6A\xE7\xA9\xFD\xC4\x37\xC8\xD2\xF6\xDF\x58\x72\x4E"

  # Number of set bits for every byte value 0..255, as a 256-byte string.
  POPC =
    "\x00\x01\x01\x02\x01\x02\x02\x03\x01\x02\x02\x03\x02\x03\x03\x04" +
    "\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" +
    "\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" +
    "\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" +
    "\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" +
    "\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" +
    "\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" +
    "\x04\x05\x05\x06\x05\x06\x06\x07\x05\x06\x06\x07\x06\x07\x07\x08"

  # Integer views of the two tables, indexable the same way on every Ruby.
  TRAN_BYTES = TRAN.unpack("C*")
  POPC_BYTES = POPC.unpack("C*")

  # Creates an empty hash state and feeds it every string given in +data+.
  def initialize(*data)
    @threshold = 0
    @count = 0
    @acc = Array.new(256, 0)
    # -1 marks a history slot that has not been filled yet
    @lastch0 = @lastch1 = @lastch2 = @lastch3 = -1

    data.each { |d| update(d) }
  end

  # Hashes the trigram (a, b, c), salted with n, into a bucket index 0..255.
  def tran3(a, b, c, n)
    (((TRAN_BYTES[(a + n) & 255] ^ TRAN_BYTES[b] * (n + n + 1)) +
      TRAN_BYTES[c ^ TRAN_BYTES[n]]) & 255)
  end

  # Feeds the bytes of +data+ into the accumulator.  Each byte is combined
  # with the up-to-four preceding bytes; the history survives between
  # calls, so a stream may be fed in pieces.
  def update(data)
    data.each_byte do |ch|
      @count += 1
      if @lastch1 > -1
        @acc[tran3(ch, @lastch0, @lastch1, 0)] += 1
      end
      if @lastch2 > -1
        @acc[tran3(ch, @lastch0, @lastch2, 1)] += 1
        @acc[tran3(ch, @lastch1, @lastch2, 2)] += 1
      end
      if @lastch3 > -1
        @acc[tran3(ch, @lastch0, @lastch3, 3)] += 1
        @acc[tran3(ch, @lastch1, @lastch3, 4)] += 1
        @acc[tran3(ch, @lastch2, @lastch3, 5)] += 1
        @acc[tran3(@lastch3, @lastch0, ch, 6)] += 1
        @acc[tran3(@lastch3, @lastch2, ch, 7)] += 1
      end
      # shift the history window; @lastch0 is always the newest byte
      @lastch3 = @lastch2
      @lastch2 = @lastch1
      @lastch1 = @lastch0
      @lastch0 = ch
    end
  end

  # Returns the 32-byte code for everything fed so far.  Bit i is set when
  # bucket i holds more than the mean count per bucket; the byte order of
  # the result is reversed.
  def digest
    @total =
      case @count
      when 0..2 then 0
      when 3 then 1
      when 4 then 4
      else (8 * @count) - 28
      end
    @threshold = @total / 256

    bytes = Array.new(32, 0)
    (0..255).each do |i|
      bytes[i >> 3] |= (1 << (i & 7)) if @acc[i] > @threshold
    end
    @code = bytes.pack("C*")

    @code.reverse
  end

  # Returns the digest as a 64-character lowercase hex string.
  def hexdigest
    digest.unpack("H*")[0]
  end

  # Same as #hexdigest.
  def to_s
    hexdigest
  end

  # Same as #update.
  def <<(whatever)
    update(whatever)
  end

  # True when this object's digest equals the given digest string.
  def ==(otherdigest)
    digest == otherdigest
  end

  # Feeds the contents of the file named +thisone+, read in binary mode in
  # chunks of 10480 bytes.
  def file(thisone)
    File.open(thisone, "rb") do |f|
      update(f.read(10480)) until f.eof?
    end
  end

  # Compares this object's digest with +otherdigest+ (a 32-byte string).
  # Returns 128 minus the number of differing bits.
  def nilsimsa(otherdigest)
    mine = digest.unpack("C*")
    theirs = otherdigest.unpack("C*")
    bits = 0
    (0..31).each do |i|
      bits += POPC_BYTES[255 & (mine[i] ^ theirs[i])]
    end
    128 - bits
  end

end
135
+
# Try to load the compiled core; when it is not installed the pure-Ruby
# methods defined above stay in place.  Either way, say which one is used.
native_loaded =
  begin
    require 'nilsimsa_native'
    true
  rescue LoadError
    # no native module available: not an error
    false
  end
puts(native_loaded ? 'using compiled nilsimsa' : 'using interpreted nilsimsa')
143
+
# Command-line entry point, only when this file is run directly.
# With arguments: print the hex digest of each named file.
# Without arguments: run a small self-test against known digests.
if __FILE__ == $0 then
  if ARGV.size>0 then
    ARGV.each do |filename|
      # File.exist? is available on every Ruby version;
      # FileTest::exists? is gone from current releases
      if File.exist?(filename) then
        n = Nilsimsa::new
        n.file(filename)
        puts n.hexdigest+" #{filename}"
      else
        puts "error: can't find '#{filename}'"
      end
    end
  else
    puts "running selftest..."
    n1 = Nilsimsa::new; n1.update("abcdefgh")
    puts "abcdefgh: #{n1.hexdigest=='14c8118000000000030800000004042004189020001308014088003280000078'}"
    # feeding the same bytes in two pieces must give the same digest
    n2 = Nilsimsa::new( "abcd","efgh")
    puts "abcd efgh: #{n2.hexdigest=='14c8118000000000030800000004042004189020001308014088003280000078'}"
    puts "digest: #{n1 == n2.digest}"
    n1.update("ijk")
    puts "ijk: #{n1.hexdigest=='14c811840010000c0328200108040630041890200217582d4098103280000078'}"
    puts "nilsimsa: #{n1.nilsimsa(n2.digest)==109}"
  end
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: Nilsimsa
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2007-10-29 00:00:00 -07:00
8
+ summary: Computes Nilsimsa values. Nilsimsa is a distance based hash
9
+ require_paths:
10
+ - .
11
+ email: jwilkins[at]nospam[dot]bitland[dot]net
12
+ homepage:
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: nilsimsa.rb
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Jonathan Wilkins
31
+ files:
32
+ - nilsimsa.c
33
+ - nilsimsa.rb
34
+ - extconf.rb
35
+ test_files: []
36
+
37
+ rdoc_options: []
38
+
39
+ extra_rdoc_files: []
40
+
41
+ executables: []
42
+
43
+ extensions: []
44
+
45
+ requirements: []
46
+
47
+ dependencies: []
48
+