jwilkins-nilsimsa 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README +18 -0
- data/bin/nilsimsa +18 -0
- data/examples/simple.rb +35 -0
- data/ext/extconf.rb +4 -0
- data/ext/nilsimsa.c +312 -0
- data/nilsimsa.gemspec +17 -0
- data/nilsimsa.rb +182 -0
- metadata +60 -0
data/README
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
nilsimsa
|
2
|
+
--------
|
3
|
+
Nilsimsa is a distance based hash, which is the opposite of more familiar
|
4
|
+
hashes like MD5. Instead of small changes making a large difference in
|
5
|
+
the resulting hash (to avoid collisions), distance based hashes cause
|
6
|
+
similar values to have similar output. This is good for detecting near
|
7
|
+
similar documents without having to store the original text.
|
8
|
+
|
9
|
+
Standard usage is as follows:
|
10
|
+
|
11
|
+
require 'nilsimsa'
|
12
|
+
|
13
|
+
n1 = Nilsimsa::new
|
14
|
+
text1 = "The quick brown fox"
|
15
|
+
n1.update(text1)
|
16
|
+
puts "Text '#{text1}': #{n1.hexdigest}"
|
17
|
+
|
18
|
+
|
data/bin/nilsimsa
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
require 'rubygems'
|
3
|
+
require 'nilsimsa'
|
4
|
+
|
5
|
+
if ARGV.size > 0 then
|
6
|
+
ARGV.each do |filename|
|
7
|
+
if FileTest::exists?(filename) then
|
8
|
+
n = Nilsimsa::new
|
9
|
+
n.file(filename)
|
10
|
+
puts n.hexdigest+" #{filename}"
|
11
|
+
else
|
12
|
+
puts "error: can't find '#{filename}'"
|
13
|
+
end
|
14
|
+
end
|
15
|
+
else
|
16
|
+
puts "Specify a file to hash"
|
17
|
+
end
|
18
|
+
|
data/examples/simple.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'nilsimsa'
|
2
|
+
|
3
|
+
# Levenshtein implementation from
|
4
|
+
# http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance
|
5
|
+
# Used under the GNU Free Documentation license
|
6
|
+
class String
|
7
|
+
def levenshtein(other)
|
8
|
+
a, b = self.unpack('U*'), other.unpack('U*')
|
9
|
+
n, m = a.length, b.length
|
10
|
+
a, b, n, m = b, a, m, n if n > m
|
11
|
+
current = [*0..n]
|
12
|
+
1.upto(m) do |i|
|
13
|
+
previous, current = current, [i]+[0]*n
|
14
|
+
1.upto(n) do |j|
|
15
|
+
add, delete = previous[j]+1, current[j-1]+1
|
16
|
+
change = previous[j-1]
|
17
|
+
change += 1 if a[j-1] != b[i-1]
|
18
|
+
current[j] = [add, delete, change].min
|
19
|
+
end
|
20
|
+
end
|
21
|
+
current[n]
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
n1 = Nilsimsa::new
|
26
|
+
text1 = "The quick brown fox"
|
27
|
+
n1.update(text1)
|
28
|
+
puts "'#{text1}':\n #{n1.hexdigest}"
|
29
|
+
|
30
|
+
n2 = Nilsimsa::new
|
31
|
+
text2 = "The quick red fox"
|
32
|
+
n2.update(text2)
|
33
|
+
puts "'#{text2}':\n #{n2.hexdigest}"
|
34
|
+
|
35
|
+
puts "Distance: #{n1.hexdigest.levenshtein(n2.hexdigest)}"
|
data/ext/extconf.rb
ADDED
data/ext/nilsimsa.c
ADDED
@@ -0,0 +1,312 @@
|
|
1
|
+
/*
|
2
|
+
* Source: Digest-Nilsimsa-0.06; _nilsimsa.c, nilsimsa.h
|
3
|
+
* Changes: 2005-04-14 Stephen Lewis <slewis@orcon.net.nz>
|
4
|
+
* - stripped superfluous code
|
5
|
+
* - some cleanups, reformatting
|
6
|
+
* - refactored to provide more appropriate ruby interface
|
7
|
+
*
|
8
|
+
* NOTE - I haven't gotten around to fixing the previous comment
|
9
|
+
* headers below this
|
10
|
+
*/
|
11
|
+
|
12
|
+
/*
|
13
|
+
* chad's modifications for perl xs - Digest::Nilsimsa
|
14
|
+
*
|
15
|
+
* main() - removed (too many warnings)
|
16
|
+
* accbuf - added, practically identical to accfile()
|
17
|
+
* dprint() - added (prints debug msgs to debug.txt)
|
18
|
+
*
|
19
|
+
* $Id: _nilsimsa.c,v 1.1 2002/05/20 22:29:07 chad Exp $
|
20
|
+
*/
|
21
|
+
|
22
|
+
/***************************************************************************
|
23
|
+
* main.c - nilsimsa
|
24
|
+
* -------------------
|
25
|
+
* begin : Fri Mar 16 01:41:08 EST 2001
|
26
|
+
* copyright : (C) 2001-2002 by cmeclax
|
27
|
+
* email : cmeclax@ixazon.dynip.com
|
28
|
+
***************************************************************************/
|
29
|
+
|
30
|
+
/***************************************************************************
|
31
|
+
* *
|
32
|
+
* This program is free software; you can redistribute it and/or modify *
|
33
|
+
* it under the terms of the GNU General Public License as published by *
|
34
|
+
* the Free Software Foundation; either version 2 of the License, or *
|
35
|
+
* (at your option) any later version. *
|
36
|
+
* *
|
37
|
+
***************************************************************************/
|
38
|
+
|
39
|
+
/* NOTE - this should really use the support code from Digest, but
|
40
|
+
* would require a ruby source tree to build in that case ( the
|
41
|
+
* required headers don't seem to be generally installed :/ )
|
42
|
+
*/
|
43
|
+
|
44
|
+
#include <ruby.h>
|
45
|
+
#include <assert.h>
|
46
|
+
|
47
|
+
#define NSR_CODE_LEN 32
|
48
|
+
#define RB_NSR_IVAR "cdata" /* not prefixing @ makes it inaccessible from ruby */
|
49
|
+
|
50
|
+
#define tran3(a,b,c,n) (((tran[((a)+(n))&255]^tran[(b)]*((n)+(n)+1))+tran[(c)^tran[n]])&255)
|
51
|
+
|
52
|
+
|
53
|
+
struct nsrecord {
|
54
|
+
int acc[256]; /* counts each trigram's hash */
|
55
|
+
char code[NSR_CODE_LEN]; /* the nilsimsa code as a bit vector */
|
56
|
+
|
57
|
+
unsigned int chcount; /* number of characters processed so far */
|
58
|
+
int lastch[4]; /* last 4 characters processed */
|
59
|
+
};
|
60
|
+
|
61
|
+
void nsr_init( struct nsrecord *a );
|
62
|
+
void nsr_digest( struct nsrecord *a );
|
63
|
+
void nsr_update(struct nsrecord *, char *buf,unsigned long len);
|
64
|
+
int nilsimsa(char *a,char *b);
|
65
|
+
/*void nsr_free( struct nsrecord *r );*/
|
66
|
+
static void filltran(void);
|
67
|
+
static void fillpopcount(void);
|
68
|
+
|
69
|
+
static struct nsrecord *get_nsr( VALUE obj );
|
70
|
+
VALUE rbns_init(int argc, VALUE *argv, VALUE self);
|
71
|
+
VALUE rbns_update(VALUE self, VALUE data);
|
72
|
+
VALUE rbns_nilsimsa(VALUE self, VALUE other);
|
73
|
+
VALUE rbns_digest(VALUE self);
|
74
|
+
void Init_nilsimsa_native(void);
|
75
|
+
|
76
|
+
unsigned char tran[256], popcount[256];
|
77
|
+
|
78
|
+
/* formerly clear() */
|
79
|
+
void
|
80
|
+
nsr_init( struct nsrecord *a ) {
|
81
|
+
assert( a );
|
82
|
+
|
83
|
+
memset(a->acc,0,sizeof(a->acc));
|
84
|
+
memset(a->code,0,sizeof(a->code));
|
85
|
+
|
86
|
+
a->chcount = 0;
|
87
|
+
{
|
88
|
+
int i;
|
89
|
+
for (i=0; i<4; i++) {
|
90
|
+
a->lastch[i] = -1;
|
91
|
+
}
|
92
|
+
}
|
93
|
+
}
|
94
|
+
|
95
|
+
static void
|
96
|
+
filltran(void) {
|
97
|
+
int i,j,k;
|
98
|
+
for (i=j=0;i<256;i++) {
|
99
|
+
j=(j*53+1)&255;
|
100
|
+
j+=j;
|
101
|
+
if (j>255) {
|
102
|
+
j-=255;
|
103
|
+
}
|
104
|
+
for (k=0;k<i;k++) {
|
105
|
+
if (j==tran[k]) {
|
106
|
+
j=(j+1)&255;
|
107
|
+
k=0;
|
108
|
+
}
|
109
|
+
}
|
110
|
+
tran[i]=j;
|
111
|
+
}
|
112
|
+
}
|
113
|
+
|
114
|
+
static void
|
115
|
+
fillpopcount(void) {
|
116
|
+
int i,j;
|
117
|
+
memset(popcount,0,sizeof(popcount));
|
118
|
+
|
119
|
+
for (i=0;i<256;i++) {
|
120
|
+
for (j=0;j<8;j++) {
|
121
|
+
popcount[i]+=1&(i>>j);
|
122
|
+
}
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
|
127
|
+
/* formerly accfile() */
|
128
|
+
void
|
129
|
+
nsr_update(struct nsrecord *a, char *buf, unsigned long len) {
|
130
|
+
unsigned int idx;
|
131
|
+
int *lastch=a->lastch; // convenience
|
132
|
+
|
133
|
+
assert( a );
|
134
|
+
|
135
|
+
for(idx=0; idx<len; idx++) {
|
136
|
+
unsigned char ch = (unsigned char) buf[idx];
|
137
|
+
a->chcount++;
|
138
|
+
if (lastch[1]>=0)
|
139
|
+
a->acc[tran3(ch,lastch[0],lastch[1],0)]++;
|
140
|
+
if (lastch[2]>=0) {
|
141
|
+
a->acc[tran3(ch,lastch[0],lastch[2],1)]++;
|
142
|
+
a->acc[tran3(ch,lastch[1],lastch[2],2)]++;
|
143
|
+
}
|
144
|
+
if (lastch[3]>=0) {
|
145
|
+
a->acc[tran3(ch,lastch[0],lastch[3],3)]++;
|
146
|
+
a->acc[tran3(ch,lastch[1],lastch[3],4)]++;
|
147
|
+
a->acc[tran3(ch,lastch[2],lastch[3],5)]++;
|
148
|
+
a->acc[tran3(lastch[3],lastch[0],ch,6)]++;
|
149
|
+
a->acc[tran3(lastch[3],lastch[2],ch,7)]++;
|
150
|
+
}
|
151
|
+
lastch[3]=lastch[2];
|
152
|
+
lastch[2]=lastch[1];
|
153
|
+
lastch[1]=lastch[0];
|
154
|
+
lastch[0]=ch;
|
155
|
+
}
|
156
|
+
}
|
157
|
+
|
158
|
+
/* formerly makecode() */
|
159
|
+
void
|
160
|
+
nsr_digest(struct nsrecord *a) {
|
161
|
+
int i;
|
162
|
+
int total=0; /* total number of trigrams counted */
|
163
|
+
int threshold=0; /* mean of all numbers in acc */
|
164
|
+
|
165
|
+
assert( a );
|
166
|
+
|
167
|
+
switch (a->chcount) {
|
168
|
+
case 0:
|
169
|
+
case 1:
|
170
|
+
case 2:
|
171
|
+
break;
|
172
|
+
case 3:
|
173
|
+
total = 1;
|
174
|
+
break;
|
175
|
+
case 4:
|
176
|
+
total = 4;
|
177
|
+
break;
|
178
|
+
default:
|
179
|
+
total = (8 * a->chcount) - 28;
|
180
|
+
break;
|
181
|
+
}
|
182
|
+
|
183
|
+
threshold=total/256; /* round down because criterion is >threshold */
|
184
|
+
|
185
|
+
memset(a->code,0,sizeof(a->code));
|
186
|
+
for (i=0;i<256;i++) {
|
187
|
+
a->code[i>>3]+=((a->acc[i]>threshold)<<(i&7));
|
188
|
+
}
|
189
|
+
}
|
190
|
+
|
191
|
+
/* NOTE - assumes both of length 32 */
|
192
|
+
int
|
193
|
+
nilsimsa(char *a,char *b) {
|
194
|
+
int i,bits=0;
|
195
|
+
assert( a );
|
196
|
+
assert( b );
|
197
|
+
|
198
|
+
for (i=0;i<NSR_CODE_LEN;i++) {
|
199
|
+
bits+=popcount[255&(a[i]^b[i])];
|
200
|
+
}
|
201
|
+
|
202
|
+
return 128-bits;
|
203
|
+
}
|
204
|
+
|
205
|
+
/*
|
206
|
+
void
|
207
|
+
nsr_free( struct nsrecord *r ) {
|
208
|
+
if (r) {
|
209
|
+
free( r );
|
210
|
+
}
|
211
|
+
}
|
212
|
+
*/
|
213
|
+
|
214
|
+
|
215
|
+
/*
|
216
|
+
*
|
217
|
+
* begin ruby wrapper functions
|
218
|
+
*
|
219
|
+
*/
|
220
|
+
|
221
|
+
static struct nsrecord *
|
222
|
+
get_nsr( VALUE obj ) {
|
223
|
+
VALUE wrapped;
|
224
|
+
struct nsrecord *ret;
|
225
|
+
|
226
|
+
if (!RTEST( rb_funcall( obj, rb_intern( "kind_of?" ), 1,
|
227
|
+
rb_eval_string("Nilsimsa")))) {
|
228
|
+
/* FIXME should raise exception */
|
229
|
+
return NULL;
|
230
|
+
}
|
231
|
+
|
232
|
+
wrapped = rb_iv_get( obj, RB_NSR_IVAR );
|
233
|
+
if (Qnil == wrapped) {
|
234
|
+
return NULL;
|
235
|
+
}
|
236
|
+
|
237
|
+
Data_Get_Struct( wrapped, struct nsrecord, ret );
|
238
|
+
return ret;
|
239
|
+
}
|
240
|
+
|
241
|
+
VALUE
|
242
|
+
rbns_init(int argc, VALUE *argv, VALUE self) {
|
243
|
+
VALUE wrapped_nsr;
|
244
|
+
struct nsrecord *r;
|
245
|
+
wrapped_nsr = Data_Make_Struct( rb_cObject, struct nsrecord,
|
246
|
+
NULL, -1, r );
|
247
|
+
rb_iv_set( self, RB_NSR_IVAR, wrapped_nsr );
|
248
|
+
nsr_init( r );
|
249
|
+
|
250
|
+
return rb_funcall2( self, rb_intern( "old_initialize" ), argc, argv );
|
251
|
+
}
|
252
|
+
|
253
|
+
VALUE
|
254
|
+
rbns_update(VALUE self, VALUE data) {
|
255
|
+
struct nsrecord *r;
|
256
|
+
char *chdata;
|
257
|
+
long chdata_len;
|
258
|
+
r = get_nsr( self );
|
259
|
+
|
260
|
+
Check_Type( data, T_STRING );
|
261
|
+
chdata = rb_str2cstr( data, &chdata_len );
|
262
|
+
nsr_update( r, chdata, chdata_len );
|
263
|
+
return data;
|
264
|
+
}
|
265
|
+
|
266
|
+
VALUE
|
267
|
+
rbns_nilsimsa(VALUE self, VALUE other) {
|
268
|
+
long len;
|
269
|
+
char *d1;
|
270
|
+
char *d2;
|
271
|
+
|
272
|
+
d1 = rb_str2cstr( rb_funcall( self, rb_intern( "digest" ), 0 ), &len );
|
273
|
+
if (len < NSR_CODE_LEN) {
|
274
|
+
return Qnil;
|
275
|
+
}
|
276
|
+
|
277
|
+
Check_Type( other, T_STRING );
|
278
|
+
d2 = rb_str2cstr( other, &len );
|
279
|
+
if (len < NSR_CODE_LEN) {
|
280
|
+
return Qnil;
|
281
|
+
}
|
282
|
+
|
283
|
+
return INT2NUM( nilsimsa( d1, d2 ) );
|
284
|
+
}
|
285
|
+
|
286
|
+
VALUE
|
287
|
+
rbns_digest(VALUE self) {
|
288
|
+
struct nsrecord *r=get_nsr( self );
|
289
|
+
|
290
|
+
nsr_digest( r );
|
291
|
+
|
292
|
+
/* reverse a newly created string of the digest */
|
293
|
+
return rb_funcall( rb_str_new( r->code, NSR_CODE_LEN ),
|
294
|
+
rb_intern( "reverse"), 0 );
|
295
|
+
}
|
296
|
+
|
297
|
+
void
|
298
|
+
Init_nilsimsa_native(void) {
|
299
|
+
VALUE rb_cNilsimsa;
|
300
|
+
/* initialize invariant data */
|
301
|
+
filltran();
|
302
|
+
fillpopcount();
|
303
|
+
|
304
|
+
/* this grafts itself over the top of an existing Nilsimsa class */
|
305
|
+
rb_cNilsimsa = rb_eval_string( "Nilsimsa" );
|
306
|
+
/* we'll call old_initialize from our new initialize */
|
307
|
+
rb_define_alias( rb_cNilsimsa, "old_initialize", "initialize" );
|
308
|
+
rb_define_method( rb_cNilsimsa, "initialize", rbns_init, -1 );
|
309
|
+
rb_define_method( rb_cNilsimsa, "update", rbns_update, 1 );
|
310
|
+
rb_define_method( rb_cNilsimsa, "nilsimsa", rbns_nilsimsa, 1 );
|
311
|
+
rb_define_method( rb_cNilsimsa, "digest", rbns_digest, 0 );
|
312
|
+
}
|
data/nilsimsa.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
SPEC = Gem::Specification.new do |spec|
|
2
|
+
# Descriptive and source information for this gem.
|
3
|
+
spec.name = "nilsimsa"
|
4
|
+
spec.version = "1.0.5"
|
5
|
+
spec.summary = "Computes Nilsimsa values. Nilsimsa is a distance based hash"
|
6
|
+
spec.author = "Jonathan Wilkins"
|
7
|
+
spec.email = "jwilkins[at]nospam[dot]bitland[dot]net"
|
8
|
+
spec.has_rdoc = true
|
9
|
+
spec.extra_rdoc_files = ["README"]
|
10
|
+
|
11
|
+
spec.files = %w(README nilsimsa.gemspec nilsimsa.rb bin/nilsimsa
|
12
|
+
examples/simple.rb ext/extconf.rb ext/nilsimsa.c)
|
13
|
+
spec.executables = ['nilsimsa']
|
14
|
+
|
15
|
+
# optional native component
|
16
|
+
spec.extensions = ['ext/extconf.rb']
|
17
|
+
end
|
data/nilsimsa.rb
ADDED
@@ -0,0 +1,182 @@
|
|
1
|
+
# Nilsimsa hash (build 20050414)
|
2
|
+
# Ruby port (C) 2005 Martin Pirker
|
3
|
+
# released under GNU GPL V2 license
|
4
|
+
#
|
5
|
+
# inspired by Digest::Nilsimsa-0.06 from Perl CPAN and
|
6
|
+
# the original C nilsimsa-0.2.4 implementation by cmeclax
|
7
|
+
# http://ixazon.dynip.com/~cmeclax/nilsimsa.html
|
8
|
+
|
9
|
+
class Nilsimsa
|
10
|
+
|
11
|
+
TRAN =
|
12
|
+
"\x02\xD6\x9E\x6F\xF9\x1D\x04\xAB\xD0\x22\x16\x1F\xD8\x73\xA1\xAC" <<
|
13
|
+
"\x3B\x70\x62\x96\x1E\x6E\x8F\x39\x9D\x05\x14\x4A\xA6\xBE\xAE\x0E" <<
|
14
|
+
"\xCF\xB9\x9C\x9A\xC7\x68\x13\xE1\x2D\xA4\xEB\x51\x8D\x64\x6B\x50" <<
|
15
|
+
"\x23\x80\x03\x41\xEC\xBB\x71\xCC\x7A\x86\x7F\x98\xF2\x36\x5E\xEE" <<
|
16
|
+
"\x8E\xCE\x4F\xB8\x32\xB6\x5F\x59\xDC\x1B\x31\x4C\x7B\xF0\x63\x01" <<
|
17
|
+
"\x6C\xBA\x07\xE8\x12\x77\x49\x3C\xDA\x46\xFE\x2F\x79\x1C\x9B\x30" <<
|
18
|
+
"\xE3\x00\x06\x7E\x2E\x0F\x38\x33\x21\xAD\xA5\x54\xCA\xA7\x29\xFC" <<
|
19
|
+
"\x5A\x47\x69\x7D\xC5\x95\xB5\xF4\x0B\x90\xA3\x81\x6D\x25\x55\x35" <<
|
20
|
+
"\xF5\x75\x74\x0A\x26\xBF\x19\x5C\x1A\xC6\xFF\x99\x5D\x84\xAA\x66" <<
|
21
|
+
"\x3E\xAF\x78\xB3\x20\x43\xC1\xED\x24\xEA\xE6\x3F\x18\xF3\xA0\x42" <<
|
22
|
+
"\x57\x08\x53\x60\xC3\xC0\x83\x40\x82\xD7\x09\xBD\x44\x2A\x67\xA8" <<
|
23
|
+
"\x93\xE0\xC2\x56\x9F\xD9\xDD\x85\x15\xB4\x8A\x27\x28\x92\x76\xDE" <<
|
24
|
+
"\xEF\xF8\xB2\xB7\xC9\x3D\x45\x94\x4B\x11\x0D\x65\xD5\x34\x8B\x91" <<
|
25
|
+
"\x0C\xFA\x87\xE9\x7C\x5B\xB1\x4D\xE5\xD4\xCB\x10\xA2\x17\x89\xBC" <<
|
26
|
+
"\xDB\xB0\xE2\x97\x88\x52\xF7\x48\xD3\x61\x2C\x3A\x2B\xD1\x8C\xFB" <<
|
27
|
+
"\xF1\xCD\xE4\x6A\xE7\xA9\xFD\xC4\x37\xC8\xD2\xF6\xDF\x58\x72\x4E"
|
28
|
+
|
29
|
+
POPC =
|
30
|
+
"\x00\x01\x01\x02\x01\x02\x02\x03\x01\x02\x02\x03\x02\x03\x03\x04" <<
|
31
|
+
"\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" <<
|
32
|
+
"\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" <<
|
33
|
+
"\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" <<
|
34
|
+
"\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" <<
|
35
|
+
"\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" <<
|
36
|
+
"\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" <<
|
37
|
+
"\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" <<
|
38
|
+
"\x01\x02\x02\x03\x02\x03\x03\x04\x02\x03\x03\x04\x03\x04\x04\x05" <<
|
39
|
+
"\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" <<
|
40
|
+
"\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" <<
|
41
|
+
"\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" <<
|
42
|
+
"\x02\x03\x03\x04\x03\x04\x04\x05\x03\x04\x04\x05\x04\x05\x05\x06" <<
|
43
|
+
"\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" <<
|
44
|
+
"\x03\x04\x04\x05\x04\x05\x05\x06\x04\x05\x05\x06\x05\x06\x06\x07" <<
|
45
|
+
"\x04\x05\x05\x06\x05\x06\x06\x07\x05\x06\x06\x07\x06\x07\x07\x08"
|
46
|
+
|
47
|
+
def initialize(*data)
|
48
|
+
@threshold=0; @count=0
|
49
|
+
@acc =Array::new(256,0)
|
50
|
+
@lastch0=@lastch1=@lastch2=@lastch3= -1
|
51
|
+
|
52
|
+
data.each do |d| update(d) end if data && (data.size>0)
|
53
|
+
end
|
54
|
+
|
55
|
+
def tran3(a,b,c,n)
|
56
|
+
(((TRAN[(a+n)&255]^TRAN[b]*(n+n+1))+TRAN[(c)^TRAN[n]])&255)
|
57
|
+
end
|
58
|
+
|
59
|
+
def update(data)
|
60
|
+
data.each_byte do |ch|
|
61
|
+
@count +=1
|
62
|
+
if @lastch1>-1 then
|
63
|
+
@acc[tran3(ch,@lastch0,@lastch1,0)] +=1
|
64
|
+
end
|
65
|
+
if @lastch2>-1 then
|
66
|
+
@acc[tran3(ch,@lastch0,@lastch2,1)] +=1
|
67
|
+
@acc[tran3(ch,@lastch1,@lastch2,2)] +=1
|
68
|
+
end
|
69
|
+
if @lastch3>-1 then
|
70
|
+
@acc[tran3(ch,@lastch0,@lastch3,3)] +=1
|
71
|
+
@acc[tran3(ch,@lastch1,@lastch3,4)] +=1
|
72
|
+
@acc[tran3(ch,@lastch2,@lastch3,5)] +=1
|
73
|
+
@acc[tran3(@lastch3,@lastch0,ch,6)] +=1
|
74
|
+
@acc[tran3(@lastch3,@lastch2,ch,7)] +=1
|
75
|
+
end
|
76
|
+
@lastch3=@lastch2
|
77
|
+
@lastch2=@lastch1
|
78
|
+
@lastch1=@lastch0
|
79
|
+
@lastch0=ch
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def digest
|
84
|
+
@total=0;
|
85
|
+
case @count
|
86
|
+
when 0..2:
|
87
|
+
when 3 : @total +=1
|
88
|
+
when 4 : @total +=4
|
89
|
+
else
|
90
|
+
@total +=(8*@count)-28
|
91
|
+
end
|
92
|
+
@threshold=@total/256
|
93
|
+
|
94
|
+
@code=String::new(
|
95
|
+
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" <<
|
96
|
+
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00")
|
97
|
+
(0..255).each do |i|
|
98
|
+
@code[i>>3]+=( ((@acc[i]>@threshold)?(1):(0))<<(i&7) )
|
99
|
+
end
|
100
|
+
|
101
|
+
@code[0..31].reverse
|
102
|
+
end
|
103
|
+
|
104
|
+
def hexdigest
|
105
|
+
digest.unpack("H*")[0]
|
106
|
+
end
|
107
|
+
|
108
|
+
def to_s
|
109
|
+
hexdigest
|
110
|
+
end
|
111
|
+
|
112
|
+
def <<(whatever)
|
113
|
+
update(whatever)
|
114
|
+
end
|
115
|
+
|
116
|
+
def ==(otherdigest)
|
117
|
+
digest == otherdigest
|
118
|
+
end
|
119
|
+
|
120
|
+
def file(thisone)
|
121
|
+
File.open(thisone,"rb") do |f|
|
122
|
+
until f.eof? do update(f.read(10480)) end
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
def nilsimsa(otherdigest)
|
127
|
+
bits=0; myd=digest
|
128
|
+
(0..31).each do |i|
|
129
|
+
bits += POPC[255&myd[i]^otherdigest[i]]
|
130
|
+
end
|
131
|
+
(128-bits)
|
132
|
+
end
|
133
|
+
|
134
|
+
end
|
135
|
+
|
136
|
+
def selftest
|
137
|
+
n1 = Nilsimsa::new;
|
138
|
+
n1.update("abcdefgh")
|
139
|
+
puts "abcdefgh: #{n1.hexdigest=='14c8118000000000030800000004042004189020001308014088003280000078'}"
|
140
|
+
n2 = Nilsimsa::new("abcd","efgh")
|
141
|
+
puts "abcd efgh: #{n2.hexdigest=='14c8118000000000030800000004042004189020001308014088003280000078'}"
|
142
|
+
puts "digest: #{n1 == n2.digest}"
|
143
|
+
n1.update("ijk")
|
144
|
+
puts "ijk: #{n1.hexdigest=='14c811840010000c0328200108040630041890200217582d4098103280000078'}"
|
145
|
+
puts "nilsimsa: #{n1.nilsimsa(n2.digest)==109}"
|
146
|
+
puts
|
147
|
+
end
|
148
|
+
|
149
|
+
if __FILE__ == $0 then
|
150
|
+
if ARGV.size>0 then
|
151
|
+
begin # load C core - if available
|
152
|
+
require 'nilsimsa_native'
|
153
|
+
rescue LoadError => e
|
154
|
+
# ignore lack of native module
|
155
|
+
end
|
156
|
+
|
157
|
+
ARGV.each do |filename|
|
158
|
+
if FileTest::exists?(filename) then
|
159
|
+
n = Nilsimsa::new
|
160
|
+
n.file(filename)
|
161
|
+
puts n.hexdigest+" #{filename}"
|
162
|
+
else
|
163
|
+
puts "error: can't find '#{filename}'"
|
164
|
+
end
|
165
|
+
end
|
166
|
+
else
|
167
|
+
puts 'Running selftest using native ruby version'
|
168
|
+
selftest
|
169
|
+
begin # load C core - if available
|
170
|
+
if File.exists?('./nilsimsa_native')
|
171
|
+
require './nilsimsa_native'
|
172
|
+
puts 'Running selftest using compiled nilsimsa in current dir'
|
173
|
+
else
|
174
|
+
require 'nilsimsa_native'
|
175
|
+
puts 'Running selftest using compiled nilsimsa'
|
176
|
+
end
|
177
|
+
selftest
|
178
|
+
rescue LoadError => e
|
179
|
+
puts "Couldnt run selftest with compiled nilsimsa"
|
180
|
+
end
|
181
|
+
end
|
182
|
+
end
|
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: jwilkins-nilsimsa
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.5
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jonathan Wilkins
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-05-16 00:00:00 -07:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description:
|
17
|
+
email: jwilkins[at]nospam[dot]bitland[dot]net
|
18
|
+
executables:
|
19
|
+
- nilsimsa
|
20
|
+
extensions:
|
21
|
+
- ext/extconf.rb
|
22
|
+
extra_rdoc_files:
|
23
|
+
- README
|
24
|
+
files:
|
25
|
+
- README
|
26
|
+
- nilsimsa.gemspec
|
27
|
+
- nilsimsa.rb
|
28
|
+
- bin/nilsimsa
|
29
|
+
- examples/simple.rb
|
30
|
+
- ext/extconf.rb
|
31
|
+
- ext/nilsimsa.c
|
32
|
+
has_rdoc: true
|
33
|
+
homepage:
|
34
|
+
licenses:
|
35
|
+
post_install_message:
|
36
|
+
rdoc_options: []
|
37
|
+
|
38
|
+
require_paths:
|
39
|
+
- lib
|
40
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
41
|
+
requirements:
|
42
|
+
- - ">="
|
43
|
+
- !ruby/object:Gem::Version
|
44
|
+
version: "0"
|
45
|
+
version:
|
46
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - ">="
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: "0"
|
51
|
+
version:
|
52
|
+
requirements: []
|
53
|
+
|
54
|
+
rubyforge_project:
|
55
|
+
rubygems_version: 1.3.5
|
56
|
+
signing_key:
|
57
|
+
specification_version: 2
|
58
|
+
summary: Computes Nilsimsa values. Nilsimsa is a distance based hash
|
59
|
+
test_files: []
|
60
|
+
|