tyler-bloom_filter 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +23 -0
- data/VERSION.yml +4 -0
- data/ext/bloom_filter/Makefile +149 -0
- data/ext/bloom_filter/bloom.c +140 -0
- data/ext/bloom_filter/bloom.h +20 -0
- data/ext/bloom_filter/bloom_filter.c +74 -0
- data/ext/bloom_filter/extconf.rb +3 -0
- data/lib/bloom_filter.rb +1 -0
- data/spec/bloom_filter_spec.rb +86 -0
- metadata +63 -0
data/README.textile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
h1. Bloom Filter
|
|
2
|
+
|
|
3
|
+
This is a Bloom filter for Ruby written in C.
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
h2. What is a Bloom filter?
|
|
7
|
+
|
|
8
|
+
I suck at explaining things. Wikipedia doesn't. http://wikipedia.org/wiki/Bloom_filter.
|
|
9
|
+
|
|
10
|
+
But in short, a Bloom filter is a probabilistic data structure which is used to test for membership in a set. False positives can happen, but false negatives cannot. The exact error rate of a Bloom filter can be tuned by changing its size.
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
h2. Why would I use a Bloom filter?
|
|
14
|
+
|
|
15
|
+
They're extremely compact and quite fast. The size of a Bloom filter is dependent upon the number of keys you'll be inserting into it and the acceptable error rate. The smaller the size of the bloom filter, the higher the error rate.
|
|
16
|
+
|
|
17
|
+
So, one idea is that you could use a Bloom filter as a pre-filter for database calls. Before requesting a particular id from the database, you could query the Bloom filter. If it returns true for the id, you can query the database for the data. If it returns false, you don't. Since false negatives cannot happen, you'll never miss querying the database when you should. However, because false positives can happen, you may (at a particular error rate) query the database when you don't necessarily need to. But, as before, this can be tuned according to your needs.
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
h2. Todo
|
|
21
|
+
|
|
22
|
+
* Add benchmarks
|
|
23
|
+
* Add graphs of size vs. error rate
|
data/VERSION.yml
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
|
|
2
|
+
SHELL = /bin/sh
|
|
3
|
+
|
|
4
|
+
#### Start of system configuration section. ####
|
|
5
|
+
|
|
6
|
+
srcdir = .
|
|
7
|
+
topdir = /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/lib/ruby/1.8/universal-darwin9.0
|
|
8
|
+
hdrdir = $(topdir)
|
|
9
|
+
VPATH = $(srcdir):$(topdir):$(hdrdir)
|
|
10
|
+
prefix = $(DESTDIR)/System/Library/Frameworks/Ruby.framework/Versions/1.8/usr
|
|
11
|
+
exec_prefix = $(prefix)
|
|
12
|
+
sitedir = $(DESTDIR)/Library/Ruby/Site
|
|
13
|
+
rubylibdir = $(libdir)/ruby/$(ruby_version)
|
|
14
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
|
15
|
+
dvidir = $(docdir)
|
|
16
|
+
datarootdir = $(prefix)/share
|
|
17
|
+
archdir = $(rubylibdir)/$(arch)
|
|
18
|
+
sbindir = $(exec_prefix)/sbin
|
|
19
|
+
psdir = $(docdir)
|
|
20
|
+
localedir = $(datarootdir)/locale
|
|
21
|
+
htmldir = $(docdir)
|
|
22
|
+
datadir = $(datarootdir)
|
|
23
|
+
includedir = $(prefix)/include
|
|
24
|
+
infodir = $(DESTDIR)/usr/share/info
|
|
25
|
+
sysconfdir = $(prefix)/etc
|
|
26
|
+
mandir = $(DESTDIR)/usr/share/man
|
|
27
|
+
libdir = $(exec_prefix)/lib
|
|
28
|
+
sharedstatedir = $(prefix)/com
|
|
29
|
+
oldincludedir = $(DESTDIR)/usr/include
|
|
30
|
+
pdfdir = $(docdir)
|
|
31
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
|
32
|
+
bindir = $(exec_prefix)/bin
|
|
33
|
+
localstatedir = $(prefix)/var
|
|
34
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
|
35
|
+
libexecdir = $(exec_prefix)/libexec
|
|
36
|
+
|
|
37
|
+
CC = gcc
|
|
38
|
+
LIBRUBY = $(LIBRUBY_SO)
|
|
39
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
|
40
|
+
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
|
|
41
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)
|
|
42
|
+
|
|
43
|
+
RUBY_EXTCONF_H =
|
|
44
|
+
CFLAGS = -fno-common -arch ppc -arch i386 -Os -pipe -fno-common
|
|
45
|
+
INCFLAGS = -I. -I$(topdir) -I$(hdrdir) -I$(srcdir)
|
|
46
|
+
CPPFLAGS =
|
|
47
|
+
CXXFLAGS = $(CFLAGS)
|
|
48
|
+
DLDFLAGS = -L. -arch ppc -arch i386
|
|
49
|
+
LDSHARED = cc -arch ppc -arch i386 -pipe -bundle -undefined dynamic_lookup
|
|
50
|
+
AR = ar
|
|
51
|
+
EXEEXT =
|
|
52
|
+
|
|
53
|
+
RUBY_INSTALL_NAME = ruby
|
|
54
|
+
RUBY_SO_NAME = ruby
|
|
55
|
+
arch = universal-darwin9.0
|
|
56
|
+
sitearch = universal-darwin9.0
|
|
57
|
+
ruby_version = 1.8
|
|
58
|
+
ruby = /System/Library/Frameworks/Ruby.framework/Versions/1.8/usr/bin/ruby
|
|
59
|
+
RUBY = $(ruby)
|
|
60
|
+
RM = rm -f
|
|
61
|
+
MAKEDIRS = mkdir -p
|
|
62
|
+
INSTALL = /usr/bin/install -c
|
|
63
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
|
64
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
|
65
|
+
COPY = cp
|
|
66
|
+
|
|
67
|
+
#### End of system configuration section. ####
|
|
68
|
+
|
|
69
|
+
preload =
|
|
70
|
+
|
|
71
|
+
libpath = . $(libdir)
|
|
72
|
+
LIBPATH = -L"." -L"$(libdir)"
|
|
73
|
+
DEFFILE =
|
|
74
|
+
|
|
75
|
+
CLEANFILES = mkmf.log
|
|
76
|
+
DISTCLEANFILES =
|
|
77
|
+
|
|
78
|
+
extout =
|
|
79
|
+
extout_prefix =
|
|
80
|
+
target_prefix =
|
|
81
|
+
LOCAL_LIBS =
|
|
82
|
+
LIBS = $(LIBRUBYARG_SHARED) -lpthread -ldl -lm
|
|
83
|
+
SRCS = bloom.c bloom_filter.c
|
|
84
|
+
OBJS = bloom.o bloom_filter.o
|
|
85
|
+
TARGET = bloom_filter
|
|
86
|
+
DLLIB = $(TARGET).bundle
|
|
87
|
+
EXTSTATIC =
|
|
88
|
+
STATIC_LIB =
|
|
89
|
+
|
|
90
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
|
91
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
|
92
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
|
93
|
+
|
|
94
|
+
TARGET_SO = $(DLLIB)
|
|
95
|
+
CLEANLIBS = $(TARGET).bundle $(TARGET).il? $(TARGET).tds $(TARGET).map
|
|
96
|
+
CLEANOBJS = *.o *.a *.s[ol] *.pdb *.exp *.bak
|
|
97
|
+
|
|
98
|
+
# Top-level goals and housekeeping targets.
# None of these names is a file produced by the build, so they are declared
# phony: a stray file called `all` or `clean` in this directory must not stop
# the corresponding recipe from running.
.PHONY: all static clean distclean realclean

all: $(DLLIB)
static: $(STATIC_LIB)

# Remove build products. The leading `-` keeps going if a file is missing.
clean:
	@-$(RM) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES)

# Additionally remove everything generated by extconf.rb / mkmf.
distclean: clean
	@-$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
	@-$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)

realclean: distclean
|
|
109
|
+
# Installation targets. They are commands rather than files, so declare them
# phony.
.PHONY: install install-so install-rb install-rb-default
.PHONY: pre-install-rb pre-install-rb-default
.PHONY: site-install site-install-so site-install-rb

install: install-so install-rb

install-so: $(RUBYARCHDIR)
install-so: $(RUBYARCHDIR)/$(DLLIB)

# The destination directory is an order-only prerequisite: it must exist
# before the bundle is copied (also under `make -j`), but its timestamp must
# not force a re-install.
$(RUBYARCHDIR)/$(DLLIB): $(DLLIB) | $(RUBYARCHDIR)
	$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)

# This extension ships no Ruby files of its own, so the *-rb targets only
# chain to each other.
install-rb: pre-install-rb install-rb-default
install-rb-default: pre-install-rb-default
pre-install-rb: Makefile
pre-install-rb-default: Makefile

$(RUBYARCHDIR):
	$(MAKEDIRS) $@

site-install: site-install-so site-install-rb
site-install-so: install-so
site-install-rb: install-rb
|
|
125
|
+
|
|
126
|
+
# Suffixes taking part in the suffix rules below.
.SUFFIXES: .c .m .cc .cxx .cpp .C .o

# C++ sources, one rule per recognised extension. Every rule compiles the
# first prerequisite ($<) into an object file in the current directory.
.cc.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.cxx.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.cpp.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

.C.o:
	$(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $<

# Plain C sources (the only kind listed in SRCS).
.c.o:
	$(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) -c $<
|
|
142
|
+
|
|
143
|
+
# Link the extension bundle from the object files. Any previous bundle is
# removed first; a failure of that removal is ignored.
$(DLLIB): $(OBJS)
	@-$(RM) $@
	$(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)

# Every object is rebuilt when the Ruby headers change (located via VPATH).
$(OBJS): ruby.h defines.h
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
#include <stdlib.h>
|
|
2
|
+
#include <stdio.h>
|
|
3
|
+
#include <string.h>
|
|
4
|
+
#include <math.h>
|
|
5
|
+
#include "bloom.h"
|
|
6
|
+
|
|
7
|
+
#define TRUE 1
|
|
8
|
+
#define FALSE 0
|
|
9
|
+
/*
 * Mix three unsigned words in place. The arguments must be modifiable
 * lvalues; each is read and written several times.
 *
 * The body is wrapped in do { ... } while (0) so that `jen_mix(a, b, c);`
 * expands to exactly one statement and can be used as the body of an
 * un-braced if/else.
 */
#define jen_mix(a,b,c) do { \
    a -= b; a -= c; a ^= ( c >> 13 ); \
    b -= c; b -= a; b ^= ( a << 8 ); \
    c -= a; c -= b; c ^= ( b >> 13 ); \
    a -= b; a -= c; a ^= ( c >> 12 ); \
    b -= c; b -= a; b ^= ( a << 16 ); \
    c -= a; c -= b; c ^= ( b >> 5 ); \
    a -= b; a -= c; a ^= ( c >> 3 ); \
    b -= c; b -= a; b ^= ( a << 10 ); \
    c -= a; c -= b; c ^= ( b >> 15 ); \
} while (0)
|
|
20
|
+
|
|
21
|
+
/*
 * Bit index for `key` under the salt at position `salt_idx`: the salted hash
 * of the key reduced modulo the filter's bitset size.
 *
 * The expansion is a single parenthesised expression with no trailing
 * semicolon, and every argument is parenthesised, so the macro can be used
 * inside larger expressions. filter->bitset_size must be non-zero.
 */
#define bloom_index_jen(filter,key,len,salt_idx) \
    (bloom_hash_jen((void*)(key), (len), *((filter)->salts + (salt_idx))) % (filter)->bitset_size)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
Bloom *bloom_alloc() {
|
|
26
|
+
return (Bloom*)(malloc(sizeof(Bloom)));
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
void bloom_set_hashes(Bloom* filter, int hash_count) {
|
|
30
|
+
filter->hash_count = hash_count;
|
|
31
|
+
filter->salts = (unsigned int*)(malloc(sizeof(int) * hash_count));
|
|
32
|
+
int i;
|
|
33
|
+
for(i = 0; i < hash_count; i++)
|
|
34
|
+
*(filter->salts + i) = (unsigned int)rand();
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
void bloom_set_bitset(Bloom* filter, int bitset_size) {
|
|
38
|
+
filter->bitset_size = bitset_size;
|
|
39
|
+
filter->bitset = (bitstr_t*)(malloc(bitstr_size(bitset_size)));
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
Bloom *bloom_new(int bitset_size, int hash_count) {
|
|
43
|
+
Bloom *filter = bloom_alloc();
|
|
44
|
+
bloom_set_bitset(filter, bitset_size);
|
|
45
|
+
bloom_set_hashes(filter, hash_count);
|
|
46
|
+
return filter;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
int bloom_size_for_error(double error, int key_count) {
|
|
50
|
+
return ceil((key_count * log(error)) / log(1.0 / (pow(2.0, log(2.0)))));
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
int bloom_ideal_hash_count(int size, int key_count) {
|
|
54
|
+
return round(log(2.0) * size / key_count);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
Bloom *bloom_for_error_and_keys(double error, int key_count) {
|
|
58
|
+
int size = bloom_size_for_error(error, key_count);
|
|
59
|
+
int hashes = bloom_ideal_hash_count(size, key_count);
|
|
60
|
+
return bloom_alloc(size, hashes);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
void bloom_free(Bloom *filter) {
|
|
64
|
+
free(filter->bitset);
|
|
65
|
+
free(filter);
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
/*
 * Record `key` (a buffer of `key_size` bytes) in the filter by setting one
 * bit per hash function.
 *
 * Does nothing when the filter has no bit array or a non-positive size:
 * reducing the hash modulo a zero size would be a division by zero.
 */
void bloom_add(Bloom *filter, void *key, unsigned int key_size) {
    int i;

    if (filter == NULL || filter->bitset == NULL || filter->bitset_size <= 0)
        return;

    for (i = 0; i < filter->hash_count; i++) {
        int bit = bloom_index_jen(filter, key, key_size, i);
        bit_set(filter->bitset, bit);
    }
}
|
|
76
|
+
|
|
77
|
+
/*
 * Test whether `key` (a buffer of `key_size` bytes) may have been added.
 *
 * Returns FALSE as soon as one of the key's bits is found clear, and TRUE
 * when every bit is set. A filter with no bit array or a non-positive size
 * cannot contain anything, so FALSE is returned for it; this also avoids
 * reducing the hash modulo zero.
 */
int bloom_get(Bloom *filter, void *key, unsigned int key_size) {
    int i;

    if (filter == NULL || filter->bitset == NULL || filter->bitset_size <= 0)
        return FALSE;

    for (i = 0; i < filter->hash_count; i++) {
        int bit = bloom_index_jen(filter, key, key_size, i);
        if (bit_test(filter->bitset, bit) == 0)
            return FALSE;
    }

    return TRUE;
}
|
|
87
|
+
|
|
88
|
+
/*
 * Hash `length` bytes starting at `key`, seeded with `salt`.
 *
 * The input is consumed in 12-byte groups, each read as three little-endian
 * 32-bit words and folded into the state with jen_mix(). The 0-11 leftover
 * bytes are added by a fall-through switch, and one final jen_mix() produces
 * the result.
 * NOTE(review): the structure and constants look like Bob Jenkins' one-word
 * "lookup2" hash - confirm before relying on published test vectors.
 */
unsigned int bloom_hash_jen (void *key, unsigned int length, unsigned int salt) {
    unsigned char *p = (unsigned char*) key;
    unsigned int remaining;
    unsigned a = 0x9e3779b9;
    unsigned b = 0x9e3779b9;
    unsigned c = salt;

    for ( remaining = length; remaining >= 12; remaining -= 12, p += 12 ) {
        a += ( p[0] + ( (unsigned)p[1] << 8 )
                    + ( (unsigned)p[2] << 16 )
                    + ( (unsigned)p[3] << 24 ) );
        b += ( p[4] + ( (unsigned)p[5] << 8 )
                    + ( (unsigned)p[6] << 16 )
                    + ( (unsigned)p[7] << 24 ) );
        c += ( p[8] + ( (unsigned)p[9] << 8 )
                    + ( (unsigned)p[10] << 16 )
                    + ( (unsigned)p[11] << 24 ) );

        jen_mix ( a, b, c );
    }

    /* The total length (not the leftover count) goes into the low byte of c. */
    c += length;

    /* Every case deliberately falls through to the ones below it. */
    switch ( remaining ) {
        case 11: c += ( (unsigned)p[10] << 24 );
        case 10: c += ( (unsigned)p[9] << 16 );
        case 9 : c += ( (unsigned)p[8] << 8 );
        /* First byte of c reserved for length */
        case 8 : b += ( (unsigned)p[7] << 24 );
        case 7 : b += ( (unsigned)p[6] << 16 );
        case 6 : b += ( (unsigned)p[5] << 8 );
        case 5 : b += p[4];
        case 4 : a += ( (unsigned)p[3] << 24 );
        case 3 : a += ( (unsigned)p[2] << 16 );
        case 2 : a += ( (unsigned)p[1] << 8 );
        case 1 : a += p[0];
    }

    jen_mix ( a, b, c );

    return c;
}
|
|
134
|
+
|
|
135
|
+
/* Print the filter's bitset size and hash count to stdout, one per line. */
void bloom_status(Bloom *filter) {
    printf("Bitset Size: %d\n" "Hash Count: %d\n",
           filter->bitset_size,
           filter->hash_count);
}
|
|
139
|
+
|
|
140
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#include <bitstring.h>
|
|
2
|
+
|
|
3
|
+
typedef struct {
|
|
4
|
+
bitstr_t *bitset;
|
|
5
|
+
int bitset_size;
|
|
6
|
+
int hash_count;
|
|
7
|
+
unsigned int *salts;
|
|
8
|
+
} Bloom;
|
|
9
|
+
|
|
10
|
+
Bloom *bloom_alloc();
|
|
11
|
+
Bloom *bloom_new(int bitset_size, int hash_count);
|
|
12
|
+
void bloom_set_hashes(Bloom* filter, int hash_count);
|
|
13
|
+
void bloom_set_bitset(Bloom* filter, int hash_count);
|
|
14
|
+
Bloom *bloom_for_error_and_keys(double error, int key_count);
|
|
15
|
+
void bloom_free(Bloom *filter);
|
|
16
|
+
void bloom_add(Bloom *filter, void *key, unsigned int key_size);
|
|
17
|
+
int bloom_get(Bloom *filter, void *key, unsigned int key_size);
|
|
18
|
+
unsigned int bloom_hash_jen(void *key, unsigned int length, unsigned int salt);
|
|
19
|
+
int bloom_size_for_error(double error, int key_count);
|
|
20
|
+
int bloom_ideal_hash_count(int size, int key_count);
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
#include "bloom.h"
|
|
2
|
+
#include "ruby.h"
|
|
3
|
+
#include <stdlib.h>
|
|
4
|
+
|
|
5
|
+
VALUE cBloomFilter;
|
|
6
|
+
|
|
7
|
+
void rb_bloom_free(void *bloom) {
|
|
8
|
+
bloom_free(bloom);
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
/*
 * Allocator for BloomFilter: wraps a fresh Bloom structure in a Ruby object
 * whose free hook is rb_bloom_free.
 *
 * Raises NoMemoryError if the structure cannot be allocated. The fields are
 * cleared here so that the free hook sees NULL pointers if the object is
 * collected before (or without) a successful initialize.
 */
static VALUE rb_bloom_alloc(VALUE klass) {
    Bloom *filter = bloom_alloc();

    if (filter == NULL)
        rb_memerror();

    filter->bitset = NULL;
    filter->bitset_size = 0;
    filter->salts = NULL;
    filter->hash_count = 0;

    return Data_Wrap_Struct(klass, 0, rb_bloom_free, filter);
}
|
|
15
|
+
|
|
16
|
+
/* Return the Bloom structure wrapped by the Ruby object `self`. */
Bloom *get_bloom(VALUE self) {
    Bloom *wrapped;
    Data_Get_Struct(self, Bloom, wrapped);
    return wrapped;
}
|
|
21
|
+
|
|
22
|
+
/*
 * BloomFilter#initialize(bitset_size, hash_count)
 *
 * Raises ArgumentError unless both arguments are positive: a zero-sized
 * bitset would later be used as a modulus.
 */
static VALUE rb_bloom_initialize(VALUE self, VALUE bitset_size, VALUE hash_count) {
    Bloom *filter = get_bloom(self);
    int bits;
    int hashes;

    /* Put the structure in a known state before anything below can raise,
       so the free hook never sees a pointer that was never assigned. */
    filter->bitset = NULL;
    filter->bitset_size = 0;
    filter->salts = NULL;
    filter->hash_count = 0;

    bits = NUM2INT(bitset_size);
    hashes = NUM2INT(hash_count);

    if (bits <= 0)
        rb_raise(rb_eArgError, "bitset size must be positive");
    if (hashes <= 0)
        rb_raise(rb_eArgError, "hash count must be positive");

    bloom_set_bitset(filter, bits);
    bloom_set_hashes(filter, hashes);
    return self;
}
|
|
28
|
+
|
|
29
|
+
/*
 * BloomFilter#set(key) - add `key` to the filter; always returns true.
 *
 * The key's Ruby hash value is what gets stored. It is read as a long and
 * reduced to an unsigned int, because a hash value can exceed the range of
 * int where long is wider, and NUM2INT would then raise RangeError. For
 * values that fit in an int the bytes handed to bloom_add() are unchanged.
 */
static VALUE rb_bloom_set(VALUE self, VALUE key) {
    Bloom *filter = get_bloom(self);
    unsigned int hash_key = (unsigned int)NUM2LONG(rb_hash(key));
    bloom_add(filter, &hash_key, sizeof(hash_key));
    return Qtrue;
}
|
|
35
|
+
|
|
36
|
+
/*
 * BloomFilter#get(key) - true if `key` may have been added, nil otherwise.
 *
 * The key's Ruby hash value is read as a long and reduced to an unsigned
 * int, because a hash value can exceed the range of int where long is wider,
 * and NUM2INT would then raise RangeError. For values that fit in an int the
 * bytes handed to bloom_get() are unchanged.
 */
static VALUE rb_bloom_get(VALUE self, VALUE key) {
    Bloom *filter = get_bloom(self);
    unsigned int hash_key = (unsigned int)NUM2LONG(rb_hash(key));
    return bloom_get(filter, &hash_key, sizeof(hash_key)) ? Qtrue : Qnil;
}
|
|
41
|
+
|
|
42
|
+
/*
 * BloomFilter#filter_size - number of bits in the filter's bitset.
 *
 * INT2NUM is used rather than INT2FIX so that a size outside the Fixnum
 * range is still converted correctly.
 */
static VALUE rb_bloom_filter_size(VALUE self) {
    Bloom *filter = get_bloom(self);
    return INT2NUM(filter->bitset_size);
}
|
|
46
|
+
|
|
47
|
+
/* BloomFilter#hash_count - number of hash functions the filter uses. */
static VALUE rb_bloom_hash_count(VALUE self) {
    int hashes = get_bloom(self)->hash_count;
    return INT2FIX(hashes);
}
|
|
51
|
+
|
|
52
|
+
/*
 * BloomFilter.for_error_rate(error_rate, key_count)
 *
 * Returns a new filter sized for `key_count` keys at the given
 * false-positive rate.
 *
 * Raises ArgumentError unless 0 < error_rate < 1 and key_count > 0; outside
 * those ranges the sizing formulas take log(0) or divide by zero. The
 * arguments are converted and checked before the filter object is
 * allocated, so a failed call leaves no half-built filter behind.
 */
static VALUE rb_bloom_for_error_rate(VALUE self, VALUE error_rate, VALUE key_count) {
    double error = NUM2DBL(error_rate);
    int keys = NUM2INT(key_count);
    VALUE rb_filter;
    Bloom *filter;
    int size;
    int hash_count;

    /* Written as a negated conjunction so that NaN is rejected as well. */
    if (!(error > 0.0 && error < 1.0))
        rb_raise(rb_eArgError, "error rate must be greater than 0 and less than 1");
    if (keys <= 0)
        rb_raise(rb_eArgError, "key count must be positive");

    size = bloom_size_for_error(error, keys);
    hash_count = bloom_ideal_hash_count(size, keys);

    rb_filter = rb_bloom_alloc(cBloomFilter);
    filter = get_bloom(rb_filter);
    bloom_set_bitset(filter, size);
    bloom_set_hashes(filter, hash_count);

    return rb_filter;
}
|
|
64
|
+
|
|
65
|
+
void Init_bloom_filter() {
|
|
66
|
+
cBloomFilter = rb_define_class("BloomFilter", rb_cObject);
|
|
67
|
+
rb_define_alloc_func(cBloomFilter, rb_bloom_alloc);
|
|
68
|
+
rb_define_method(cBloomFilter, "initialize", rb_bloom_initialize, 2);
|
|
69
|
+
rb_define_method(cBloomFilter, "get", rb_bloom_get, 1);
|
|
70
|
+
rb_define_method(cBloomFilter, "set", rb_bloom_set, 1);
|
|
71
|
+
rb_define_method(cBloomFilter, "filter_size", rb_bloom_filter_size, 0);
|
|
72
|
+
rb_define_method(cBloomFilter, "hash_count", rb_bloom_hash_count, 0);
|
|
73
|
+
rb_define_singleton_method(cBloomFilter, "for_error_rate", rb_bloom_for_error_rate, 2);
|
|
74
|
+
}
|
data/lib/bloom_filter.rb
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
require File.dirname(__FILE__) + '/../ext/bloom_filter'
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
require File.dirname(__FILE__) + '/../ext/bloom_filter/bloom_filter'
|
|
2
|
+
|
|
3
|
+
describe BloomFilter do
  describe :new do
    it 'returns an instance of BloomFilter' do
      filter = BloomFilter.new(1024, 3)
      filter.should be_an_instance_of(BloomFilter)
    end
  end

  describe :set do
    it 'accepts a key of any type' do
      filter = BloomFilter.new(1024, 3)

      # One key per kind of object: integer, string, symbol, array, hash, nil.
      [123, 'abc', :foo, [1, 2, 3], { :a => 1 }, nil].each do |key|
        lambda { filter.set(key) }.should_not raise_error
      end
    end

    it 'returns true' do
      BloomFilter.new(1024, 3).set(123).should be_true
    end
  end

  describe :get do
    before :each do
      # The hash is kept in an instance variable so the examples look up the
      # very same object that was stored.
      @hash = { :a => 1 }
      @filter = BloomFilter.new(1024, 3)
      [123, 'abc', :foo, [1, 2, 3], @hash, nil].each do |key|
        @filter.set(key)
      end
    end

    it 'returns nil when the key does not exist' do
      @filter.get('not in the filter').should be_nil
    end

    it 'returns true when the key exists' do
      # Fresh literals on purpose: equal keys must be found, not just the
      # identical objects that were stored.
      [123, 'abc', :foo, [1, 2, 3], @hash, nil].each do |key|
        @filter.get(key).should be_true
      end
    end
  end

  describe :filter_size do
    it 'returns the size of the bitset used in the filter' do
      [1024, 10].each do |bits|
        BloomFilter.new(bits, 3).filter_size.should == bits
      end
    end
  end

  describe :hash_count do
    it 'returns the number of hash functions used in the filter' do
      [3, 10].each do |hashes|
        BloomFilter.new(1024, hashes).hash_count.should == hashes
      end
    end
  end

  describe :for_error_rate do
    it 'returns a filter of the proper size' do
      # [error rate, key count, expected bitset size]
      [[0.05, 100, 624], [0.10, 100, 480], [0.10, 50, 240]].each do |rate, keys, bits|
        BloomFilter.for_error_rate(rate, keys).filter_size.should == bits
      end
    end

    it 'returns a filter with the correct hash count' do
      # [error rate, expected hash count], always for 1000 keys
      [[0.1, 3], [0.01, 7], [0.001, 10]].each do |rate, hashes|
        BloomFilter.for_error_rate(rate, 1000).hash_count.should == hashes
      end
    end
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: tyler-bloom_filter
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.1
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Tyler McMullen
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
|
|
12
|
+
date: 2009-05-11 00:00:00 -07:00
|
|
13
|
+
default_executable:
|
|
14
|
+
dependencies: []
|
|
15
|
+
|
|
16
|
+
description: Fast Bloom filter in C and Ruby.
|
|
17
|
+
email: tyler@scribd.com
|
|
18
|
+
executables: []
|
|
19
|
+
|
|
20
|
+
extensions:
|
|
21
|
+
- ext/bloom_filter/extconf.rb
|
|
22
|
+
extra_rdoc_files: []
|
|
23
|
+
|
|
24
|
+
files:
|
|
25
|
+
- README.textile
|
|
26
|
+
- VERSION.yml
|
|
27
|
+
- lib/bloom_filter.rb
|
|
28
|
+
- spec/bloom_filter_spec.rb
|
|
29
|
+
- ext/bloom_filter
|
|
30
|
+
- ext/bloom_filter/bloom.c
|
|
31
|
+
- ext/bloom_filter/bloom.h
|
|
32
|
+
- ext/bloom_filter/bloom_filter.c
|
|
33
|
+
- ext/bloom_filter/extconf.rb
|
|
34
|
+
- ext/bloom_filter/Makefile
|
|
35
|
+
has_rdoc: false
|
|
36
|
+
homepage: http://github.com/tyler/bloom_filter
|
|
37
|
+
post_install_message:
|
|
38
|
+
rdoc_options: []
|
|
39
|
+
|
|
40
|
+
require_paths:
|
|
41
|
+
- ext
|
|
42
|
+
- lib
|
|
43
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
44
|
+
requirements:
|
|
45
|
+
- - ">="
|
|
46
|
+
- !ruby/object:Gem::Version
|
|
47
|
+
version: "0"
|
|
48
|
+
version:
|
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - ">="
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: "0"
|
|
54
|
+
version:
|
|
55
|
+
requirements: []
|
|
56
|
+
|
|
57
|
+
rubyforge_project:
|
|
58
|
+
rubygems_version: 1.2.0
|
|
59
|
+
signing_key:
|
|
60
|
+
specification_version: 2
|
|
61
|
+
summary: Fast Bloom filter in C and Ruby.
|
|
62
|
+
test_files: []
|
|
63
|
+
|