bloom_fit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +78 -0
- data/Rakefile +14 -0
- data/ext/cbloomfilter/cbloomfilter.c +397 -0
- data/ext/cbloomfilter/crc32.h +76 -0
- data/ext/cbloomfilter/extconf.rb +4 -0
- data/lib/bloom_fit/version.rb +3 -0
- data/lib/bloom_fit.rb +107 -0
- data/spec/bloom_fit_spec.rb +152 -0
- data/spec/helper.rb +2 -0
- metadata +119 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 11be2e4492a3e06ff7c401ebe04e8f5b462a48d8399dea08ecdc33726cbec31f
|
|
4
|
+
data.tar.gz: 0642065d4004d002fc51cbe07f77a92daf5b84b117abc6684c45f7ca82b7d757
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 840f467007a4efc4bcf55c4e29c320d28b16daa819b1f54a1884c43bb5092add5c4522a2e6e524a5516031e48bc937eb4bfddf9e20767d85b18590b1245c1d23
|
|
7
|
+
data.tar.gz: 58b45319a8cc83342ab224f0430bd8849b4655b2be0b1758d788dbc68df7ecca9cc5ab2deb1c33419e07dc6d0f6c8c22a4c158e7001fc696ee695273bb9ad196
|
data/README.md
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# BloomFit makes Bloom Filter tuning easy
|
|
2
|
+
|
|
3
|
+
BloomFit provides an MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but provides a better hashing distribution by using DJB2 over CRC32, avoids the need to supply a seed, removes counting abilities, improves performance for very large datasets, and will automatically calculate the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate.
|
|
4
|
+
|
|
5
|
+
A [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter) is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positives are possible, but false negatives are not. Instead of using k different hash functions, this implementation uses a single DJB2 hash combined with k seeds taken from the CRC table.
|
|
6
|
+
|
|
7
|
+
Performance of the Bloom filter depends on the following:
|
|
8
|
+
|
|
9
|
+
- size of the bit array
|
|
10
|
+
- number of hash functions
|
|
11
|
+
|
|
12
|
+
BloomFit is a fork of [bloomfilter-rb].
|
|
13
|
+
|
|
14
|
+
## Resources
|
|
15
|
+
|
|
16
|
+
- Background: [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter)
|
|
17
|
+
- Determining parameters: [Scalable Datasets: Bloom Filters in Ruby](http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/)
|
|
18
|
+
- Applications & reasons behind bloom filter: [Flow analysis: Time based bloom filter](http://www.igvita.com/2010/01/06/flow-analysis-time-based-bloom-filters/)
|
|
19
|
+
|
|
20
|
+
## Examples
|
|
21
|
+
|
|
22
|
+
MRI/C implementation which creates an in-memory filter which can be saved and reloaded from disk.
|
|
23
|
+
|
|
24
|
+
(COMING SOON) If you'd like to specify an expected item count and a false-positive rate that you can tolerate:
|
|
25
|
+
|
|
26
|
+
```ruby
|
|
27
|
+
require "bloom_fit"
|
|
28
|
+
|
|
29
|
+
bf = BloomFit.new(capacity: 250, false_positive_rate: 0.001)
|
|
30
|
+
bf.add("cat")
|
|
31
|
+
bf.include?("cat") # => true
|
|
32
|
+
bf.include?("dog") # => false
|
|
33
|
+
|
|
34
|
+
# Hash syntax with a bloom filter!
|
|
35
|
+
bf["bird"] = "bar"
|
|
36
|
+
bf["bird"] # => true
|
|
37
|
+
bf["mouse"] # => false
|
|
38
|
+
|
|
39
|
+
bf.stats
|
|
40
|
+
# => Number of filter bits (m): 3600
|
|
41
|
+
# => Number of set bits (n): 20
|
|
42
|
+
# => Number of filter hashes (k) : 10
|
|
43
|
+
# => Predicted false positive rate = 0.00%
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
If you'd like more control over the traditional inputs like bit size and the number of hashes:
|
|
47
|
+
|
|
48
|
+
```ruby
|
|
49
|
+
require "bloom_fit"
|
|
50
|
+
|
|
51
|
+
bf = BloomFit.new(size: 100, hashes: 2)
|
|
52
|
+
bf.add("cat")
|
|
53
|
+
bf.include?("cat") # => true
|
|
54
|
+
bf.include?("dog") # => false
|
|
55
|
+
|
|
56
|
+
# Hash syntax with a bloom filter!
|
|
57
|
+
bf["bird"] = "bar"
|
|
58
|
+
bf["bird"] # => true
|
|
59
|
+
bf["mouse"] # => false
|
|
60
|
+
|
|
61
|
+
bf.stats
|
|
62
|
+
# => Number of filter bits (m): 100
|
|
63
|
+
# => Number of set bits (n): 4
|
|
64
|
+
# => Number of filter hashes (k) : 2
|
|
65
|
+
# => Predicted false positive rate = 10.87%
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Credits
|
|
69
|
+
|
|
70
|
+
- Tatsuya Mori <valdzone@gmail.com> (Original C implementation)
|
|
71
|
+
- Ilya Grigorik [@igrigorik](https://github.com/igrigorik) ([bloomfilter-rb] gem)
|
|
72
|
+
- Bharanee Rathna [@deepfryed](https://github.com/deepfryed) ([bloom-filter](https://github.com/deepfryed/bloom-filter) gem)
|
|
73
|
+
|
|
74
|
+
## License
|
|
75
|
+
|
|
76
|
+
[MIT License](https://rmm5t.mit-license.org/)
|
|
77
|
+
|
|
78
|
+
[bloomfilter-rb]: https://github.com/igrigorik/bloomfilter-rb
|
data/Rakefile
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
require 'bundler/gem_tasks'
|
|
2
|
+
require 'rake'
|
|
3
|
+
require 'rspec'
|
|
4
|
+
require 'rspec/core/rake_task'
|
|
5
|
+
require 'rake/extensiontask'
|
|
6
|
+
|
|
7
|
+
# NOTE: `bundler/gem_tasks` (required at the top of this file) already calls
# Bundler::GemHelper.install_tasks. Calling it a second time registers every
# gem task action twice, so it is intentionally not repeated here.

# Builds the native extension found in ext/cbloomfilter.
Rake::ExtensionTask.new('cbloomfilter')

RSpec::Core::RakeTask.new(:spec)

# Always rebuild the extension from a clean tree before the specs run.
task spec: %i[clean compile]

desc "Default: run unit tests."
task default: :spec
|
|
@@ -0,0 +1,397 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* cbloomfilter.c - simple Bloom Filter
|
|
3
|
+
* (c) Tatsuya Mori <valdzone@gmail.com>
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
#include "ruby.h"
|
|
7
|
+
#include "crc32.h"
|
|
8
|
+
|
|
9
|
+
#if !defined(RSTRING_LEN)
|
|
10
|
+
# define RSTRING_LEN(x) (RSTRING(x)->len)
|
|
11
|
+
# define RSTRING_PTR(x) (RSTRING(x)->ptr)
|
|
12
|
+
#endif
|
|
13
|
+
|
|
14
|
+
/* Reuse the standard CRC table for consistent seeds */
|
|
15
|
+
static unsigned int *seeds = crc_table;
|
|
16
|
+
|
|
17
|
+
static VALUE cBloomFilter;
|
|
18
|
+
|
|
19
|
+
/* State backing a single CBloomFilter instance. */
struct BloomFilter {
    int m;              /* number of buckets in the filter */
    int b;              /* width of one bucket, in bits */
    int k;              /* number of hash functions applied per key */
    int r;              /* non-zero: raise when a full bucket is set again */
    unsigned char *ptr; /* packed bucket storage */
    int bytes;          /* length of ptr, in bytes */
};
|
|
27
|
+
|
|
28
|
+
/*
 * Hash `len` bytes starting at `str` with a DJB2-style shift/xor mix.
 *
 * The accumulator starts at 5381 and, for every input byte, becomes
 * ((hash << 5) ^ hash) ^ byte. A `len` of zero or less hashes nothing and
 * returns the initial value 5381.
 *
 * The buffer is only read, so it is accepted as `const void *`; callers may
 * pass `char *` or `unsigned char *` without a cast or a sign-mismatch
 * warning. Bytes are always interpreted as unsigned.
 */
unsigned long djb2(const void *str, int len) {
    const unsigned char *cursor = (const unsigned char *) str;
    unsigned long hash = 5381;

    for (; len > 0; --len, ++cursor) {
        hash = ((hash << 5) ^ hash) ^ (*cursor);
    }

    return hash;
}
|
|
39
|
+
|
|
40
|
+
void bits_free(struct BloomFilter *bf) {
|
|
41
|
+
ruby_xfree(bf->ptr);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
void bucket_unset(struct BloomFilter *bf, int index) {
|
|
45
|
+
int byte_offset = (index * bf->b) / 8;
|
|
46
|
+
int bit_offset = (index * bf->b) % 8;
|
|
47
|
+
unsigned int c = bf->ptr[byte_offset];
|
|
48
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
|
49
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
|
50
|
+
if ((c & mask) == 0) {
|
|
51
|
+
// do nothing
|
|
52
|
+
} else {
|
|
53
|
+
// reduce the counter: 11 00 => 10 00 (suppose bf->b is 2)
|
|
54
|
+
c -= (1 << bit_offset) & ((1 << 8) -1);
|
|
55
|
+
// shift the bitmap right by 1 bit: 10 00 => 01 00
|
|
56
|
+
c = (~mask & c) | ((c & mask) >> (bit_offset + 1) << bit_offset);
|
|
57
|
+
|
|
58
|
+
bf->ptr[byte_offset] = c & ((1 << 8) - 1);
|
|
59
|
+
bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
void bucket_set(struct BloomFilter *bf, int index) {
|
|
64
|
+
int byte_offset = (index * bf->b) / 8;
|
|
65
|
+
int bit_offset = (index * bf->b) % 8;
|
|
66
|
+
unsigned int c = bf->ptr[byte_offset];
|
|
67
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
|
68
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
|
69
|
+
if ((c & mask) == mask) {
|
|
70
|
+
if (bf->r == 1) rb_raise(rb_eRuntimeError, "bucket got filled up");
|
|
71
|
+
} else {
|
|
72
|
+
c = c + ((1 << bit_offset) & ((1 << 8) -1)) | c;
|
|
73
|
+
bf->ptr[byte_offset] = c & ((1 << 8) - 1);
|
|
74
|
+
bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
int bucket_check(struct BloomFilter *bf, int index) {
|
|
79
|
+
int byte_offset = (index * bf->b) / 8;
|
|
80
|
+
int bit_offset = (index * bf->b) % 8;
|
|
81
|
+
unsigned int c = bf->ptr[byte_offset];
|
|
82
|
+
c += bf->ptr[byte_offset + 1] << 8;
|
|
83
|
+
|
|
84
|
+
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
|
85
|
+
return (c & mask) >> bit_offset;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/*
 * Return the contents of bucket `index` (zero when the bucket is empty).
 * This performs exactly the same read as bucket_check, so it delegates to
 * it instead of repeating the bit arithmetic.
 */
int bucket_get(struct BloomFilter *bf, int index) {
    return bucket_check(bf, index);
}
|
|
97
|
+
|
|
98
|
+
/*
 * CBloomFilter.new(m = 100000000, k = 4, b = 1, raise = false)
 *
 *   m     - number of buckets
 *   k     - number of hash functions
 *   b     - bits per bucket (1..8)
 *   raise - only the literal `true` enables raising on bucket overflow
 *
 * Raises ArgumentError when b is outside 1..8, when m is below 1 or so large
 * that the storage size would overflow an int, or when k is below 1 or larger
 * than the number of seeds available in the CRC table. Non-numeric arguments
 * raise TypeError via NUM2INT instead of being reinterpreted blindly.
 *
 * As before, a call with more than four arguments matches no case below and
 * silently uses the defaults.
 */
static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
    struct BloomFilter *bf;
    VALUE obj;
    int m = 100000000; /* default = Fugou approach :-) */
    int k = 4;
    int b = 1;
    int r = 0;
    /* insert/delete/include? index seeds[0..k-1], so k must fit the table */
    const int max_k = (int) (sizeof(crc_table) / sizeof(crc_table[0]));

    switch (argc) {
    case 4:
        if (argv[3] == Qtrue) {
            r = 1;
        }
        /* fall through */
    case 3:
        b = NUM2INT(argv[2]);
        /* fall through */
    case 2:
        k = NUM2INT(argv[1]);
        /* fall through */
    case 1:
        m = NUM2INT(argv[0]);
        break;
    }

    if (b < 1 || b > 8)
        rb_raise(rb_eArgError, "bucket size");
    /* (m * b) + 15 below must not overflow an int */
    if (m < 1 || m > (INT_MAX - 15) / b)
        rb_raise(rb_eArgError, "array size");
    if (k < 1 || k > max_k)
        rb_raise(rb_eArgError, "hash length");

    /* Allocate only after validation so a rejected call creates no object. */
    obj = Data_Make_Struct(self, struct BloomFilter, NULL, bits_free, bf);

    bf->b = b;
    bf->m = m;
    bf->k = k;
    bf->r = r;

    /*
     * Round up to whole bytes plus one spare byte: the bucket helpers always
     * read and write the byte after the one a bucket starts in.
     */
    bf->bytes = ((m * b) + 15) / 8;
    bf->ptr = ALLOC_N(unsigned char, bf->bytes);

    /* initialize the bits with zeros */
    memset(bf->ptr, 0, bf->bytes);
    rb_iv_set(obj, "@hash_value", rb_hash_new());

    return obj;
}
|
|
151
|
+
|
|
152
|
+
/* CBloomFilter#clear - zero the whole bucket storage; always returns true. */
static VALUE bf_clear(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);
    memset(filter->ptr, 0, filter->bytes);

    return Qtrue;
}
|
|
158
|
+
|
|
159
|
+
/* CBloomFilter#m - number of buckets in the filter. */
static VALUE bf_m(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);

    return INT2FIX(filter->m);
}
|
|
164
|
+
|
|
165
|
+
/* CBloomFilter#k - number of hash functions applied per key. */
static VALUE bf_k(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);

    return INT2FIX(filter->k);
}
|
|
170
|
+
|
|
171
|
+
/* CBloomFilter#b - number of bits per bucket. */
static VALUE bf_b(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);

    return INT2FIX(filter->b);
}
|
|
176
|
+
|
|
177
|
+
/* CBloomFilter#r - true when the filter raises on bucket overflow. */
static VALUE bf_r(VALUE self) {
    struct BloomFilter *filter;

    Data_Get_Struct(self, struct BloomFilter, filter);

    return filter->r ? Qtrue : Qfalse;
}
|
|
182
|
+
|
|
183
|
+
/* CBloomFilter#set_bits - count the 1 bits across the whole storage. */
static VALUE bf_set_bits(VALUE self) {
    struct BloomFilter *filter;
    int total = 0;
    int pos;

    Data_Get_Struct(self, struct BloomFilter, filter);

    for (pos = 0; pos < filter->bytes; pos++) {
        unsigned char byte = filter->ptr[pos];

        /* peel bits off the low end until none are left */
        while (byte) {
            total += byte & 1;
            byte >>= 1;
        }
    }

    return INT2FIX(total);
}
|
|
194
|
+
|
|
195
|
+
/*
 * CBloomFilter#insert(key) - add `key` to the filter; returns nil.
 *
 * The key is converted with rb_obj_as_string and hashed once with djb2. Each
 * of the k bucket indexes is that hash XOR-ed with the matching entry of the
 * seed table, modulo m.
 */
static VALUE bf_insert(VALUE self, VALUE key) {
    struct BloomFilter *bf;
    VALUE text;
    char *bytes;
    int length, round;
    unsigned long digest, slot;

    Data_Get_Struct(self, struct BloomFilter, bf);

    text = rb_obj_as_string(key);
    bytes = StringValuePtr(text);
    length = (int) (RSTRING_LEN(text)); /* length of the string in bytes */

    digest = djb2((unsigned char *) bytes, length);

    for (round = 0; round < bf->k; round++) {
        slot = (unsigned long) (digest ^ seeds[round]) % (unsigned int) (bf->m);

        /* set a bit at the index */
        bucket_set(bf, (int) slot);
    }

    return Qnil;
}
|
|
220
|
+
|
|
221
|
+
/*
 * CBloomFilter#merge!(other) - OR every byte of `other` into this filter.
 * Returns nil.
 *
 * Raises TypeError unless `other` is a CBloomFilter, and ArgumentError when
 * the two filters differ in bucket count or bucket width. Without these
 * checks a smaller `other` would be read past the end of its storage.
 */
static VALUE bf_merge(VALUE self, VALUE other) {
    struct BloomFilter *bf, *target;
    int i;

    if (!RTEST(rb_obj_is_kind_of(other, cBloomFilter)))
        rb_raise(rb_eTypeError, "expected a CBloomFilter");

    Data_Get_Struct(self, struct BloomFilter, bf);
    Data_Get_Struct(other, struct BloomFilter, target);

    if (bf->m != target->m || bf->b != target->b)
        rb_raise(rb_eArgError, "filters differ in size");

    for (i = 0; i < bf->bytes; i++) {
        bf->ptr[i] |= target->ptr[i];
    }

    return Qnil;
}
|
|
231
|
+
|
|
232
|
+
/*
 * CBloomFilter#&(other) - return a new filter of the receiver's class whose
 * storage is the byte-wise AND of both operands.
 *
 * Raises TypeError unless `other` is a CBloomFilter, and ArgumentError when
 * the filters differ in bucket count or bucket width (which would otherwise
 * read past the end of the smaller storage).
 *
 * The raise flag is forwarded as a real boolean: the constructor only
 * honours the literal `true`, so passing it as a Fixnum dropped the flag.
 */
static VALUE bf_and(VALUE self, VALUE other) {
    struct BloomFilter *bf, *bf_other, *target;
    VALUE obj, args[4];
    int i;

    if (!RTEST(rb_obj_is_kind_of(other, cBloomFilter)))
        rb_raise(rb_eTypeError, "expected a CBloomFilter");

    Data_Get_Struct(self, struct BloomFilter, bf);
    Data_Get_Struct(other, struct BloomFilter, bf_other);

    if (bf->m != bf_other->m || bf->b != bf_other->b)
        rb_raise(rb_eArgError, "filters differ in size");

    args[0] = INT2FIX(bf->m);
    args[1] = INT2FIX(bf->k);
    args[2] = INT2FIX(bf->b);
    args[3] = bf->r ? Qtrue : Qfalse;

    obj = bf_s_new(4, args, rb_obj_class(self));
    Data_Get_Struct(obj, struct BloomFilter, target);

    for (i = 0; i < bf->bytes; i++) {
        target->ptr[i] = bf->ptr[i] & bf_other->ptr[i];
    }

    return obj;
}
|
|
252
|
+
|
|
253
|
+
/*
 * CBloomFilter#|(other) - return a new filter of the receiver's class whose
 * storage is the byte-wise OR of both operands.
 *
 * Raises TypeError unless `other` is a CBloomFilter, and ArgumentError when
 * the filters differ in bucket count or bucket width (which would otherwise
 * read past the end of the smaller storage).
 *
 * The raise flag is forwarded as a real boolean: the constructor only
 * honours the literal `true`, so passing it as a Fixnum dropped the flag.
 */
static VALUE bf_or(VALUE self, VALUE other) {
    struct BloomFilter *bf, *bf_other, *target;
    VALUE obj, args[4];
    int i;

    if (!RTEST(rb_obj_is_kind_of(other, cBloomFilter)))
        rb_raise(rb_eTypeError, "expected a CBloomFilter");

    Data_Get_Struct(self, struct BloomFilter, bf);
    Data_Get_Struct(other, struct BloomFilter, bf_other);

    if (bf->m != bf_other->m || bf->b != bf_other->b)
        rb_raise(rb_eArgError, "filters differ in size");

    args[0] = INT2FIX(bf->m);
    args[1] = INT2FIX(bf->k);
    args[2] = INT2FIX(bf->b);
    args[3] = bf->r ? Qtrue : Qfalse;

    obj = bf_s_new(4, args, rb_obj_class(self));
    Data_Get_Struct(obj, struct BloomFilter, target);

    for (i = 0; i < bf->bytes; i++) {
        target->ptr[i] = bf->ptr[i] | bf_other->ptr[i];
    }

    return obj;
}
|
|
273
|
+
|
|
274
|
+
/*
 * CBloomFilter#delete(key) - step down every bucket `key` maps to; returns
 * nil.
 *
 * The bucket indexes are derived exactly as in insert: one djb2 hash of the
 * key's string form, XOR-ed with each of the first k seeds, modulo m.
 */
static VALUE bf_delete(VALUE self, VALUE key) {
    struct BloomFilter *bf;
    VALUE text;
    char *bytes;
    int length, round;
    unsigned long digest, slot;

    Data_Get_Struct(self, struct BloomFilter, bf);

    text = rb_obj_as_string(key);
    bytes = StringValuePtr(text);
    length = (int) (RSTRING_LEN(text)); /* length of the string in bytes */

    digest = djb2((unsigned char *) bytes, length);

    for (round = 0; round < bf->k; round++) {
        slot = (unsigned long) (digest ^ seeds[round]) % (unsigned int) (bf->m);

        /* clear one step of the bucket at the index */
        bucket_unset(bf, (int) slot);
    }

    return Qnil;
}
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
/*
 * CBloomFilter#include?(key, ...) - true only when every given key may be in
 * the filter; false as soon as one key maps to an empty bucket.
 *
 * Previously the function returned right after the first key, so additional
 * keys were never examined, and a call without arguments fell off the end of
 * the function without returning a value. At least one key is now required
 * (ArgumentError otherwise) and all keys are tested.
 *
 * NOTE(review): "all keys must match" is inferred from the variadic
 * signature and the spec's include?("test", "test1") expectation - confirm
 * this is the intended meaning for multiple keys.
 */
static VALUE bf_include(int argc, VALUE *argv, VALUE self) {
    struct BloomFilter *bf;
    int arg, i;

    rb_check_arity(argc, 1, UNLIMITED_ARGUMENTS);

    Data_Get_Struct(self, struct BloomFilter, bf);

    for (arg = 0; arg < argc; arg++) {
        VALUE skey = rb_obj_as_string(argv[arg]);
        char *ckey = StringValuePtr(skey);
        int len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
        unsigned long hash = djb2((unsigned char *) ckey, len);

        for (i = 0; i < bf->k; i++) {
            unsigned long index = (unsigned long) (hash ^ seeds[i]) % (unsigned int) (bf->m);

            /* an empty bucket proves this key was never inserted */
            if (!bucket_check(bf, (int) index)) {
                return Qfalse;
            }
        }
    }

    return Qtrue;
}
|
|
334
|
+
|
|
335
|
+
/*
 * CBloomFilter#to_s - one character per bucket: '1' when the bucket holds
 * any set bit, '0' otherwise. The result is m characters long.
 */
static VALUE bf_to_s(VALUE self) {
    struct BloomFilter *bf;
    VALUE text;
    unsigned char *out;
    int bucket;

    Data_Get_Struct(self, struct BloomFilter, bf);

    text = rb_str_new(0, bf->m);
    out = (unsigned char *) RSTRING_PTR(text);

    for (bucket = 0; bucket < bf->m; bucket++) {
        out[bucket] = bucket_get(bf, bucket) ? '1' : '0';
    }

    return text;
}
|
|
350
|
+
|
|
351
|
+
/*
 * CBloomFilter#bitmap - return a new String holding a copy of the raw
 * bucket storage (bf->bytes bytes).
 */
static VALUE bf_bitmap(VALUE self) {
    struct BloomFilter *bf;

    Data_Get_Struct(self, struct BloomFilter, bf);

    /* rb_str_new copies the given bytes into the new string */
    return rb_str_new((const char *) bf->ptr, bf->bytes);
}
|
|
362
|
+
|
|
363
|
+
/*
 * CBloomFilter#load(bitmap) - overwrite the bucket storage with the first
 * bf->bytes bytes of `bitmap`. Returns nil.
 *
 * `bitmap` is coerced with StringValue (TypeError for non-strings). A bitmap
 * shorter than the filter's storage raises ArgumentError; previously the
 * copy simply read past the end of the string.
 */
static VALUE bf_load(VALUE self, VALUE bitmap) {
    struct BloomFilter *bf;

    Data_Get_Struct(self, struct BloomFilter, bf);

    StringValue(bitmap);
    if (RSTRING_LEN(bitmap) < (long) bf->bytes) {
        rb_raise(rb_eArgError, "bitmap too short (%ld bytes, filter needs %d)",
                 (long) RSTRING_LEN(bitmap), bf->bytes);
    }

    memcpy(bf->ptr, RSTRING_PTR(bitmap), bf->bytes);

    return Qnil;
}
|
|
372
|
+
|
|
373
|
+
void Init_cbloomfilter(void) {
|
|
374
|
+
cBloomFilter = rb_define_class("CBloomFilter", rb_cObject);
|
|
375
|
+
rb_define_singleton_method(cBloomFilter, "new", bf_s_new, -1);
|
|
376
|
+
rb_define_method(cBloomFilter, "m", bf_m, 0);
|
|
377
|
+
rb_define_method(cBloomFilter, "k", bf_k, 0);
|
|
378
|
+
rb_define_method(cBloomFilter, "b", bf_b, 0);
|
|
379
|
+
rb_define_method(cBloomFilter, "r", bf_r, 0);
|
|
380
|
+
rb_define_method(cBloomFilter, "set_bits", bf_set_bits, 0);
|
|
381
|
+
/* rb_define_method(cBloomFilter, "s", bf_s, 0); */
|
|
382
|
+
rb_define_method(cBloomFilter, "insert", bf_insert, 1);
|
|
383
|
+
rb_define_method(cBloomFilter, "delete", bf_delete, 1);
|
|
384
|
+
rb_define_method(cBloomFilter, "include?", bf_include, -1);
|
|
385
|
+
rb_define_method(cBloomFilter, "clear", bf_clear, 0);
|
|
386
|
+
rb_define_method(cBloomFilter, "merge!", bf_merge, 1);
|
|
387
|
+
rb_define_method(cBloomFilter, "&", bf_and, 1);
|
|
388
|
+
rb_define_method(cBloomFilter, "|", bf_or, 1);
|
|
389
|
+
|
|
390
|
+
rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
|
|
391
|
+
rb_define_method(cBloomFilter, "bitmap", bf_bitmap, 0);
|
|
392
|
+
rb_define_method(cBloomFilter, "load", bf_load, 1);
|
|
393
|
+
|
|
394
|
+
/* functions that have not been implemented, yet */
|
|
395
|
+
|
|
396
|
+
// rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
|
|
397
|
+
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/* simple CRC32 code */
|
|
2
|
+
/*
|
|
3
|
+
* Copyright 2005 Aris Adamantiadis
|
|
4
|
+
*
|
|
5
|
+
* This file is part of the SSH Library
|
|
6
|
+
*
|
|
7
|
+
* The SSH Library is free software; you can redistribute it and/or modify
|
|
8
|
+
* it under the terms of the GNU Lesser General Public License as published by
|
|
9
|
+
* the Free Software Foundation; either version 2.1 of the License, or (at your
|
|
10
|
+
* option) any later version.
|
|
11
|
+
*
|
|
12
|
+
*
|
|
13
|
+
* The SSH Library is distributed in the hope that it will be useful, but
|
|
14
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
15
|
+
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
16
|
+
* License for more details.
|
|
17
|
+
*
|
|
18
|
+
* You should have received a copy of the GNU Lesser General Public License
|
|
19
|
+
* along with the SSH Library; see the file COPYING. If not, write to
|
|
20
|
+
* the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
|
|
21
|
+
* MA 02111-1307, USA. */
|
|
22
|
+
|
|
23
|
+
static unsigned int crc_table[] = {
|
|
24
|
+
0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
|
|
25
|
+
0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
|
|
26
|
+
0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
|
|
27
|
+
0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
|
|
28
|
+
0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
|
|
29
|
+
0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
|
|
30
|
+
0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
|
|
31
|
+
0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
|
|
32
|
+
0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
|
|
33
|
+
0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
|
|
34
|
+
0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
|
|
35
|
+
0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
|
|
36
|
+
0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
|
|
37
|
+
0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
|
|
38
|
+
0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
|
|
39
|
+
0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
|
|
40
|
+
0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
|
|
41
|
+
0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
|
|
42
|
+
0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
|
|
43
|
+
0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
|
|
44
|
+
0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
|
|
45
|
+
0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
|
|
46
|
+
0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
|
|
47
|
+
0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
|
|
48
|
+
0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
|
|
49
|
+
0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
|
|
50
|
+
0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
|
|
51
|
+
0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
|
|
52
|
+
0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
|
|
53
|
+
0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
|
|
54
|
+
0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
|
|
55
|
+
0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
|
|
56
|
+
0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
|
|
57
|
+
0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
|
|
58
|
+
0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
|
|
59
|
+
0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
|
|
60
|
+
0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
|
|
61
|
+
0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
|
|
62
|
+
0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
|
|
63
|
+
0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
|
|
64
|
+
0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
|
|
65
|
+
0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
|
|
66
|
+
0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
|
|
67
|
+
0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
|
|
68
|
+
0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
|
|
69
|
+
0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
|
|
70
|
+
0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
|
|
71
|
+
0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
|
|
72
|
+
0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
|
|
73
|
+
0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
|
|
74
|
+
0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
|
|
75
|
+
0x2d02ef8dUL
|
|
76
|
+
};
|
data/lib/bloom_fit.rb
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
require 'cbloomfilter'
|
|
2
|
+
require 'bloom_fit/version'
|
|
3
|
+
|
|
4
|
+
# Ruby front end for the native +CBloomFilter+ extension.
#
#   bf = BloomFit.new(size: 100, hashes: 2)
#   bf.add("cat")
#   bf.include?("cat") # => true
class BloomFit
  # Raised when two filters with different parameters are combined.
  ConfigurationMismatch = Class.new(ArgumentError)

  # Options used for every key the caller does not supply.
  DEFAULT_OPTIONS = {
    size: 100,    # m: number of buckets in the filter
    hashes: 4,    # k: number of hash functions
    bucket: 1,    # b: number of bits per bucket
    raise: false  # r: whether to raise on bucket overflow
  }.freeze

  # The underlying native filter.
  attr_reader :bf

  # @param opts [Hash] any of :size, :hashes, :bucket, :raise
  def initialize(opts = {})
    @opts = DEFAULT_OPTIONS.merge(opts)
    @bf = CBloomFilter.new(@opts[:size], @opts[:hashes], @opts[:bucket], @opts[:raise])
  end

  # Adds +key+ (converted to a string by the native filter) to the filter.
  def insert(key)
    @bf.insert(key)
  end
  alias add insert

  # Hash-style insertion: <tt>bf[key] = anything</tt>. The assigned value is
  # ignored. A plain alias of #insert cannot serve here because Ruby passes
  # both the key and the value to <tt>[]=</tt>.
  def []=(key, _value = nil)
    insert(key)
  end

  # Membership test, delegated to the native filter.
  def include?(*keys)
    @bf.include?(*keys)
  end
  alias key? include?
  alias [] include?

  def delete(key)
    @bf.delete(key)
  end

  def clear
    @bf.clear
  end

  # Same value as #set_bits.
  def size
    @bf.set_bits
  end

  # ORs +other+ into this filter in place.
  #
  # @raise [BloomFit::ConfigurationMismatch] if the parameters differ; the
  #   native merge walks both bitmaps using this filter's length only.
  def merge!(other)
    raise ConfigurationMismatch unless same_parameters?(other)

    @bf.merge!(other.bf)
  end

  # Returns the number of bits that are set to 1 in the filter.
  def set_bits
    @bf.set_bits
  end

  # Computes the intersection of two Bloom filters.
  # It assumes that both filters have the same size -
  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
  def &(other)
    raise ConfigurationMismatch unless same_parameters?(other)

    wrap_native(@bf & other.bf)
  end

  # Computes the union of two Bloom filters.
  # It assumes that both filters have the same size -
  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
  def |(other)
    raise ConfigurationMismatch unless same_parameters?(other)

    wrap_native(@bf | other.bf)
  end

  # Raw bitmap of the native filter, as a String.
  def bitmap
    @bf.bitmap
  end

  # Rebuilds the filter from the pair produced by #marshal_dump.
  def marshal_load(ary)
    opts, bitmap = *ary

    initialize(opts)
    @bf.load(bitmap) unless bitmap.nil?
  end

  def marshal_dump
    [@opts, @bf.bitmap]
  end

  # Reads a filter previously written by #save.
  #
  # SECURITY: this uses Marshal.load, which can instantiate arbitrary
  # objects. Only load files you created yourself.
  def self.load(filename)
    # Block form closes the handle; binary mode keeps the dump byte-exact.
    File.open(filename, 'rb') { |f| Marshal.load(f) }
  end

  # Writes the marshalled filter to +filename+.
  def save(filename)
    # Marshal output is binary, so avoid text-mode newline translation.
    File.open(filename, 'wb') do |f|
      Marshal.dump(self, f)
    end
  end

  # Prints the filter parameters and a predicted false positive rate.
  def stats
    fp = ((1.0 - Math.exp(-(@opts[:hashes] * size).to_f / @opts[:size])) ** @opts[:hashes]) * 100
    printf "Number of filter buckets (m): %d\n", @opts[:size]
    printf "Number of bits per buckets (b): %d\n", @opts[:bucket]
    printf "Number of set bits (n): %d\n", set_bits
    printf "Number of filter hashes (k) : %d\n", @opts[:hashes]
    printf "Predicted false positive rate = %.2f%%\n", fp
  end

  protected

  # Returns true if parameters of the +o+ther filter are
  # the same.
  def same_parameters?(o)
    @bf.m == o.bf.m && @bf.k == o.bf.k && @bf.b == o.bf.b
  end

  private

  # Wraps an already-built native filter in a new instance that carries this
  # filter's options, so #stats and #marshal_dump describe the real filter
  # instead of the defaults.
  def wrap_native(native)
    self.class.allocate.tap do |copy|
      copy.instance_variable_set(:@opts, @opts.dup)
      copy.instance_variable_set(:@bf, native)
    end
  end
end
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
require "helper"
|
|
2
|
+
|
|
3
|
+
describe BloomFit do
  # Clearing must forget previously inserted keys.
  it "should clear" do
    filter = BloomFit.new(size: 100, hashes: 2, bucket: 3, raise: false)
    filter.insert("test")
    expect(filter.include?("test")).to be true
    filter.clear
    expect(filter.include?("test")).to be false
  end

  # merge! changes only the receiver.
  it "should merge" do
    target = BloomFit.new(size: 100, hashes: 2, bucket: 3, raise: false)
    source = BloomFit.new(size: 100, hashes: 2, bucket: 3, raise: false)
    source.insert("test")
    expect(target.include?("test")).to be false
    target.merge!(source)
    expect(target.include?("test")).to be true
    expect(source.include?("test")).to be true
  end

  context "behave like a bloom filter" do
    it "should test set membership" do
      filter = BloomFit.new(size: 100, hashes: 2, bucket: 3, raise: false)
      filter.insert("test")
      filter.insert("test1")

      expect(filter.include?("test")).to be true
      expect(filter.include?("abcd")).to be false
      expect(filter.include?("test", "test1")).to be true
    end

    it "should work with any object's to_s" do
      subject.insert(:test)
      subject.insert(:test1)
      subject.insert(12345)

      expect(subject.include?("test")).to be true
      expect(subject.include?("abcd")).to be false
      expect(subject.include?("test", "test1", '12345')).to be true
    end

    it "should return the number of bits set to 1" do
      four_hashes = BloomFit.new(hashes: 4)
      four_hashes.insert("test")
      expect(four_hashes.set_bits).to eq(4)
      four_hashes.delete("test")
      expect(four_hashes.set_bits).to eq(0)

      one_hash = BloomFit.new(hashes: 1)
      one_hash.insert("test")
      expect(one_hash.set_bits).to eq(1)
    end

    it "should return intersection with other filter" do
      left = BloomFit.new
      left.insert("test")
      left.insert("test1")

      right = BloomFit.new
      right.insert("test")
      right.insert("test2")

      common = left & right
      expect(common.include?("test")).to be true
      expect(common.include?("test1")).to be false
      expect(common.include?("test2")).to be false
    end

    it "should raise an exception when intersection is to be computed for incompatible filters" do
      small = BloomFit.new(size: 10)
      small.insert("test")

      large = BloomFit.new(size: 20)
      large.insert("test")

      expect { small & large }.to raise_error(BloomFit::ConfigurationMismatch)
    end

    it "should return union with other filter" do
      left = BloomFit.new
      left.insert("test")
      left.insert("test1")

      right = BloomFit.new
      right.insert("test")
      right.insert("test2")

      combined = left | right
      expect(combined.include?("test")).to be true
      expect(combined.include?("test1")).to be true
      expect(combined.include?("test2")).to be true
    end

    it "should raise an exception when union is to be computed for incompatible filters" do
      small = BloomFit.new(size: 10)
      small.insert("test")

      large = BloomFit.new(size: 20)
      large.insert("test")

      expect { small | large }.to raise_error(BloomFit::ConfigurationMismatch)
    end

    it "should output current stats" do
      subject.insert('test')
      expect { subject.stats }.not_to raise_error
    end
  end

  context "behave like counting bloom filter" do
    it "should delete / decrement keys" do
      subject.insert("test")
      expect(subject.include?("test")).to be true

      subject.delete("test")
      expect(subject.include?("test")).to be false
    end
  end

  context "serialize" do
    # Every example in this group writes bf.out; remove it afterwards.
    after { File.unlink('bf.out') }

    it "should marshall" do
      filter = BloomFit.new
      expect { filter.save('bf.out') }.not_to raise_error
    end

    it "should load from marshalled" do
      subject.insert('foo')
      subject.insert('bar')
      subject.save('bf.out')

      restored = BloomFit.load('bf.out')
      expect(restored.include?('foo')).to be true
      expect(restored.include?('bar')).to be true
      expect(restored.include?('baz')).to be false

      expect(subject.send(:same_parameters?, restored)).to be true
    end

    it "should serialize to a file size proporational its bucket size" do
      last_size = 0
      (1..8).each do |bucket_bits|
        filter = BloomFit.new(size: 10_000, bucket: bucket_bits)
        filter.save('bf.out')
        previous = last_size
        last_size = File.size('bf.out')
        # each extra bit per bucket must grow the dump
        expect(previous).to be < last_size
      end
    end
  end
end
|
data/spec/helper.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: bloom_fit
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Ilya Grigorik
|
|
8
|
+
- Tatsuya Mori
|
|
9
|
+
- Ryan McGeary
|
|
10
|
+
- Beshad Talayeminaei
|
|
11
|
+
bindir: bin
|
|
12
|
+
cert_chain: []
|
|
13
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
14
|
+
dependencies:
|
|
15
|
+
- !ruby/object:Gem::Dependency
|
|
16
|
+
name: irb
|
|
17
|
+
requirement: !ruby/object:Gem::Requirement
|
|
18
|
+
requirements:
|
|
19
|
+
- - ">="
|
|
20
|
+
- !ruby/object:Gem::Version
|
|
21
|
+
version: '0'
|
|
22
|
+
type: :development
|
|
23
|
+
prerelease: false
|
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
25
|
+
requirements:
|
|
26
|
+
- - ">="
|
|
27
|
+
- !ruby/object:Gem::Version
|
|
28
|
+
version: '0'
|
|
29
|
+
- !ruby/object:Gem::Dependency
|
|
30
|
+
name: rake
|
|
31
|
+
requirement: !ruby/object:Gem::Requirement
|
|
32
|
+
requirements:
|
|
33
|
+
- - ">="
|
|
34
|
+
- !ruby/object:Gem::Version
|
|
35
|
+
version: '0'
|
|
36
|
+
type: :development
|
|
37
|
+
prerelease: false
|
|
38
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
39
|
+
requirements:
|
|
40
|
+
- - ">="
|
|
41
|
+
- !ruby/object:Gem::Version
|
|
42
|
+
version: '0'
|
|
43
|
+
- !ruby/object:Gem::Dependency
|
|
44
|
+
name: rake-compiler
|
|
45
|
+
requirement: !ruby/object:Gem::Requirement
|
|
46
|
+
requirements:
|
|
47
|
+
- - ">="
|
|
48
|
+
- !ruby/object:Gem::Version
|
|
49
|
+
version: '0'
|
|
50
|
+
type: :development
|
|
51
|
+
prerelease: false
|
|
52
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
53
|
+
requirements:
|
|
54
|
+
- - ">="
|
|
55
|
+
- !ruby/object:Gem::Version
|
|
56
|
+
version: '0'
|
|
57
|
+
- !ruby/object:Gem::Dependency
|
|
58
|
+
name: rspec
|
|
59
|
+
requirement: !ruby/object:Gem::Requirement
|
|
60
|
+
requirements:
|
|
61
|
+
- - ">="
|
|
62
|
+
- !ruby/object:Gem::Version
|
|
63
|
+
version: '3'
|
|
64
|
+
type: :development
|
|
65
|
+
prerelease: false
|
|
66
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
67
|
+
requirements:
|
|
68
|
+
- - ">="
|
|
69
|
+
- !ruby/object:Gem::Version
|
|
70
|
+
version: '3'
|
|
71
|
+
email:
|
|
72
|
+
- ilya@grigorik.com
|
|
73
|
+
- valdzone@gmail.com
|
|
74
|
+
- ryan@mcgeary.org
|
|
75
|
+
- 'btalayeminaei@gmail.com '
|
|
76
|
+
executables: []
|
|
77
|
+
extensions:
|
|
78
|
+
- ext/cbloomfilter/extconf.rb
|
|
79
|
+
extra_rdoc_files: []
|
|
80
|
+
files:
|
|
81
|
+
- README.md
|
|
82
|
+
- Rakefile
|
|
83
|
+
- ext/cbloomfilter/cbloomfilter.c
|
|
84
|
+
- ext/cbloomfilter/crc32.h
|
|
85
|
+
- ext/cbloomfilter/extconf.rb
|
|
86
|
+
- lib/bloom_fit.rb
|
|
87
|
+
- lib/bloom_fit/version.rb
|
|
88
|
+
- spec/bloom_fit_spec.rb
|
|
89
|
+
- spec/helper.rb
|
|
90
|
+
homepage: https://github.com/rmm5t/bloom_fit
|
|
91
|
+
licenses: []
|
|
92
|
+
metadata:
|
|
93
|
+
homepage_uri: https://github.com/rmm5t/bloom_fit
|
|
94
|
+
bug_tracker_uri: https://github.com/rmm5t/bloom_fit/issues
|
|
95
|
+
changelog_uri: https://github.com/rmm5t/bloom_fit/blob/main/CHANGELOG.md
|
|
96
|
+
source_code_uri: https://github.com/rmm5t/bloom_fit
|
|
97
|
+
funding_uri: https://github.com/sponsors/rmm5t
|
|
98
|
+
rubygems_mfa_required: 'true'
|
|
99
|
+
rdoc_options: []
|
|
100
|
+
require_paths:
|
|
101
|
+
- lib
|
|
102
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
103
|
+
requirements:
|
|
104
|
+
- - ">="
|
|
105
|
+
- !ruby/object:Gem::Version
|
|
106
|
+
version: '0'
|
|
107
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
108
|
+
requirements:
|
|
109
|
+
- - ">="
|
|
110
|
+
- !ruby/object:Gem::Version
|
|
111
|
+
version: '0'
|
|
112
|
+
requirements: []
|
|
113
|
+
rubygems_version: 4.0.9
|
|
114
|
+
specification_version: 4
|
|
115
|
+
summary: BloomFit helps you build correctly sized Bloom filters from expected set
|
|
116
|
+
size and target false positive rate.
|
|
117
|
+
test_files:
|
|
118
|
+
- spec/bloom_fit_spec.rb
|
|
119
|
+
- spec/helper.rb
|