bloom_fit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 11be2e4492a3e06ff7c401ebe04e8f5b462a48d8399dea08ecdc33726cbec31f
4
+ data.tar.gz: 0642065d4004d002fc51cbe07f77a92daf5b84b117abc6684c45f7ca82b7d757
5
+ SHA512:
6
+ metadata.gz: 840f467007a4efc4bcf55c4e29c320d28b16daa819b1f54a1884c43bb5092add5c4522a2e6e524a5516031e48bc937eb4bfddf9e20767d85b18590b1245c1d23
7
+ data.tar.gz: 58b45319a8cc83342ab224f0430bd8849b4655b2be0b1758d788dbc68df7ecca9cc5ab2deb1c33419e07dc6d0f6c8c22a4c158e7001fc696ee695273bb9ad196
data/README.md ADDED
@@ -0,0 +1,78 @@
1
+ # BloomFit makes Bloom Filter tuning easy
2
+
3
+ BloomFit provides an MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but provides a better hashing distribution by using DJB2 over CRC32, avoids the need to supply a seed, removes counting abilities, improves performance for very large datasets, and will automatically calculate the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate.
4
+
5
+ A [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter) is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positives are possible, but false negatives are not. Instead of using k different hash functions, this implementation uses a single DJB2 hash combined with k seeds taken from the CRC table.
6
+
7
+ Performance of the Bloom filter depends on the following:
8
+
9
+ - size of the bit array
10
+ - number of hash functions
11
+
12
+ BloomFit is a fork of [bloomfilter-rb].
13
+
14
+ ## Resources
15
+
16
+ - Background: [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter)
17
+ - Determining parameters: [Scalable Datasets: Bloom Filters in Ruby](http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/)
18
+ - Applications & reasons behind bloom filter: [Flow analysis: Time based bloom filter](http://www.igvita.com/2010/01/06/flow-analysis-time-based-bloom-filters/)
19
+
20
+ ## Examples
21
+
22
+ MRI/C implementation which creates an in-memory filter which can be saved and reloaded from disk.
23
+
24
+ (COMING SOON) If you'd like to specify an expected item count and a false-positive rate that you can tolerate:
25
+
26
+ ```ruby
27
+ require "bloom_fit"
28
+
29
+ bf = BloomFit.new(capacity: 250, false_positive_rate: 0.001)
30
+ bf.add("cat")
31
+ bf.include?("cat") # => true
32
+ bf.include?("dog") # => false
33
+
34
+ # Hash syntax with a bloom filter!
35
+ bf["bird"] = "bar"
36
+ bf["bird"] # => true
37
+ bf["mouse"] # => false
38
+
39
+ bf.stats
40
+ # => Number of filter bits (m): 3600
41
+ # => Number of set bits (n): 20
42
+ # => Number of filter hashes (k) : 10
43
+ # => Predicted false positive rate = 0.00%
44
+ ```
45
+
46
+ If you'd like more control over the traditional inputs like bit size and the number of hashes:
47
+
48
+ ```ruby
49
+ require "bloom_fit"
50
+
51
+ bf = BloomFit.new(size: 100, hashes: 2)
52
+ bf.add("cat")
53
+ bf.include?("cat") # => true
54
+ bf.include?("dog") # => false
55
+
56
+ # Hash syntax with a bloom filter!
57
+ bf["bird"] = "bar"
58
+ bf["bird"] # => true
59
+ bf["mouse"] # => false
60
+
61
+ bf.stats
62
+ # => Number of filter bits (m): 100
63
+ # => Number of set bits (n): 4
64
+ # => Number of filter hashes (k) : 2
65
+ # => Predicted false positive rate = 10.87%
66
+ ```
67
+
68
+ ## Credits
69
+
70
+ - Tatsuya Mori <valdzone@gmail.com> (Original C implementation)
71
+ - Ilya Grigorik [@igrigorik](https://github.com/igrigorik) ([bloomfilter-rb] gem)
72
+ - Bharanee Rathna [@deepfryed](https://github.com/deepfryed) ([bloom-filter](https://github.com/deepfryed/bloom-filter) gem)
73
+
74
+ ## License
75
+
76
+ [MIT License](https://rmm5t.mit-license.org/)
77
+
78
+ [bloomfilter-rb]: https://github.com/igrigorik/bloomfilter-rb
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
require 'bundler/gem_tasks'
require 'rake'
require 'rspec'
require 'rspec/core/rake_task'
require 'rake/extensiontask'

# Requiring 'bundler/gem_tasks' above already installs the gem tasks.
# Calling Bundler::GemHelper.install_tasks a second time would append a
# second action to each of them, so it is intentionally not repeated here.

# Builds the native extension from ext/cbloomfilter.
Rake::ExtensionTask.new('cbloomfilter')

# Specs always run against a freshly cleaned and compiled extension.
RSpec::Core::RakeTask.new(:spec)
Rake::Task[:spec].prerequisites << :clean
Rake::Task[:spec].prerequisites << :compile

desc "Default: run unit tests."
task default: :spec
@@ -0,0 +1,397 @@
1
+ /*
2
+ * cbloomfilter.c - simple Bloom Filter
3
+ * (c) Tatsuya Mori <valdzone@gmail.com>
4
+ */
5
+
6
#include "ruby.h"
#include <limits.h>
#include "crc32.h"
8
+
9
+ #if !defined(RSTRING_LEN)
10
+ # define RSTRING_LEN(x) (RSTRING(x)->len)
11
+ # define RSTRING_PTR(x) (RSTRING(x)->ptr)
12
+ #endif
13
+
14
+ /* Reuse the standard CRC table for consistent seeds */
15
+ static unsigned int *seeds = crc_table;
16
+
17
+ static VALUE cBloomFilter;
18
+
19
/*
 * Native state behind one CBloomFilter object.
 *
 * Buckets are packed back to back into the byte array at `ptr`; bucket
 * number i starts at bit (i * b).
 */
struct BloomFilter {
    int m;              /* number of buckets in the filter */
    int b;              /* width of a single bucket, in bits */
    int k;              /* number of hash functions applied to each key */
    int r;              /* 1: bucket_set() raises when a bucket is already full */
    unsigned char *ptr; /* packed bucket storage */
    int bytes;          /* length of the storage at ptr, in bytes */
};
27
+
28
/*
 * DJB2-style hash over the first `len` bytes at `str`.
 *
 * The accumulator starts at 5381 and each byte is folded in as
 * (h << 5) ^ h ^ byte. A `len` of zero or less returns 5381 unchanged.
 */
unsigned long djb2(unsigned char *str, int len) {
    unsigned long digest = 5381;
    int pos;

    for (pos = 0; pos < len; pos++) {
        digest = (digest << 5) ^ digest ^ str[pos];
    }
    return digest;
}
39
+
40
+ void bits_free(struct BloomFilter *bf) {
41
+ ruby_xfree(bf->ptr);
42
+ }
43
+
44
+ void bucket_unset(struct BloomFilter *bf, int index) {
45
+ int byte_offset = (index * bf->b) / 8;
46
+ int bit_offset = (index * bf->b) % 8;
47
+ unsigned int c = bf->ptr[byte_offset];
48
+ c += bf->ptr[byte_offset + 1] << 8;
49
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
50
+ if ((c & mask) == 0) {
51
+ // do nothing
52
+ } else {
53
+ // reduce the counter: 11 00 => 10 00 (suppose bf->b is 2)
54
+ c -= (1 << bit_offset) & ((1 << 8) -1);
55
+ // shift the bitmap right by 1 bit: 10 00 => 01 00
56
+ c = (~mask & c) | ((c & mask) >> (bit_offset + 1) << bit_offset);
57
+
58
+ bf->ptr[byte_offset] = c & ((1 << 8) - 1);
59
+ bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
60
+ }
61
+ }
62
+
63
+ void bucket_set(struct BloomFilter *bf, int index) {
64
+ int byte_offset = (index * bf->b) / 8;
65
+ int bit_offset = (index * bf->b) % 8;
66
+ unsigned int c = bf->ptr[byte_offset];
67
+ c += bf->ptr[byte_offset + 1] << 8;
68
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
69
+ if ((c & mask) == mask) {
70
+ if (bf->r == 1) rb_raise(rb_eRuntimeError, "bucket got filled up");
71
+ } else {
72
+ c = c + ((1 << bit_offset) & ((1 << 8) -1)) | c;
73
+ bf->ptr[byte_offset] = c & ((1 << 8) - 1);
74
+ bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
75
+ }
76
+ }
77
+
78
+ int bucket_check(struct BloomFilter *bf, int index) {
79
+ int byte_offset = (index * bf->b) / 8;
80
+ int bit_offset = (index * bf->b) % 8;
81
+ unsigned int c = bf->ptr[byte_offset];
82
+ c += bf->ptr[byte_offset + 1] << 8;
83
+
84
+ unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
85
+ return (c & mask) >> bit_offset;
86
+ }
87
+
88
/*
 * Return the raw bits of the bucket at `index` (zero when empty).
 * The computation is the same as bucket_check(), so it is delegated.
 */
int bucket_get(struct BloomFilter *bf, int index) {
    return bucket_check(bf, index);
}
97
+
98
+ static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
99
+ struct BloomFilter *bf;
100
+ VALUE arg1, arg2, arg3, arg4, obj;
101
+ int m, k, b, r;
102
+
103
+ obj = Data_Make_Struct(self, struct BloomFilter, NULL, bits_free, bf);
104
+
105
+ /* default = Fugou approach :-) */
106
+ arg1 = INT2FIX(100000000);
107
+ arg2 = INT2FIX(4);
108
+ arg3 = INT2FIX(1);
109
+ arg4 = INT2FIX(0);
110
+
111
+ switch (argc) {
112
+ case 4:
113
+ if (argv[3] == Qtrue) {
114
+ arg4 = INT2FIX(1);
115
+ }
116
+ case 3:
117
+ arg3 = argv[2];
118
+ case 2:
119
+ arg2 = argv[1];
120
+ case 1:
121
+ arg1 = argv[0];
122
+ break;
123
+ }
124
+
125
+ m = FIX2INT(arg1);
126
+ k = FIX2INT(arg2);
127
+ b = FIX2INT(arg3);
128
+ r = FIX2INT(arg4);
129
+
130
+ if (b < 1 || b > 8)
131
+ rb_raise(rb_eArgError, "bucket size");
132
+ if (m < 1)
133
+ rb_raise(rb_eArgError, "array size");
134
+ if (k < 1)
135
+ rb_raise(rb_eArgError, "hash length");
136
+
137
+ bf->b = b;
138
+ bf->m = m;
139
+ bf->k = k;
140
+ bf->r = r;
141
+
142
+ bf->bytes = ((m * b) + 15) / 8;
143
+ bf->ptr = ALLOC_N(unsigned char, bf->bytes);
144
+
145
+ /* initialize the bits with zeros */
146
+ memset(bf->ptr, 0, bf->bytes);
147
+ rb_iv_set(obj, "@hash_value", rb_hash_new());
148
+
149
+ return obj;
150
+ }
151
+
152
+ static VALUE bf_clear(VALUE self) {
153
+ struct BloomFilter *bf;
154
+ Data_Get_Struct(self, struct BloomFilter, bf);
155
+ memset(bf->ptr, 0, bf->bytes);
156
+ return Qtrue;
157
+ }
158
+
159
+ static VALUE bf_m(VALUE self) {
160
+ struct BloomFilter *bf;
161
+ Data_Get_Struct(self, struct BloomFilter, bf);
162
+ return INT2FIX(bf->m);
163
+ }
164
+
165
+ static VALUE bf_k(VALUE self) {
166
+ struct BloomFilter *bf;
167
+ Data_Get_Struct(self, struct BloomFilter, bf);
168
+ return INT2FIX(bf->k);
169
+ }
170
+
171
+ static VALUE bf_b(VALUE self) {
172
+ struct BloomFilter *bf;
173
+ Data_Get_Struct(self, struct BloomFilter, bf);
174
+ return INT2FIX(bf->b);
175
+ }
176
+
177
+ static VALUE bf_r(VALUE self) {
178
+ struct BloomFilter *bf;
179
+ Data_Get_Struct(self, struct BloomFilter, bf);
180
+ return bf->r == 0 ? Qfalse : Qtrue;
181
+ }
182
+
183
+ static VALUE bf_set_bits(VALUE self){
184
+ struct BloomFilter *bf;
185
+ int i,j,count = 0;
186
+ Data_Get_Struct(self, struct BloomFilter, bf);
187
+ for (i = 0; i < bf->bytes; i++) {
188
+ for (j = 0; j < 8; j++) {
189
+ count += (bf->ptr[i] >> j) & 1;
190
+ }
191
+ }
192
+ return INT2FIX(count);
193
+ }
194
+
195
+ static VALUE bf_insert(VALUE self, VALUE key) {
196
+ VALUE skey;
197
+ unsigned long hash, index;
198
+ int i, len, m, k;
199
+ char *ckey;
200
+ struct BloomFilter *bf;
201
+ Data_Get_Struct(self, struct BloomFilter, bf);
202
+
203
+ skey = rb_obj_as_string(key);
204
+ ckey = StringValuePtr(skey);
205
+ len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
206
+
207
+ m = bf->m;
208
+ k = bf->k;
209
+
210
+ hash = (unsigned long) djb2(ckey, len);
211
+ for (i = 0; i <= k - 1; i++) {
212
+ index = (unsigned long) (hash ^ seeds[i]) % (unsigned int) (m);
213
+
214
+ /* set a bit at the index */
215
+ bucket_set(bf, index);
216
+ }
217
+
218
+ return Qnil;
219
+ }
220
+
221
+ static VALUE bf_merge(VALUE self, VALUE other) {
222
+ struct BloomFilter *bf, *target;
223
+ int i;
224
+ Data_Get_Struct(self, struct BloomFilter, bf);
225
+ Data_Get_Struct(other, struct BloomFilter, target);
226
+ for (i = 0; i < bf->bytes; i++) {
227
+ bf->ptr[i] |= target->ptr[i];
228
+ }
229
+ return Qnil;
230
+ }
231
+
232
+ static VALUE bf_and(VALUE self, VALUE other) {
233
+ struct BloomFilter *bf, *bf_other, *target;
234
+ VALUE klass, obj, args[5];
235
+ int i;
236
+
237
+ Data_Get_Struct(self, struct BloomFilter, bf);
238
+ Data_Get_Struct(other, struct BloomFilter, bf_other);
239
+ args[0] = INT2FIX(bf->m);
240
+ args[1] = INT2FIX(bf->k);
241
+ args[2] = INT2FIX(bf->b);
242
+ args[3] = INT2FIX(bf->r);
243
+ klass = rb_funcall(self,rb_intern("class"),0);
244
+ obj = bf_s_new(4,args,klass);
245
+ Data_Get_Struct(obj, struct BloomFilter, target);
246
+ for (i = 0; i < bf->bytes; i++){
247
+ target->ptr[i] = bf->ptr[i] & bf_other->ptr[i];
248
+ }
249
+
250
+ return obj;
251
+ }
252
+
253
+ static VALUE bf_or(VALUE self, VALUE other) {
254
+ struct BloomFilter *bf, *bf_other, *target;
255
+ VALUE klass, obj, args[5];
256
+ int i;
257
+
258
+ Data_Get_Struct(self, struct BloomFilter, bf);
259
+ Data_Get_Struct(other, struct BloomFilter, bf_other);
260
+ args[0] = INT2FIX(bf->m);
261
+ args[1] = INT2FIX(bf->k);
262
+ args[2] = INT2FIX(bf->b);
263
+ args[3] = INT2FIX(bf->r);
264
+ klass = rb_funcall(self,rb_intern("class"),0);
265
+ obj = bf_s_new(4,args,klass);
266
+ Data_Get_Struct(obj, struct BloomFilter, target);
267
+ for (i = 0; i < bf->bytes; i++){
268
+ target->ptr[i] = bf->ptr[i] | bf_other->ptr[i];
269
+ }
270
+
271
+ return obj;
272
+ }
273
+
274
+ static VALUE bf_delete(VALUE self, VALUE key) {
275
+ unsigned long hash, index;
276
+ int i, len, m, k;
277
+ char *ckey;
278
+ VALUE skey;
279
+ struct BloomFilter *bf;
280
+ Data_Get_Struct(self, struct BloomFilter, bf);
281
+
282
+ skey = rb_obj_as_string(key);
283
+ ckey = StringValuePtr(skey);
284
+ len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
285
+
286
+ m = bf->m;
287
+ k = bf->k;
288
+
289
+ hash = (unsigned long) djb2(ckey, len);
290
+ for (i = 0; i <= k - 1; i++) {
291
+ index = (unsigned long) (hash ^ seeds[i]) % (unsigned int) (m);
292
+
293
+ /* set a bit at the index */
294
+ bucket_unset(bf, index);
295
+ }
296
+
297
+ return Qnil;
298
+ }
299
+
300
+
301
+ static VALUE bf_include(int argc, VALUE* argv, VALUE self) {
302
+ unsigned long hash, index;
303
+ int i, len, m, k, tests_idx, vlen;
304
+ char *ckey;
305
+ VALUE tests, key, skey;
306
+ struct BloomFilter *bf;
307
+
308
+ rb_scan_args(argc, argv, "*", &tests);
309
+
310
+ Data_Get_Struct(self, struct BloomFilter, bf);
311
+ vlen = RARRAY_LEN(tests);
312
+ for(tests_idx = 0; tests_idx < vlen; tests_idx++) {
313
+ key = rb_ary_entry(tests, tests_idx);
314
+ skey = rb_obj_as_string(key);
315
+ ckey = StringValuePtr(skey);
316
+ len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
317
+
318
+ m = bf->m;
319
+ k = bf->k;
320
+
321
+ hash = (unsigned long) djb2(ckey, len);
322
+ for (i = 0; i <= k - 1; i++) {
323
+ index = (unsigned long) (hash ^ seeds[i]) % (unsigned int) (m);
324
+
325
+ /* check the bit at the index */
326
+ if (!bucket_check(bf, index)) {
327
+ return Qfalse; /* i.e., it is a new entry ; escape the loop */
328
+ }
329
+ }
330
+
331
+ return Qtrue;
332
+ }
333
+ }
334
+
335
+ static VALUE bf_to_s(VALUE self) {
336
+ struct BloomFilter *bf;
337
+ unsigned char *ptr;
338
+ int i;
339
+ VALUE str;
340
+
341
+ Data_Get_Struct(self, struct BloomFilter, bf);
342
+ str = rb_str_new(0, bf->m);
343
+
344
+ ptr = (unsigned char *) RSTRING_PTR(str);
345
+ for (i = 0; i < bf->m; i++)
346
+ *ptr++ = bucket_get(bf, i) ? '1' : '0';
347
+
348
+ return str;
349
+ }
350
+
351
+ static VALUE bf_bitmap(VALUE self) {
352
+ struct BloomFilter *bf;
353
+ Data_Get_Struct(self, struct BloomFilter, bf);
354
+
355
+ VALUE str = rb_str_new(0, bf->bytes);
356
+ unsigned char* ptr = (unsigned char *) RSTRING_PTR(str);
357
+
358
+ memcpy(ptr, bf->ptr, bf->bytes);
359
+
360
+ return str;
361
+ }
362
+
363
+ static VALUE bf_load(VALUE self, VALUE bitmap) {
364
+ struct BloomFilter *bf;
365
+ Data_Get_Struct(self, struct BloomFilter, bf);
366
+ unsigned char* ptr = (unsigned char *) RSTRING_PTR(bitmap);
367
+
368
+ memcpy(bf->ptr, ptr, bf->bytes);
369
+
370
+ return Qnil;
371
+ }
372
+
373
+ void Init_cbloomfilter(void) {
374
+ cBloomFilter = rb_define_class("CBloomFilter", rb_cObject);
375
+ rb_define_singleton_method(cBloomFilter, "new", bf_s_new, -1);
376
+ rb_define_method(cBloomFilter, "m", bf_m, 0);
377
+ rb_define_method(cBloomFilter, "k", bf_k, 0);
378
+ rb_define_method(cBloomFilter, "b", bf_b, 0);
379
+ rb_define_method(cBloomFilter, "r", bf_r, 0);
380
+ rb_define_method(cBloomFilter, "set_bits", bf_set_bits, 0);
381
+ /* rb_define_method(cBloomFilter, "s", bf_s, 0); */
382
+ rb_define_method(cBloomFilter, "insert", bf_insert, 1);
383
+ rb_define_method(cBloomFilter, "delete", bf_delete, 1);
384
+ rb_define_method(cBloomFilter, "include?", bf_include, -1);
385
+ rb_define_method(cBloomFilter, "clear", bf_clear, 0);
386
+ rb_define_method(cBloomFilter, "merge!", bf_merge, 1);
387
+ rb_define_method(cBloomFilter, "&", bf_and, 1);
388
+ rb_define_method(cBloomFilter, "|", bf_or, 1);
389
+
390
+ rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
391
+ rb_define_method(cBloomFilter, "bitmap", bf_bitmap, 0);
392
+ rb_define_method(cBloomFilter, "load", bf_load, 1);
393
+
394
+ /* functions that have not been implemented, yet */
395
+
396
+ // rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
397
+ }
@@ -0,0 +1,76 @@
1
+ /* CRC-32 lookup table (256 entries) */
2
+ /*
3
+ * Copyright 2005 Aris Adamantiadis
4
+ *
5
+ * This file is part of the SSH Library
6
+ *
7
+ * The SSH Library is free software; you can redistribute it and/or modify
8
+ * it under the terms of the GNU Lesser General Public License as published by
9
+ * the Free Software Foundation; either version 2.1 of the License, or (at your
10
+ * option) any later version.
11
+ *
12
+ *
13
+ * The SSH Library is distributed in the hope that it will be useful, but
14
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
+ * License for more details.
17
+ *
18
+ * You should have received a copy of the GNU Lesser General Public License
19
+ * along with the SSH Library; see the file COPYING. If not, write to
20
+ * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
+ * MA 02111-1307, USA. */
22
+
23
+ static unsigned int crc_table[] = {
24
+ 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
25
+ 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
26
+ 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
27
+ 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
28
+ 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
29
+ 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
30
+ 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
31
+ 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
32
+ 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
33
+ 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
34
+ 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
35
+ 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
36
+ 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
37
+ 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
38
+ 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
39
+ 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
40
+ 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
41
+ 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
42
+ 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
43
+ 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
44
+ 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
45
+ 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
46
+ 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
47
+ 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
48
+ 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
49
+ 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
50
+ 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
51
+ 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
52
+ 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
53
+ 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
54
+ 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
55
+ 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
56
+ 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
57
+ 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
58
+ 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
59
+ 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
60
+ 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
61
+ 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
62
+ 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
63
+ 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
64
+ 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
65
+ 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
66
+ 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
67
+ 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
68
+ 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
69
+ 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
70
+ 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
71
+ 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
72
+ 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
73
+ 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
74
+ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
75
+ 0x2d02ef8dUL
76
+ };
@@ -0,0 +1,4 @@
1
#!/usr/bin/env ruby
# Generates the Makefile used to build the "cbloomfilter" native extension.
require "mkmf"

create_makefile "cbloomfilter"
@@ -0,0 +1,3 @@
1
class BloomFit
  # Gem version string (frozen).
  VERSION = '0.1.0'.freeze
end
data/lib/bloom_fit.rb ADDED
@@ -0,0 +1,107 @@
1
+ require 'cbloomfilter'
2
+ require 'bloom_fit/version'
3
+
4
# Ruby front end for the native CBloomFilter.
#
# Options (all optional):
#   :size   - number of buckets in the filter (default 100)
#   :hashes - number of hash functions (default 4)
#   :bucket - number of bits per bucket (default 1)
#   :raise  - whether to raise on bucket overflow (default false)
class BloomFit
  # Raised when two filters with different parameters are combined.
  BloomFit::ConfigurationMismatch = Class.new(ArgumentError)

  # The underlying CBloomFilter instance.
  attr_reader :bf

  def initialize(opts = {})
    @opts = {
      :size => 100,
      :hashes => 4,
      :bucket => 1,
      :raise => false
    }.merge(opts)

    # arg 1: m => size : number of buckets in a bloom filter
    # arg 2: k => hashes : number of hash functions
    # arg 3: b => bucket : number of bits per bucket
    # arg 4: r => raise : whether to raise on bucket overflow

    @bf = CBloomFilter.new(@opts[:size], @opts[:hashes], @opts[:bucket], @opts[:raise])
  end

  # Adds +key+ (converted to a string by the native code) to the filter.
  def insert(key)
    @bf.insert(key)
  end

  # Hash-style insertion: <tt>bf["bird"] = anything</tt>.
  # The assigned value is ignored; only the key is stored. Aliasing this to
  # #insert made every <tt>bf[key] = value</tt> raise ArgumentError because
  # Ruby passes both the key and the value.
  def []=(key, _value = nil)
    insert(key)
  end

  # Delegates to CBloomFilter#include? with all given keys.
  def include?(*keys)
    @bf.include?(*keys)
  end
  alias :key? :include?
  alias :[] :include?

  def delete(key); @bf.delete(key); end
  def clear; @bf.clear; end

  # Same value as #set_bits: the number of 1 bits, not the number of keys.
  def size; @bf.set_bits; end

  # ORs the bits of +o+ into this filter.
  # Raises +BloomFit::ConfigurationMismatch+ unless both filters share the
  # same parameters; the native merge walks both bitmaps using this
  # filter's length.
  def merge!(o)
    raise BloomFit::ConfigurationMismatch.new unless same_parameters?(o)
    @bf.merge!(o.bf)
  end

  # Returns the number of bits that are set to 1 in the filter.
  def set_bits
    @bf.set_bits
  end

  # Computes the intersection of two Bloom filters.
  # It assumes that both filters have the same size -
  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
  def &(o)
    raise BloomFit::ConfigurationMismatch.new unless same_parameters?(o)
    # Build the result from this filter's options so that #stats and
    # #marshal_dump describe the bitmap it actually carries.
    result = self.class.new(@opts)
    result.instance_variable_set(:@bf, @bf.&(o.bf))
    result
  end

  # Computes the union of two Bloom filters.
  # It assumes that both filters have the same size -
  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
  def |(o)
    raise BloomFit::ConfigurationMismatch.new unless same_parameters?(o)
    # See #& for why the options are passed along.
    result = self.class.new(@opts)
    result.instance_variable_set(:@bf, @bf.|(o.bf))
    result
  end

  # Raw bitmap of the native filter as a binary string.
  def bitmap
    @bf.bitmap
  end

  # Rebuilds the filter from the [opts, bitmap] pair written by #marshal_dump.
  def marshal_load(ary)
    opts, bitmap = *ary

    initialize(opts)
    @bf.load(bitmap) unless bitmap.nil?
  end

  def marshal_dump
    [@opts, @bf.bitmap]
  end

  # Loads a filter previously written with #save.
  # NOTE: this uses Marshal.load, which must only be given trusted files.
  def self.load(filename)
    # Binary mode, and the block form closes the file handle.
    File.open(filename, 'rb') do |f|
      Marshal.load(f)
    end
  end

  # Writes the marshalled filter to +filename+ (binary mode).
  def save(filename)
    File.open(filename, 'wb') do |f|
      f << Marshal.dump(self)
    end
  end

  # Prints the filter parameters and a predicted false positive rate.
  def stats
    # NOTE(review): +size+ is the number of set bits, not the number of
    # inserted keys - confirm that is the intended input to this estimate.
    fp = ((1.0 - Math.exp(-(@opts[:hashes] * size).to_f / @opts[:size])) ** @opts[:hashes]) * 100
    printf "Number of filter buckets (m): %d\n", @opts[:size]
    printf "Number of bits per buckets (b): %d\n", @opts[:bucket]
    printf "Number of set bits (n): %d\n", set_bits
    printf "Number of filter hashes (k) : %d\n", @opts[:hashes]
    printf "Predicted false positive rate = %.2f%%\n", fp
  end

  protected

  # Returns true if parameters of the +o+ther filter are
  # the same.
  def same_parameters?(o)
    @bf.m == o.bf.m && @bf.k == o.bf.k && @bf.b == o.bf.b
  end
end
@@ -0,0 +1,152 @@
1
+ require "helper"
2
+
3
+ describe BloomFit do
4
+ it "should clear" do
5
+ bf = BloomFit.new(:size => 100, :hashes => 2, :bucket => 3, :raise => false)
6
+ bf.insert("test")
7
+ expect(bf.include?("test")).to be true
8
+ bf.clear
9
+ expect(bf.include?("test")).to be false
10
+ end
11
+
12
+ it "should merge" do
13
+ bf1 = BloomFit.new(:size => 100, :hashes => 2, :bucket => 3, :raise => false)
14
+ bf2 = BloomFit.new(:size => 100, :hashes => 2, :bucket => 3, :raise => false)
15
+ bf2.insert("test")
16
+ expect(bf1.include?("test")).to be false
17
+ bf1.merge!(bf2)
18
+ expect(bf1.include?("test")).to be true
19
+ expect(bf2.include?("test")).to be true
20
+ end
21
+
22
+ context "behave like a bloom filter" do
23
+ it "should test set membership" do
24
+ bf = BloomFit.new(:size => 100, :hashes => 2, :bucket => 3, :raise => false)
25
+ bf.insert("test")
26
+ bf.insert("test1")
27
+
28
+ expect(bf.include?("test")).to be true
29
+ expect(bf.include?("abcd")).to be false
30
+ expect(bf.include?("test", "test1")).to be true
31
+ end
32
+
33
+ it "should work with any object's to_s" do
34
+ subject.insert(:test)
35
+ subject.insert(:test1)
36
+ subject.insert(12345)
37
+
38
+ expect(subject.include?("test")).to be true
39
+ expect(subject.include?("abcd")).to be false
40
+ expect(subject.include?("test", "test1", '12345')).to be true
41
+ end
42
+
43
+ it "should return the number of bits set to 1" do
44
+ bf = BloomFit.new(:hashes => 4)
45
+ bf.insert("test")
46
+ expect(bf.set_bits).to be == 4
47
+ bf.delete("test")
48
+ expect(bf.set_bits).to be == 0
49
+
50
+ bf = BloomFit.new(:hashes => 1)
51
+ bf.insert("test")
52
+ expect(bf.set_bits).to be == 1
53
+ end
54
+
55
+ it "should return intersection with other filter" do
56
+ bf1 = BloomFit.new
57
+ bf1.insert("test")
58
+ bf1.insert("test1")
59
+
60
+ bf2 = BloomFit.new
61
+ bf2.insert("test")
62
+ bf2.insert("test2")
63
+
64
+ bf3 = bf1 & bf2
65
+ expect(bf3.include?("test")).to be true
66
+ expect(bf3.include?("test1")).to be false
67
+ expect(bf3.include?("test2")).to be false
68
+ end
69
+
70
+ it "should raise an exception when intersection is to be computed for incompatible filters" do
71
+ bf1 = BloomFit.new(:size => 10)
72
+ bf1.insert("test")
73
+
74
+ bf2 = BloomFit.new(:size => 20)
75
+ bf2.insert("test")
76
+
77
+ expect { bf1 & bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
78
+ end
79
+
80
+ it "should return union with other filter" do
81
+ bf1 = BloomFit.new
82
+ bf1.insert("test")
83
+ bf1.insert("test1")
84
+
85
+ bf2 = BloomFit.new
86
+ bf2.insert("test")
87
+ bf2.insert("test2")
88
+
89
+ bf3 = bf1 | bf2
90
+ expect(bf3.include?("test")).to be true
91
+ expect(bf3.include?("test1")).to be true
92
+ expect(bf3.include?("test2")).to be true
93
+ end
94
+
95
+ it "should raise an exception when union is to be computed for incompatible filters" do
96
+ bf1 = BloomFit.new(:size => 10)
97
+ bf1.insert("test")
98
+
99
+ bf2 = BloomFit.new(:size => 20)
100
+ bf2.insert("test")
101
+
102
+ expect {bf1 | bf2}.to raise_error(BloomFit::ConfigurationMismatch)
103
+ end
104
+
105
+ it "should output current stats" do
106
+ subject.insert('test')
107
+ expect { subject.stats }.not_to raise_error
108
+ end
109
+ end
110
+
111
+ context "behave like counting bloom filter" do
112
+ it "should delete / decrement keys" do
113
+ subject.insert("test")
114
+ expect(subject.include?("test")).to be true
115
+
116
+ subject.delete("test")
117
+ expect(subject.include?("test")).to be false
118
+ end
119
+ end
120
+
121
+ context "serialize" do
122
+ after(:each) { File.unlink('bf.out') }
123
+
124
+ it "should marshall" do
125
+ bf = BloomFit.new
126
+ expect { bf.save('bf.out') }.not_to raise_error
127
+ end
128
+
129
+ it "should load from marshalled" do
130
+ subject.insert('foo')
131
+ subject.insert('bar')
132
+ subject.save('bf.out')
133
+
134
+ bf2 = BloomFit.load('bf.out')
135
+ expect(bf2.include?('foo')).to be true
136
+ expect(bf2.include?('bar')).to be true
137
+ expect(bf2.include?('baz')).to be false
138
+
139
+ expect(subject.send(:same_parameters?, bf2)).to be true
140
+ end
141
+
142
+ it "should serialize to a file size proporational its bucket size" do
143
+ fs_size = 0
144
+ 8.times do |i|
145
+ bf = BloomFit.new(size: 10_000, bucket: i+1)
146
+ bf.save('bf.out')
147
+ prev_size, fs_size = fs_size, File.size('bf.out')
148
+ expect(prev_size).to be < fs_size
149
+ end
150
+ end
151
+ end
152
+ end
data/spec/helper.rb ADDED
@@ -0,0 +1,2 @@
1
+ require "bundler/setup"
2
+ require "bloom_fit"
metadata ADDED
@@ -0,0 +1,119 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bloom_fit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ilya Grigorik
8
+ - Tatsuya Mori
9
+ - Ryan McGeary
10
+ - Beshad Talayeminaei
11
+ bindir: bin
12
+ cert_chain: []
13
+ date: 1980-01-02 00:00:00.000000000 Z
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: irb
17
+ requirement: !ruby/object:Gem::Requirement
18
+ requirements:
19
+ - - ">="
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ version: '0'
29
+ - !ruby/object:Gem::Dependency
30
+ name: rake
31
+ requirement: !ruby/object:Gem::Requirement
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: '0'
36
+ type: :development
37
+ prerelease: false
38
+ version_requirements: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ - !ruby/object:Gem::Dependency
44
+ name: rake-compiler
45
+ requirement: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - ">="
48
+ - !ruby/object:Gem::Version
49
+ version: '0'
50
+ type: :development
51
+ prerelease: false
52
+ version_requirements: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ - !ruby/object:Gem::Dependency
58
+ name: rspec
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '3'
64
+ type: :development
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '3'
71
+ email:
72
+ - ilya@grigorik.com
73
+ - valdzone@gmail.com
74
+ - ryan@mcgeary.org
75
+ - 'btalayeminaei@gmail.com '
76
+ executables: []
77
+ extensions:
78
+ - ext/cbloomfilter/extconf.rb
79
+ extra_rdoc_files: []
80
+ files:
81
+ - README.md
82
+ - Rakefile
83
+ - ext/cbloomfilter/cbloomfilter.c
84
+ - ext/cbloomfilter/crc32.h
85
+ - ext/cbloomfilter/extconf.rb
86
+ - lib/bloom_fit.rb
87
+ - lib/bloom_fit/version.rb
88
+ - spec/bloom_fit_spec.rb
89
+ - spec/helper.rb
90
+ homepage: https://github.com/rmm5t/bloom_fit
91
+ licenses: []
92
+ metadata:
93
+ homepage_uri: https://github.com/rmm5t/bloom_fit
94
+ bug_tracker_uri: https://github.com/rmm5t/bloom_fit/issues
95
+ changelog_uri: https://github.com/rmm5t/bloom_fit/blob/main/CHANGELOG.md
96
+ source_code_uri: https://github.com/rmm5t/bloom_fit
97
+ funding_uri: https://github.com/sponsors/rmm5t
98
+ rubygems_mfa_required: 'true'
99
+ rdoc_options: []
100
+ require_paths:
101
+ - lib
102
+ required_ruby_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - ">="
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ required_rubygems_version: !ruby/object:Gem::Requirement
108
+ requirements:
109
+ - - ">="
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ requirements: []
113
+ rubygems_version: 4.0.9
114
+ specification_version: 4
115
+ summary: BloomFit helps you build correctly sized Bloom filters from expected set
116
+ size and target false positive rate.
117
+ test_files:
118
+ - spec/bloom_fit_spec.rb
119
+ - spec/helper.rb