bloom_fit 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5f1a3e06592409a17a287562a4f51910e4c103cc7e7a95fa18b051e684ec2f72
4
- data.tar.gz: 85dab6561d4626f1ece379cbdd63678befac1e19d8f3cc403a270a13da5ca049
3
+ metadata.gz: f542d198165a81ecdc9307e3d2b9a9168608197c117245cb89b087f5fde31081
4
+ data.tar.gz: 60c9bed4dfbf8b6d5e8d4cb47350b9ec31ee22ce9eae3fbcc92628cc8e4aed53
5
5
  SHA512:
6
- metadata.gz: 2553c5c3ce8bff634d2d2f79bc58d9d05ee96426e681c3bdb3295c762fc01c9b2de0fe5aa97cc6755037a7c8a80cc4d367c31b6457e8b087489d36797ad1a598
7
- data.tar.gz: 803c578af7494501775e52bb4db1ea46b5311c7280d20aaa632185d486dcd35ce60390a48d3ca4af2cb6adec4b9cbcc7aa7c922fcc5d9e58b0d7cdeb60a12fa7
6
+ metadata.gz: 55e33f10d0c71aa77bece3ba974995144f44cfc644d7bbf773de9b5ea562078df4511905de6ae87f15a9b95c12975fd423c1fbe6fccfb22c4a0073b2cdf66362
7
+ data.tar.gz: a2cca2d8c5c2ea66979ad93030b75fc7e64f5258650dc43b13e1cbf7080ee32ae44f77ac205b9055d3174d2e95dbe32fff61b949e27fc5f3306a3a332673bf57
data/README.md CHANGED
@@ -25,7 +25,7 @@ BloomFit is a fork of [bloomfilter-rb].
25
25
 
26
26
  MRI/C implementation which creates an in-memory filter which can be saved and reloaded from disk.
27
27
 
28
- (COMING SOON) If you'd like to specify an expected item count and a false-positive rate that you can tolerate:
28
+ (COMING SOON) If you'd like to specify an expected item count and a false-positive rate that you can tolerate. Visit the [Bloom Filter Calculator](https://hur.st/bloomfilter/) to learn more.
29
29
 
30
30
  ```ruby
31
31
  require "bloom_fit"
@@ -11,8 +11,8 @@
11
11
  # define RSTRING_PTR(x) (RSTRING(x)->ptr)
12
12
  #endif
13
13
 
14
- /* Reuse the standard CRC table for consistent seeds */
15
- static unsigned int *seeds = crc_table;
14
+ /* Reuse the standard CRC table for consistent salts */
15
+ static unsigned int *salts = crc_table;
16
16
 
17
17
  static VALUE cBloomFilter;
18
18
 
@@ -20,25 +20,64 @@ struct BloomFilter {
20
20
  int m; /* # of buckets in a bloom filter */
21
21
  int b; /* # of bits in a bloom filter bucket */
22
22
  int k; /* # of hash functions */
23
- int r; /* # raise on bucket overflow? */
24
23
  unsigned char *ptr; /* bits data */
25
24
  int bytes; /* size of byte data */
26
25
  };
27
26
 
28
- unsigned long djb2(unsigned char *str, int len) {
27
+ unsigned long djb2(const char *str, int len) {
29
28
  unsigned long hash = 5381;
30
- unsigned char *c;
31
- c = (unsigned char *) str;
32
- while (len > 0) {
33
- hash = ((hash << 5) ^ hash) ^ (*c);
34
- --len;
35
- ++c;
29
+ for (int i = 0; i < len; i++) {
30
+ hash = ((hash << 5) + hash) + str[i];
36
31
  }
37
32
  return hash;
38
33
  }
39
34
 
40
- void bits_free(struct BloomFilter *bf) {
35
+ static void bf_free(void *ptr) {
36
+ struct BloomFilter *bf = ptr;
37
+
38
+ if (bf == NULL) {
39
+ return;
40
+ }
41
+
41
42
  ruby_xfree(bf->ptr);
43
+ ruby_xfree(bf);
44
+ }
45
+
46
+ static size_t bf_memsize(const void *ptr) {
47
+ const struct BloomFilter *bf = ptr;
48
+
49
+ if (bf == NULL) {
50
+ return 0;
51
+ }
52
+
53
+ return sizeof(*bf) + (bf->ptr == NULL ? 0 : (size_t) bf->bytes);
54
+ }
55
+
56
+ static const rb_data_type_t bf_type = {
57
+ "CBloomFilter",
58
+ {0, bf_free, bf_memsize,},
59
+ 0, 0, RUBY_TYPED_FREE_IMMEDIATELY,
60
+ };
61
+
62
+ static struct BloomFilter *bf_ptr(VALUE obj) {
63
+ struct BloomFilter *bf;
64
+
65
+ TypedData_Get_Struct(obj, struct BloomFilter, &bf_type, bf);
66
+
67
+ return bf;
68
+ }
69
+
70
+ static VALUE bf_alloc(VALUE klass) {
71
+ struct BloomFilter *bf;
72
+ VALUE obj = TypedData_Make_Struct(klass, struct BloomFilter, &bf_type, bf);
73
+
74
+ bf->m = 0;
75
+ bf->b = 0;
76
+ bf->k = 0;
77
+ bf->ptr = NULL;
78
+ bf->bytes = 0;
79
+
80
+ return obj;
42
81
  }
43
82
 
44
83
  void bucket_unset(struct BloomFilter *bf, int index) {
@@ -66,9 +105,7 @@ void bucket_set(struct BloomFilter *bf, int index) {
66
105
  unsigned int c = bf->ptr[byte_offset];
67
106
  c += bf->ptr[byte_offset + 1] << 8;
68
107
  unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
69
- if ((c & mask) == mask) {
70
- if (bf->r == 1) rb_raise(rb_eRuntimeError, "bucket got filled up");
71
- } else {
108
+ if ((c & mask) != mask) {
72
109
  c = c + ((1 << bit_offset) & ((1 << 8) -1)) | c;
73
110
  bf->ptr[byte_offset] = c & ((1 << 8) - 1);
74
111
  bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
@@ -85,36 +122,18 @@ int bucket_check(struct BloomFilter *bf, int index) {
85
122
  return (c & mask) >> bit_offset;
86
123
  }
87
124
 
88
- int bucket_get(struct BloomFilter *bf, int index) {
89
- int byte_offset = (index * bf->b) / 8;
90
- int bit_offset = (index * bf->b) % 8;
91
- unsigned int c = bf->ptr[byte_offset];
92
- c += bf->ptr[byte_offset + 1] << 8;
93
-
94
- unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
95
- return (c & mask) >> bit_offset;
96
- }
97
-
98
- static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
125
+ static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
99
126
  struct BloomFilter *bf;
100
- VALUE arg1, arg2, arg3, arg4, obj;
101
- int m, k, b, r;
127
+ VALUE arg1, arg2;
128
+ int m, k, b;
102
129
 
103
- obj = Data_Make_Struct(self, struct BloomFilter, NULL, bits_free, bf);
130
+ bf = bf_ptr(self);
104
131
 
105
132
  /* default = Fugou approach :-) */
106
- arg1 = INT2FIX(100000000);
133
+ arg1 = INT2FIX(1000);
107
134
  arg2 = INT2FIX(4);
108
- arg3 = INT2FIX(1);
109
- arg4 = INT2FIX(0);
110
135
 
111
136
  switch (argc) {
112
- case 4:
113
- if (argv[3] == Qtrue) {
114
- arg4 = INT2FIX(1);
115
- }
116
- case 3:
117
- arg3 = argv[2];
118
137
  case 2:
119
138
  arg2 = argv[1];
120
139
  case 1:
@@ -124,11 +143,8 @@ static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
124
143
 
125
144
  m = FIX2INT(arg1);
126
145
  k = FIX2INT(arg2);
127
- b = FIX2INT(arg3);
128
- r = FIX2INT(arg4);
146
+ b = 1;
129
147
 
130
- if (b < 1 || b > 8)
131
- rb_raise(rb_eArgError, "bucket size");
132
148
  if (m < 1)
133
149
  rb_raise(rb_eArgError, "array size");
134
150
  if (k < 1)
@@ -137,53 +153,39 @@ static VALUE bf_s_new(int argc, VALUE *argv, VALUE self) {
137
153
  bf->b = b;
138
154
  bf->m = m;
139
155
  bf->k = k;
140
- bf->r = r;
141
156
 
157
+ ruby_xfree(bf->ptr);
158
+ bf->ptr = NULL;
159
+ bf->bytes = 0;
142
160
  bf->bytes = ((m * b) + 15) / 8;
143
161
  bf->ptr = ALLOC_N(unsigned char, bf->bytes);
144
162
 
145
163
  /* initialize the bits with zeros */
146
164
  memset(bf->ptr, 0, bf->bytes);
147
- rb_iv_set(obj, "@hash_value", rb_hash_new());
165
+ rb_iv_set(self, "@hash_value", rb_hash_new());
148
166
 
149
- return obj;
167
+ return self;
150
168
  }
151
169
 
152
170
  static VALUE bf_clear(VALUE self) {
153
- struct BloomFilter *bf;
154
- Data_Get_Struct(self, struct BloomFilter, bf);
171
+ struct BloomFilter *bf = bf_ptr(self);
155
172
  memset(bf->ptr, 0, bf->bytes);
156
173
  return Qtrue;
157
174
  }
158
175
 
159
176
  static VALUE bf_m(VALUE self) {
160
- struct BloomFilter *bf;
161
- Data_Get_Struct(self, struct BloomFilter, bf);
177
+ struct BloomFilter *bf = bf_ptr(self);
162
178
  return INT2FIX(bf->m);
163
179
  }
164
180
 
165
181
  static VALUE bf_k(VALUE self) {
166
- struct BloomFilter *bf;
167
- Data_Get_Struct(self, struct BloomFilter, bf);
182
+ struct BloomFilter *bf = bf_ptr(self);
168
183
  return INT2FIX(bf->k);
169
184
  }
170
185
 
171
- static VALUE bf_b(VALUE self) {
172
- struct BloomFilter *bf;
173
- Data_Get_Struct(self, struct BloomFilter, bf);
174
- return INT2FIX(bf->b);
175
- }
176
-
177
- static VALUE bf_r(VALUE self) {
178
- struct BloomFilter *bf;
179
- Data_Get_Struct(self, struct BloomFilter, bf);
180
- return bf->r == 0 ? Qfalse : Qtrue;
181
- }
182
-
183
186
  static VALUE bf_set_bits(VALUE self){
184
- struct BloomFilter *bf;
187
+ struct BloomFilter *bf = bf_ptr(self);
185
188
  int i,j,count = 0;
186
- Data_Get_Struct(self, struct BloomFilter, bf);
187
189
  for (i = 0; i < bf->bytes; i++) {
188
190
  for (j = 0; j < 8; j++) {
189
191
  count += (bf->ptr[i] >> j) & 1;
@@ -194,11 +196,11 @@ static VALUE bf_set_bits(VALUE self){
194
196
 
195
197
  static VALUE bf_insert(VALUE self, VALUE key) {
196
198
  VALUE skey;
197
- unsigned long hash, index;
199
+ unsigned long hash;
200
+ int index;
198
201
  int i, len, m, k;
199
202
  char *ckey;
200
- struct BloomFilter *bf;
201
- Data_Get_Struct(self, struct BloomFilter, bf);
203
+ struct BloomFilter *bf = bf_ptr(self);
202
204
 
203
205
  skey = rb_obj_as_string(key);
204
206
  ckey = StringValuePtr(skey);
@@ -209,7 +211,7 @@ static VALUE bf_insert(VALUE self, VALUE key) {
209
211
 
210
212
  hash = (unsigned long) djb2(ckey, len);
211
213
  for (i = 0; i <= k - 1; i++) {
212
- index = (unsigned long) (hash ^ seeds[i]) % (unsigned int) (m);
214
+ index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
213
215
 
214
216
  /* set a bit at the index */
215
217
  bucket_set(bf, index);
@@ -219,10 +221,9 @@ static VALUE bf_insert(VALUE self, VALUE key) {
219
221
  }
220
222
 
221
223
  static VALUE bf_merge(VALUE self, VALUE other) {
222
- struct BloomFilter *bf, *target;
224
+ struct BloomFilter *bf = bf_ptr(self);
225
+ struct BloomFilter *target = bf_ptr(other);
223
226
  int i;
224
- Data_Get_Struct(self, struct BloomFilter, bf);
225
- Data_Get_Struct(other, struct BloomFilter, target);
226
227
  for (i = 0; i < bf->bytes; i++) {
227
228
  bf->ptr[i] |= target->ptr[i];
228
229
  }
@@ -230,19 +231,17 @@ static VALUE bf_merge(VALUE self, VALUE other) {
230
231
  }
231
232
 
232
233
  static VALUE bf_and(VALUE self, VALUE other) {
233
- struct BloomFilter *bf, *bf_other, *target;
234
+ struct BloomFilter *bf = bf_ptr(self);
235
+ struct BloomFilter *bf_other = bf_ptr(other);
236
+ struct BloomFilter *target;
234
237
  VALUE klass, obj, args[5];
235
238
  int i;
236
239
 
237
- Data_Get_Struct(self, struct BloomFilter, bf);
238
- Data_Get_Struct(other, struct BloomFilter, bf_other);
239
240
  args[0] = INT2FIX(bf->m);
240
241
  args[1] = INT2FIX(bf->k);
241
- args[2] = INT2FIX(bf->b);
242
- args[3] = INT2FIX(bf->r);
243
242
  klass = rb_funcall(self,rb_intern("class"),0);
244
- obj = bf_s_new(4,args,klass);
245
- Data_Get_Struct(obj, struct BloomFilter, target);
243
+ obj = rb_class_new_instance(2, args, klass);
244
+ target = bf_ptr(obj);
246
245
  for (i = 0; i < bf->bytes; i++){
247
246
  target->ptr[i] = bf->ptr[i] & bf_other->ptr[i];
248
247
  }
@@ -251,19 +250,17 @@ static VALUE bf_and(VALUE self, VALUE other) {
251
250
  }
252
251
 
253
252
  static VALUE bf_or(VALUE self, VALUE other) {
254
- struct BloomFilter *bf, *bf_other, *target;
253
+ struct BloomFilter *bf = bf_ptr(self);
254
+ struct BloomFilter *bf_other = bf_ptr(other);
255
+ struct BloomFilter *target;
255
256
  VALUE klass, obj, args[5];
256
257
  int i;
257
258
 
258
- Data_Get_Struct(self, struct BloomFilter, bf);
259
- Data_Get_Struct(other, struct BloomFilter, bf_other);
260
259
  args[0] = INT2FIX(bf->m);
261
260
  args[1] = INT2FIX(bf->k);
262
- args[2] = INT2FIX(bf->b);
263
- args[3] = INT2FIX(bf->r);
264
261
  klass = rb_funcall(self,rb_intern("class"),0);
265
- obj = bf_s_new(4,args,klass);
266
- Data_Get_Struct(obj, struct BloomFilter, target);
262
+ obj = rb_class_new_instance(2, args, klass);
263
+ target = bf_ptr(obj);
267
264
  for (i = 0; i < bf->bytes; i++){
268
265
  target->ptr[i] = bf->ptr[i] | bf_other->ptr[i];
269
266
  }
@@ -271,86 +268,59 @@ static VALUE bf_or(VALUE self, VALUE other) {
271
268
  return obj;
272
269
  }
273
270
 
274
- static VALUE bf_delete(VALUE self, VALUE key) {
275
- unsigned long hash, index;
276
- int i, len, m, k;
277
- char *ckey;
278
- VALUE skey;
279
- struct BloomFilter *bf;
280
- Data_Get_Struct(self, struct BloomFilter, bf);
281
-
282
- skey = rb_obj_as_string(key);
283
- ckey = StringValuePtr(skey);
284
- len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
285
-
286
- m = bf->m;
287
- k = bf->k;
288
-
289
- hash = (unsigned long) djb2(ckey, len);
290
- for (i = 0; i <= k - 1; i++) {
291
- index = (unsigned long) (hash ^ seeds[i]) % (unsigned int) (m);
292
-
293
- /* set a bit at the index */
294
- bucket_unset(bf, index);
295
- }
296
-
297
- return Qnil;
298
- }
299
-
300
-
301
271
  static VALUE bf_include(int argc, VALUE* argv, VALUE self) {
302
- unsigned long hash, index;
303
- int i, len, m, k, tests_idx, vlen;
272
+ unsigned long hash;
273
+ int i, len, m, k;
274
+ int index;
275
+ long tests_idx, vlen;
304
276
  char *ckey;
305
277
  VALUE tests, key, skey;
306
278
  struct BloomFilter *bf;
307
279
 
308
280
  rb_scan_args(argc, argv, "*", &tests);
309
281
 
310
- Data_Get_Struct(self, struct BloomFilter, bf);
282
+ bf = bf_ptr(self);
311
283
  vlen = RARRAY_LEN(tests);
312
- for(tests_idx = 0; tests_idx < vlen; tests_idx++) {
313
- key = rb_ary_entry(tests, tests_idx);
314
- skey = rb_obj_as_string(key);
315
- ckey = StringValuePtr(skey);
316
- len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
317
-
318
- m = bf->m;
319
- k = bf->k;
320
-
321
- hash = (unsigned long) djb2(ckey, len);
322
- for (i = 0; i <= k - 1; i++) {
323
- index = (unsigned long) (hash ^ seeds[i]) % (unsigned int) (m);
324
-
325
- /* check the bit at the index */
326
- if (!bucket_check(bf, index)) {
327
- return Qfalse; /* i.e., it is a new entry ; escape the loop */
328
- }
329
- }
330
-
331
- return Qtrue;
284
+ for (tests_idx = 0; tests_idx < vlen; tests_idx++) {
285
+ key = rb_ary_entry(tests, tests_idx);
286
+ skey = rb_obj_as_string(key);
287
+ ckey = StringValuePtr(skey);
288
+ len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
289
+
290
+ m = bf->m;
291
+ k = bf->k;
292
+
293
+ hash = (unsigned long) djb2(ckey, len);
294
+ for (i = 0; i <= k - 1; i++) {
295
+ index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
296
+
297
+ /* check the bit at the index */
298
+ if (!bucket_check(bf, index)) {
299
+ return Qfalse; /* i.e., it is a new entry ; escape the loop */
300
+ }
301
+ }
332
302
  }
303
+
304
+ return Qtrue;
333
305
  }
334
306
 
335
307
  static VALUE bf_to_s(VALUE self) {
336
- struct BloomFilter *bf;
308
+ struct BloomFilter *bf = bf_ptr(self);
337
309
  unsigned char *ptr;
338
310
  int i;
339
311
  VALUE str;
340
312
 
341
- Data_Get_Struct(self, struct BloomFilter, bf);
342
313
  str = rb_str_new(0, bf->m);
343
314
 
344
315
  ptr = (unsigned char *) RSTRING_PTR(str);
345
316
  for (i = 0; i < bf->m; i++)
346
- *ptr++ = bucket_get(bf, i) ? '1' : '0';
317
+ *ptr++ = bucket_check(bf, i) ? '1' : '0';
347
318
 
348
319
  return str;
349
320
  }
350
321
 
351
322
  static VALUE bf_bitmap(VALUE self) {
352
- struct BloomFilter *bf;
353
- Data_Get_Struct(self, struct BloomFilter, bf);
323
+ struct BloomFilter *bf = bf_ptr(self);
354
324
 
355
325
  VALUE str = rb_str_new(0, bf->bytes);
356
326
  unsigned char* ptr = (unsigned char *) RSTRING_PTR(str);
@@ -361,8 +331,7 @@ static VALUE bf_bitmap(VALUE self) {
361
331
  }
362
332
 
363
333
  static VALUE bf_load(VALUE self, VALUE bitmap) {
364
- struct BloomFilter *bf;
365
- Data_Get_Struct(self, struct BloomFilter, bf);
334
+ struct BloomFilter *bf = bf_ptr(self);
366
335
  unsigned char* ptr = (unsigned char *) RSTRING_PTR(bitmap);
367
336
 
368
337
  memcpy(bf->ptr, ptr, bf->bytes);
@@ -372,15 +341,13 @@ static VALUE bf_load(VALUE self, VALUE bitmap) {
372
341
 
373
342
  void Init_cbloomfilter(void) {
374
343
  cBloomFilter = rb_define_class("CBloomFilter", rb_cObject);
375
- rb_define_singleton_method(cBloomFilter, "new", bf_s_new, -1);
344
+ rb_define_alloc_func(cBloomFilter, bf_alloc);
345
+ rb_define_method(cBloomFilter, "initialize", bf_initialize, -1);
376
346
  rb_define_method(cBloomFilter, "m", bf_m, 0);
377
347
  rb_define_method(cBloomFilter, "k", bf_k, 0);
378
- rb_define_method(cBloomFilter, "b", bf_b, 0);
379
- rb_define_method(cBloomFilter, "r", bf_r, 0);
380
348
  rb_define_method(cBloomFilter, "set_bits", bf_set_bits, 0);
381
349
  /* rb_define_method(cBloomFilter, "s", bf_s, 0); */
382
350
  rb_define_method(cBloomFilter, "insert", bf_insert, 1);
383
- rb_define_method(cBloomFilter, "delete", bf_delete, 1);
384
351
  rb_define_method(cBloomFilter, "include?", bf_include, -1);
385
352
  rb_define_method(cBloomFilter, "clear", bf_clear, 0);
386
353
  rb_define_method(cBloomFilter, "merge!", bf_merge, 1);
@@ -392,6 +359,5 @@ void Init_cbloomfilter(void) {
392
359
  rb_define_method(cBloomFilter, "load", bf_load, 1);
393
360
 
394
361
  /* functions that have not been implemented, yet */
395
-
396
362
  // rb_define_method(cBloomFilter, "<=>", bf_cmp, 1);
397
363
  }
@@ -1,4 +1,3 @@
1
- #!/usr/bin/env ruby
2
1
  require "mkmf"
3
2
 
4
3
  create_makefile("cbloomfilter")
@@ -1,3 +1,3 @@
1
1
  class BloomFit
2
- VERSION = "0.1.1".freeze
2
+ VERSION = "0.2.0".freeze
3
3
  end
data/lib/bloom_fit.rb CHANGED
@@ -1,42 +1,35 @@
1
- require 'cbloomfilter'
2
- require 'bloom_fit/version'
1
+ require "cbloomfilter"
2
+ require "bloom_fit/version"
3
3
 
4
4
  class BloomFit
5
- BloomFit::ConfigurationMismatch = Class.new(ArgumentError)
5
+ class ConfigurationMismatch < ArgumentError
6
+ end
6
7
 
7
8
  attr_reader :bf
8
9
 
9
- def initialize(opts = {})
10
- @opts = {
11
- :size => 100,
12
- :hashes => 4,
13
- :bucket => 1,
14
- :raise => false
15
- }.merge(opts)
10
+ def initialize(size: 1_000, hashes: 4)
11
+ @size = size
12
+ @hashes = hashes
16
13
 
17
14
  # arg 1: m => size : number of buckets in a bloom filter
18
15
  # arg 2: k => hashes : number of hash functions
19
- # arg 3: b => bucket : number of bits per bucket
20
- # arg 4: r => raise : whether to raise on bucket overflow
21
-
22
- @bf = CBloomFilter.new(@opts[:size], @opts[:hashes], @opts[:bucket], @opts[:raise])
16
+ @bf = CBloomFilter.new(@size, @hashes)
23
17
  end
24
18
 
25
19
  def insert(key)
26
20
  @bf.insert(key)
27
21
  end
28
- alias :[]= :insert
22
+ alias []= insert
29
23
 
30
24
  def include?(*keys)
31
25
  @bf.include?(*keys)
32
26
  end
33
- alias :key? :include?
34
- alias :[] :include?
27
+ alias key? include?
28
+ alias [] include?
35
29
 
36
- def delete(key); @bf.delete(key); end
37
- def clear; @bf.clear; end
38
- def size; @bf.set_bits; end
39
- def merge!(o); @bf.merge!(o.bf); end
30
+ def clear = @bf.clear
31
+ def size = @bf.set_bits
32
+ def merge!(other) = @bf.merge!(other.bf)
40
33
 
41
34
  # Returns the number of bits that are set to 1 in the filter.
42
35
  def set_bits
@@ -46,20 +39,20 @@ class BloomFit
46
39
  # Computes the intersection of two Bloom filters.
47
40
  # It assumes that both filters have the same size -
48
41
  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
49
- def &(o)
50
- raise BloomFit::ConfigurationMismatch.new unless same_parameters?(o)
42
+ def &(other)
43
+ raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
51
44
  result = self.class.new
52
- result.instance_variable_set(:@bf,@bf.&(o.bf))
45
+ result.instance_variable_set(:@bf, @bf.&(other.bf))
53
46
  result
54
47
  end
55
48
 
56
49
  # Computes the union of two Bloom filters.
57
50
  # It assumes that both filters have the same size -
58
51
  # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
59
- def |(o)
60
- raise BloomFit::ConfigurationMismatch.new unless same_parameters?(o)
52
+ def |(other)
53
+ raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
61
54
  result = self.class.new
62
- result.instance_variable_set(:@bf,@bf.|(o.bf))
55
+ result.instance_variable_set(:@bf, @bf.|(other.bf))
63
56
  result
64
57
  end
65
58
 
@@ -68,40 +61,39 @@ class BloomFit
68
61
  end
69
62
 
70
63
  def marshal_load(ary)
71
- opts, bitmap = *ary
64
+ size, hashes, bitmap = *ary
72
65
 
73
- initialize(opts)
74
- @bf.load(bitmap) if !bitmap.nil?
66
+ initialize(size:, hashes:)
67
+ @bf.load(bitmap) if bitmap
75
68
  end
76
69
 
77
70
  def marshal_dump
78
- [@opts, @bf.bitmap]
71
+ [@size, @hashes, @bf.bitmap]
79
72
  end
80
73
 
81
74
  def self.load(filename)
82
- Marshal.load(File.open(filename, 'r'))
75
+ Marshal.load(File.open(filename, "r"))
83
76
  end
84
77
 
85
78
  def save(filename)
86
- File.open(filename, 'w') do |f|
79
+ File.open(filename, "w") do |f|
87
80
  f << Marshal.dump(self)
88
81
  end
89
82
  end
90
83
 
91
84
  def stats
92
- fp = ((1.0 - Math.exp(-(@opts[:hashes] * size).to_f / @opts[:size])) ** @opts[:hashes]) * 100
93
- printf "Number of filter buckets (m): %d\n", @opts[:size]
94
- printf "Number of bits per buckets (b): %d\n", @opts[:bucket]
85
+ fp = ((1.0 - Math.exp(-(@hashes * size).to_f / @size))**@hashes) * 100
86
+ printf "Number of filter buckets (m): %d\n", @size
95
87
  printf "Number of set bits (n): %d\n", set_bits
96
- printf "Number of filter hashes (k) : %d\n", @opts[:hashes]
88
+ printf "Number of filter hashes (k) : %d\n", @hashes
97
89
  printf "Predicted false positive rate = %.2f%%\n", fp
98
90
  end
99
91
 
100
92
  protected
101
93
 
102
- # Returns true if parameters of the +o+ther filter are
94
+ # Returns true if parameters of the +other+ filter are
103
95
  # the same.
104
- def same_parameters?(o)
105
- @bf.m == o.bf.m && @bf.k == o.bf.k && @bf.b == o.bf.b
96
+ def same_parameters?(other)
97
+ bf.m == other.bf.m && bf.k == other.bf.k
106
98
  end
107
99
  end
Binary file
@@ -1,17 +1,17 @@
1
1
  require "helper"
2
2
 
3
3
  describe BloomFit do
4
- it "should clear" do
5
- bf = BloomFit.new(:size => 100, :hashes => 2, :bucket => 3, :raise => false)
4
+ it "clears" do
5
+ bf = BloomFit.new(size: 100, hashes: 2)
6
6
  bf.insert("test")
7
7
  expect(bf.include?("test")).to be true
8
8
  bf.clear
9
9
  expect(bf.include?("test")).to be false
10
10
  end
11
11
 
12
- it "should merge" do
13
- bf1 = BloomFit.new(:size => 100, :hashes => 2, :bucket => 3, :raise => false)
14
- bf2 = BloomFit.new(:size => 100, :hashes => 2, :bucket => 3, :raise => false)
12
+ it "merges" do
13
+ bf1 = BloomFit.new(size: 100, hashes: 2)
14
+ bf2 = BloomFit.new(size: 100, hashes: 2)
15
15
  bf2.insert("test")
16
16
  expect(bf1.include?("test")).to be false
17
17
  bf1.merge!(bf2)
@@ -19,134 +19,111 @@ describe BloomFit do
19
19
  expect(bf2.include?("test")).to be true
20
20
  end
21
21
 
22
- context "behave like a bloom filter" do
23
- it "should test set membership" do
24
- bf = BloomFit.new(:size => 100, :hashes => 2, :bucket => 3, :raise => false)
25
- bf.insert("test")
26
- bf.insert("test1")
27
-
28
- expect(bf.include?("test")).to be true
29
- expect(bf.include?("abcd")).to be false
30
- expect(bf.include?("test", "test1")).to be true
31
- end
22
+ it "tests set membership" do
23
+ bf = BloomFit.new(size: 100, hashes: 2)
24
+ bf.insert("test")
25
+ bf.insert("test1")
32
26
 
33
- it "should work with any object's to_s" do
34
- subject.insert(:test)
35
- subject.insert(:test1)
36
- subject.insert(12345)
27
+ expect(bf.include?("test")).to be true
28
+ expect(bf.include?("abcd")).to be false
29
+ expect(bf.include?("test", "test1")).to be true
30
+ expect(bf.include?("test1", "abcd")).to be false
31
+ end
37
32
 
38
- expect(subject.include?("test")).to be true
39
- expect(subject.include?("abcd")).to be false
40
- expect(subject.include?("test", "test1", '12345')).to be true
41
- end
33
+ it "works with any object's to_s" do
34
+ subject.insert(:test)
35
+ subject.insert(:test1)
36
+ subject.insert(12_345)
42
37
 
43
- it "should return the number of bits set to 1" do
44
- bf = BloomFit.new(:hashes => 4)
45
- bf.insert("test")
46
- expect(bf.set_bits).to be == 4
47
- bf.delete("test")
48
- expect(bf.set_bits).to be == 0
38
+ expect(subject.include?("test")).to be true
39
+ expect(subject.include?("abcd")).to be false
40
+ expect(subject.include?("12345")).to be true
41
+ end
49
42
 
50
- bf = BloomFit.new(:hashes => 1)
51
- bf.insert("test")
52
- expect(bf.set_bits).to be == 1
53
- end
43
+ it "returns the number of bits set to 1" do
44
+ bf = BloomFit.new(hashes: 4)
45
+ bf.insert("test")
46
+ expect(bf.set_bits).to eq 4
54
47
 
55
- it "should return intersection with other filter" do
56
- bf1 = BloomFit.new
57
- bf1.insert("test")
58
- bf1.insert("test1")
48
+ bf = BloomFit.new(hashes: 1)
49
+ bf.insert("test")
50
+ expect(bf.set_bits).to eq 1
51
+ end
59
52
 
60
- bf2 = BloomFit.new
61
- bf2.insert("test")
62
- bf2.insert("test2")
53
+ it "returns intersection with other filter" do
54
+ bf1 = BloomFit.new
55
+ bf1.insert("test")
56
+ bf1.insert("test1")
63
57
 
64
- bf3 = bf1 & bf2
65
- expect(bf3.include?("test")).to be true
66
- expect(bf3.include?("test1")).to be false
67
- expect(bf3.include?("test2")).to be false
68
- end
58
+ bf2 = BloomFit.new
59
+ bf2.insert("test")
60
+ bf2.insert("test2")
69
61
 
70
- it "should raise an exception when intersection is to be computed for incompatible filters" do
71
- bf1 = BloomFit.new(:size => 10)
72
- bf1.insert("test")
62
+ bf3 = bf1 & bf2
63
+ expect(bf3.include?("test")).to be true
64
+ expect(bf3.include?("test1")).to be false
65
+ expect(bf3.include?("test2")).to be false
66
+ end
73
67
 
74
- bf2 = BloomFit.new(:size => 20)
75
- bf2.insert("test")
68
+ it "raises an exception when intersection is to be computed for incompatible filters" do
69
+ bf1 = BloomFit.new(size: 10)
70
+ bf1.insert("test")
76
71
 
77
- expect { bf1 & bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
78
- end
72
+ bf2 = BloomFit.new(size: 20)
73
+ bf2.insert("test")
79
74
 
80
- it "should return union with other filter" do
81
- bf1 = BloomFit.new
82
- bf1.insert("test")
83
- bf1.insert("test1")
75
+ expect { bf1 & bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
76
+ end
84
77
 
85
- bf2 = BloomFit.new
86
- bf2.insert("test")
87
- bf2.insert("test2")
78
+ it "returns union with other filter" do
79
+ bf1 = BloomFit.new
80
+ bf1.insert("test")
81
+ bf1.insert("test1")
88
82
 
89
- bf3 = bf1 | bf2
90
- expect(bf3.include?("test")).to be true
91
- expect(bf3.include?("test1")).to be true
92
- expect(bf3.include?("test2")).to be true
93
- end
83
+ bf2 = BloomFit.new
84
+ bf2.insert("test")
85
+ bf2.insert("test2")
94
86
 
95
- it "should raise an exception when union is to be computed for incompatible filters" do
96
- bf1 = BloomFit.new(:size => 10)
97
- bf1.insert("test")
87
+ bf3 = bf1 | bf2
88
+ expect(bf3.include?("test")).to be true
89
+ expect(bf3.include?("test1")).to be true
90
+ expect(bf3.include?("test2")).to be true
91
+ end
98
92
 
99
- bf2 = BloomFit.new(:size => 20)
100
- bf2.insert("test")
93
+ it "raises an exception when union is to be computed for incompatible filters" do
94
+ bf1 = BloomFit.new(size: 10)
95
+ bf1.insert("test")
101
96
 
102
- expect {bf1 | bf2}.to raise_error(BloomFit::ConfigurationMismatch)
103
- end
97
+ bf2 = BloomFit.new(size: 20)
98
+ bf2.insert("test")
104
99
 
105
- it "should output current stats" do
106
- subject.insert('test')
107
- expect { subject.stats }.not_to raise_error
108
- end
100
+ expect { bf1 | bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
109
101
  end
110
102
 
111
- context "behave like counting bloom filter" do
112
- it "should delete / decrement keys" do
113
- subject.insert("test")
114
- expect(subject.include?("test")).to be true
115
-
116
- subject.delete("test")
117
- expect(subject.include?("test")).to be false
118
- end
103
+ it "outputs current stats" do
104
+ subject.insert("test")
105
+ expect { subject.stats }.not_to raise_error
119
106
  end
120
107
 
121
- context "serialize" do
122
- after(:each) { File.unlink('bf.out') }
108
+ context "serialization" do
109
+ after { File.unlink("bf.out") }
123
110
 
124
- it "should marshall" do
111
+ it "marshalls" do
125
112
  bf = BloomFit.new
126
- expect { bf.save('bf.out') }.not_to raise_error
113
+ expect { bf.save("bf.out") }.not_to raise_error
127
114
  end
128
115
 
129
- it "should load from marshalled" do
130
- subject.insert('foo')
131
- subject.insert('bar')
132
- subject.save('bf.out')
116
+ it "loads from marshalled" do
117
+ subject.insert("foo")
118
+ subject.insert("bar")
119
+ subject.save("bf.out")
133
120
 
134
- bf2 = BloomFit.load('bf.out')
135
- expect(bf2.include?('foo')).to be true
136
- expect(bf2.include?('bar')).to be true
137
- expect(bf2.include?('baz')).to be false
121
+ bf2 = BloomFit.load("bf.out")
122
+ expect(bf2.include?("foo")).to be true
123
+ expect(bf2.include?("bar")).to be true
124
+ expect(bf2.include?("baz")).to be false
138
125
 
139
126
  expect(subject.send(:same_parameters?, bf2)).to be true
140
127
  end
141
-
142
- it "should serialize to a file size proporational its bucket size" do
143
- fs_size = 0
144
- 8.times do |i|
145
- bf = BloomFit.new(size: 10_000, bucket: i+1)
146
- bf.save('bf.out')
147
- prev_size, fs_size = fs_size, File.size('bf.out')
148
- expect(prev_size).to be < fs_size
149
- end
150
- end
151
128
  end
152
129
  end
data/spec/helper.rb CHANGED
@@ -1,2 +1 @@
1
- require "bundler/setup"
2
1
  require "bloom_fit"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bloom_fit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ilya Grigorik
@@ -11,63 +11,7 @@ authors:
11
11
  bindir: bin
12
12
  cert_chain: []
13
13
  date: 1980-01-02 00:00:00.000000000 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
16
- name: irb
17
- requirement: !ruby/object:Gem::Requirement
18
- requirements:
19
- - - ">="
20
- - !ruby/object:Gem::Version
21
- version: '0'
22
- type: :development
23
- prerelease: false
24
- version_requirements: !ruby/object:Gem::Requirement
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- version: '0'
29
- - !ruby/object:Gem::Dependency
30
- name: rake
31
- requirement: !ruby/object:Gem::Requirement
32
- requirements:
33
- - - ">="
34
- - !ruby/object:Gem::Version
35
- version: '0'
36
- type: :development
37
- prerelease: false
38
- version_requirements: !ruby/object:Gem::Requirement
39
- requirements:
40
- - - ">="
41
- - !ruby/object:Gem::Version
42
- version: '0'
43
- - !ruby/object:Gem::Dependency
44
- name: rake-compiler
45
- requirement: !ruby/object:Gem::Requirement
46
- requirements:
47
- - - ">="
48
- - !ruby/object:Gem::Version
49
- version: '0'
50
- type: :development
51
- prerelease: false
52
- version_requirements: !ruby/object:Gem::Requirement
53
- requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- version: '0'
57
- - !ruby/object:Gem::Dependency
58
- name: rspec
59
- requirement: !ruby/object:Gem::Requirement
60
- requirements:
61
- - - ">="
62
- - !ruby/object:Gem::Version
63
- version: '3'
64
- type: :development
65
- prerelease: false
66
- version_requirements: !ruby/object:Gem::Requirement
67
- requirements:
68
- - - ">="
69
- - !ruby/object:Gem::Version
70
- version: '3'
14
+ dependencies: []
71
15
  email:
72
16
  - ilya@grigorik.com
73
17
  - valdzone@gmail.com
@@ -79,12 +23,12 @@ extensions:
79
23
  extra_rdoc_files: []
80
24
  files:
81
25
  - README.md
82
- - Rakefile
83
26
  - ext/cbloomfilter/cbloomfilter.c
84
27
  - ext/cbloomfilter/crc32.h
85
28
  - ext/cbloomfilter/extconf.rb
86
29
  - lib/bloom_fit.rb
87
30
  - lib/bloom_fit/version.rb
31
+ - lib/cbloomfilter.bundle
88
32
  - spec/bloom_fit_spec.rb
89
33
  - spec/helper.rb
90
34
  homepage: https://github.com/rmm5t/bloom_fit
@@ -103,7 +47,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
103
47
  requirements:
104
48
  - - ">="
105
49
  - !ruby/object:Gem::Version
106
- version: '0'
50
+ version: 3.2.0
107
51
  required_rubygems_version: !ruby/object:Gem::Requirement
108
52
  requirements:
109
53
  - - ">="
@@ -114,6 +58,4 @@ rubygems_version: 4.0.9
114
58
  specification_version: 4
115
59
  summary: BloomFit helps you build correctly sized Bloom filters from expected set
116
60
  size and target false positive rate.
117
- test_files:
118
- - spec/bloom_fit_spec.rb
119
- - spec/helper.rb
61
+ test_files: []
data/Rakefile DELETED
@@ -1,12 +0,0 @@
1
- require "bundler/gem_tasks"
2
- require "bundler/setup"
3
- require "rspec/core/rake_task"
4
- require "rake/extensiontask"
5
-
6
- Rake::ExtensionTask.new("cbloomfilter")
7
- RSpec::Core::RakeTask.new(:spec)
8
- Rake::Task[:spec].prerequisites << :clean
9
- Rake::Task[:spec].prerequisites << :compile
10
-
11
- desc "Default: run unit tests."
12
- task default: :spec