bloom_fit 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 54da887424b56d9c09e4d351125c22873bc24be3e32e96cf3716d044a0864957
4
- data.tar.gz: 50780ab65355bc42c075586888f4f09ee6ce6849b16c01264d83887dc83f71a3
3
+ metadata.gz: ed19ba044e45497c9026b8227e77c48cd62aea3043f698c6aca4955eb734f17e
4
+ data.tar.gz: e712cf58a3b6b11e38733da4437c95fbd94a7e9b07eeb4e72945138a140d730f
5
5
  SHA512:
6
- metadata.gz: 53511030706f900e42050938ff80eaaaa5290c609dcd40e6b809bed6c6d491fe63bc57d4d2c1e494c0081642f85e6e29e8c5bc46cbe9cc342d8700990d910043
7
- data.tar.gz: f5da69e7acebde88b41649f6dfac9925e4f021c5fb7687f442a0b61b78efd1c30423f013851c2982048ef2f3c374b0e583cca7f92352475a31ca5cddfb67fd46
6
+ metadata.gz: 69b2b91fdf8e3995931507a53b13c6923e225faef01f6cf39c3524e9ad2e63673411452719fc93509aaa830b42f4fa45198cf39f0b3f70cd33f60846116f5430
7
+ data.tar.gz: e33e427c4bd6ca79d818887dbca0d80348a868fdea85930197e92e087c63adb8a3d339b74d420a894ed55c731e05fedfeb9875a57e5f32698ee342c2836a1ebc
data/README.md CHANGED
@@ -185,7 +185,7 @@ filter = BloomFit.new(capacity: 100)
185
185
  filter.merge(%w[cat dog bird])
186
186
  ```
187
187
 
188
- Filters can only be combined when they have the same `size` and `hashes`. Otherwise BloomFit raises `BloomFit::ConfigurationMismatch`.
188
+ Filters can only be combined when they have the same `size` and `hashes`. Otherwise BloomFit raises `ArgumentError`.
189
189
 
190
190
  When you create filters with automatic sizing, use the same `capacity` and `false_positive_rate` for filters you plan to merge, union, or intersect.
191
191
 
@@ -4,15 +4,15 @@
4
4
  */
5
5
 
6
6
  #include "ruby.h"
7
- #include "crc32.h"
7
+ #include <limits.h>
8
+ #include "salts.h"
8
9
 
9
10
  #if !defined(RSTRING_LEN)
10
11
  # define RSTRING_LEN(x) (RSTRING(x)->len)
11
12
  # define RSTRING_PTR(x) (RSTRING(x)->ptr)
12
13
  #endif
13
14
 
14
- /* Reuse the standard CRC table for consistent salts */
15
- static unsigned int *salts = crc_table;
15
+ static const int salts_length = sizeof(salts) / sizeof(salts[0]);
16
16
 
17
17
  static VALUE cBloomFilter;
18
18
 
@@ -26,7 +26,7 @@ struct BloomFilter {
26
26
  unsigned long djb2(const char *str, int len) {
27
27
  unsigned long hash = 5381;
28
28
  for (int i = 0; i < len; i++) {
29
- hash = ((hash << 5) + hash) + str[i];
29
+ hash = ((hash << 5) + hash) + (unsigned char) str[i];
30
30
  }
31
31
  return hash;
32
32
  }
@@ -92,13 +92,40 @@ static int bucket_check(struct BloomFilter *bf, int index) {
92
92
  return (bf->ptr[byte_offset] >> bit_offset) & 1;
93
93
  }
94
94
 
95
+ static void bf_ensure_compatible(struct BloomFilter *bf, struct BloomFilter *other) {
96
+ if (bf->m != other->m || bf->k != other->k || bf->bytes != other->bytes) {
97
+ rb_raise(rb_eArgError, "bloom filters must have matching size and hash count");
98
+ }
99
+ }
100
+
101
+ static void bf_clear_padding_bits(struct BloomFilter *bf) {
102
+ int full_bytes = bf->m / 8;
103
+ int remaining_bits = bf->m % 8;
104
+ int i;
105
+
106
+ if (remaining_bits > 0) {
107
+ unsigned char mask = (unsigned char) ((1U << remaining_bits) - 1U);
108
+ bf->ptr[full_bytes] &= mask;
109
+ full_bytes += 1;
110
+ }
111
+
112
+ for (i = full_bytes; i < bf->bytes; i++) {
113
+ bf->ptr[i] = 0;
114
+ }
115
+ }
116
+
95
117
  static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
96
118
  struct BloomFilter *bf;
97
119
  VALUE arg1, arg2;
120
+ long m_value, k_value;
98
121
  int m, k;
99
122
 
100
123
  bf = bf_ptr(self);
101
124
 
125
+ if (argc > 2) {
126
+ rb_error_arity(argc, 0, 2);
127
+ }
128
+
102
129
  /* defaults */
103
130
  arg1 = INT2FIX(1000);
104
131
  arg2 = INT2FIX(4);
@@ -111,13 +138,23 @@ static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
111
138
  break;
112
139
  }
113
140
 
114
- m = FIX2INT(arg1);
115
- k = FIX2INT(arg2);
141
+ m_value = NUM2LONG(arg1);
142
+ k_value = NUM2LONG(arg2);
143
+
144
+ if (m_value > INT_MAX - 15)
145
+ rb_raise(rb_eRangeError, "bit length is too large");
146
+ if (k_value > INT_MAX)
147
+ rb_raise(rb_eRangeError, "hash length is too large");
148
+
149
+ m = (int) m_value;
150
+ k = (int) k_value;
116
151
 
117
152
  if (m < 1)
118
- rb_raise(rb_eArgError, "array size");
153
+ rb_raise(rb_eArgError, "bit length must be >= 1");
119
154
  if (k < 1)
120
- rb_raise(rb_eArgError, "hash length");
155
+ rb_raise(rb_eArgError, "hash length must be >= 1");
156
+ if (k > salts_length)
157
+ rb_raise(rb_eArgError, "hash length must be <= %d", salts_length);
121
158
 
122
159
  bf->m = m;
123
160
  bf->k = k;
@@ -131,7 +168,6 @@ static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
131
168
 
132
169
  /* initialize the bits with zeros */
133
170
  memset(bf->ptr, 0, bf->bytes);
134
- rb_iv_set(self, "@hash_value", rb_hash_new());
135
171
 
136
172
  return self;
137
173
  }
@@ -154,12 +190,18 @@ static VALUE bf_k(VALUE self) {
154
190
 
155
191
  static VALUE bf_set_bits(VALUE self){
156
192
  struct BloomFilter *bf = bf_ptr(self);
157
- int i,j,count = 0;
193
+ int i, count = 0;
194
+
158
195
  for (i = 0; i < bf->bytes; i++) {
159
- for (j = 0; j < 8; j++) {
160
- count += (bf->ptr[i] >> j) & 1;
196
+ unsigned char byte = bf->ptr[i];
197
+
198
+ /* Brian Kernighan’s bit-count loop */
199
+ while (byte != 0) {
200
+ byte &= (unsigned char) (byte - 1);
201
+ count++;
161
202
  }
162
203
  }
204
+
163
205
  return INT2FIX(count);
164
206
  }
165
207
 
@@ -193,6 +235,9 @@ static VALUE bf_merge(VALUE self, VALUE other) {
193
235
  struct BloomFilter *bf = bf_ptr(self);
194
236
  struct BloomFilter *target = bf_ptr(other);
195
237
  int i;
238
+
239
+ bf_ensure_compatible(bf, target);
240
+
196
241
  for (i = 0; i < bf->bytes; i++) {
197
242
  bf->ptr[i] |= target->ptr[i];
198
243
  }
@@ -206,6 +251,8 @@ static VALUE bf_and(VALUE self, VALUE other) {
206
251
  VALUE klass, obj, args[5];
207
252
  int i;
208
253
 
254
+ bf_ensure_compatible(bf, bf_other);
255
+
209
256
  args[0] = INT2FIX(bf->m);
210
257
  args[1] = INT2FIX(bf->k);
211
258
  klass = rb_funcall(self,rb_intern("class"),0);
@@ -225,6 +272,8 @@ static VALUE bf_or(VALUE self, VALUE other) {
225
272
  VALUE klass, obj, args[5];
226
273
  int i;
227
274
 
275
+ bf_ensure_compatible(bf, bf_other);
276
+
228
277
  args[0] = INT2FIX(bf->m);
229
278
  args[1] = INT2FIX(bf->k);
230
279
  klass = rb_funcall(self,rb_intern("class"),0);
@@ -278,9 +327,17 @@ static VALUE bf_bitmap(VALUE self) {
278
327
 
279
328
  static VALUE bf_load(VALUE self, VALUE bitmap) {
280
329
  struct BloomFilter *bf = bf_ptr(self);
281
- unsigned char* ptr = (unsigned char *) RSTRING_PTR(bitmap);
330
+ VALUE bitmap_string = StringValue(bitmap);
331
+ unsigned char* ptr;
332
+
333
+ if (RSTRING_LEN(bitmap_string) != bf->bytes) {
334
+ rb_raise(rb_eArgError, "bitmap length must be %d bytes", bf->bytes);
335
+ }
336
+
337
+ ptr = (unsigned char *) RSTRING_PTR(bitmap_string);
282
338
 
283
339
  memcpy(bf->ptr, ptr, bf->bytes);
340
+ bf_clear_padding_bits(bf);
284
341
 
285
342
  return Qnil;
286
343
  }
@@ -0,0 +1,50 @@
1
+ /*
2
+ * Borrowed from the CRC table
3
+ * https://www.mrob.com/pub/comp/crc-all.html
4
+ *
5
+ */
6
+ static unsigned int salts[] = {
7
+ 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, 0x706af48fUL,
8
+ 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, 0xe0d5e91eUL, 0x97d2d988UL,
9
+ 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL,
10
+ 0xf3b97148UL, 0x84be41deUL, 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL,
11
+ 0x136c9856UL, 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
12
+ 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, 0xa2677172UL,
13
+ 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, 0x35b5a8faUL, 0x42b2986cUL,
14
+ 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL,
15
+ 0x26d930acUL, 0x51de003aUL, 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL,
16
+ 0xcfba9599UL, 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
17
+ 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, 0x01db7106UL,
18
+ 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, 0x9fbfe4a5UL, 0xe8b8d433UL,
19
+ 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL,
20
+ 0x91646c97UL, 0xe6635c01UL, 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL,
21
+ 0x6c0695edUL, 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
22
+ 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, 0xfbd44c65UL,
23
+ 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, 0x4adfa541UL, 0x3dd895d7UL,
24
+ 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL,
25
+ 0x44042d73UL, 0x33031de5UL, 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL,
26
+ 0xbe0b1010UL, 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
27
+ 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, 0x2eb40d81UL,
28
+ 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, 0x03b6e20cUL, 0x74b1d29aUL,
29
+ 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL,
30
+ 0x0d6d6a3eUL, 0x7a6a5aa8UL, 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL,
31
+ 0xf00f9344UL, 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
32
+ 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, 0x67dd4accUL,
33
+ 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, 0xd6d6a3e8UL, 0xa1d1937eUL,
34
+ 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL,
35
+ 0xd80d2bdaUL, 0xaf0a1b4cUL, 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL,
36
+ 0x316e8eefUL, 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
37
+ 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, 0xb2bd0b28UL,
38
+ 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, 0x2cd99e8bUL, 0x5bdeae1dUL,
39
+ 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL,
40
+ 0x72076785UL, 0x05005713UL, 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL,
41
+ 0x92d28e9bUL, 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
42
+ 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, 0x18b74777UL,
43
+ 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, 0x8f659effUL, 0xf862ae69UL,
44
+ 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL,
45
+ 0xa7672661UL, 0xd06016f7UL, 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL,
46
+ 0x40df0b66UL, 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
47
+ 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, 0xcdd70693UL,
48
+ 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, 0x5d681b02UL, 0x2a6f2b94UL,
49
+ 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, 0x2d02ef8dUL
50
+ };
@@ -1,3 +1,3 @@
1
1
  class BloomFit
2
- VERSION = "1.0.0".freeze
2
+ VERSION = "1.1.0".freeze
3
3
  end
data/lib/bloom_fit.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  require "forwardable"
2
2
 
3
3
  require "cbloomfilter"
4
- require "bloom_fit/configuration_mismatch"
5
4
  require "bloom_fit/version"
6
5
 
7
6
  # BloomFit is an in-memory Bloom filter with a small, Set-like API.
@@ -16,7 +15,7 @@ require "bloom_fit/version"
16
15
  # serialized with +save+ and reloaded with +BloomFit.load+.
17
16
  #
18
17
  # Filters can only be combined when they were created with the same +size+ and
19
- # +hashes+ values; otherwise +BloomFit::ConfigurationMismatch+ is raised.
18
+ # +hashes+ values; otherwise the native extension raises +ArgumentError+.
20
19
  #
21
20
  # filter = BloomFit.new(size: 10_000, hashes: 6)
22
21
  # filter.add("cat")
@@ -80,15 +79,11 @@ class BloomFit
80
79
  #
81
80
  # Positive results are probabilistic and may be false positives.
82
81
 
83
- # :method: clear
84
- #
85
- # Clears the filter by resetting all bits to +0+.
86
-
87
82
  # :method: set_bits
88
83
  #
89
84
  # Returns the number of bits currently set to +1+.
90
85
 
91
- def_delegators :@bf, :m, :k, :bitmap, :include?, :clear, :set_bits
86
+ def_delegators :@bf, :m, :k, :bitmap, :include?, :set_bits
92
87
 
93
88
  # Returns the configured filter width.
94
89
  alias size m
@@ -115,6 +110,12 @@ class BloomFit
115
110
  end
116
111
  alias << add
117
112
 
113
+ # Clears the filter by resetting all bits to +0+ and returns +self+.
114
+ def clear
115
+ @bf.clear
116
+ self
117
+ end
118
+
118
119
  # Adds +key+ to the filter when +value+ is truthy.
119
120
  #
120
121
  # This makes BloomFit behave like a write-only membership hash: truthy values
@@ -162,7 +163,6 @@ class BloomFit
162
163
  # This method mutates the receiver and mimics Set#merge.
163
164
  def merge(other)
164
165
  if other.is_a?(BloomFit)
165
- raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
166
166
  @bf.merge(other.bf)
167
167
  elsif other.respond_to?(:each_key)
168
168
  other.each { |k, v| add(k) if v }
@@ -171,17 +171,18 @@ class BloomFit
171
171
  else
172
172
  raise ArgumentError, "value must be enumerable or another BloomFit filter"
173
173
  end
174
+
175
+ self
174
176
  end
175
177
 
176
178
  # Returns a new filter containing the bitwise intersection of two filters.
177
179
  #
178
- # Both filters must have the same +size+ and +hashes+ values or
179
- # +BloomFit::ConfigurationMismatch+ is raised.
180
+ # Both filters must have the same +size+ and +hashes+ values or the native
181
+ # extension raises +ArgumentError+.
180
182
  #
181
183
  # Like all Bloom filter operations, membership checks on the result remain
182
184
  # probabilistic and may still produce false positives.
183
185
  def &(other)
184
- raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
185
186
  self.class.new(size:, hashes:).tap do |result|
186
187
  result.instance_variable_set(:@bf, @bf.&(other.bf))
187
188
  end
@@ -190,12 +191,11 @@ class BloomFit
190
191
 
191
192
  # Returns a new filter containing the bitwise union of two filters.
192
193
  #
193
- # Both filters must have the same +size+ and +hashes+ values or
194
- # +BloomFit::ConfigurationMismatch+ is raised.
194
+ # Both filters must have the same +size+ and +hashes+ values or the native
195
+ # extension raises +ArgumentError+.
195
196
  #
196
197
  # The receiver and +other+ are left unchanged.
197
198
  def |(other)
198
- raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
199
199
  self.class.new(size:, hashes:).tap do |result|
200
200
  result.instance_variable_set(:@bf, @bf.|(other.bf))
201
201
  end
@@ -208,14 +208,14 @@ class BloomFit
208
208
  # bits (+n+), the hash count (+k+), and the predicted false-positive rate
209
209
  # based on the current fill level.
210
210
  def stats
211
- fpr = ((1.0 - Math.exp(-(k * n).to_f / m))**k) * 100
211
+ fpr = ((n.to_f / m)**k) * 100
212
212
 
213
- (+"").tap do |s|
214
- s << format("Number of filter buckets (m): %d\n", m)
215
- s << format("Number of set bits (n): %d\n", n)
216
- s << format("Number of filter hashes (k): %d\n", k)
217
- s << format("Predicted false positive rate: %.2f%%\n", fpr)
218
- end
213
+ format <<~STATS, m, n, k, fpr
214
+ Number of filter buckets (m): %d
215
+ Number of set bits (n): %d
216
+ Number of filter hashes (k): %d
217
+ Predicted false positive rate: %.2f%%
218
+ STATS
219
219
  end
220
220
 
221
221
  # Rebuilds the filter from the serialized data returned by +marshal_dump+.
@@ -238,20 +238,11 @@ class BloomFit
238
238
  # The file is read using Ruby's +Marshal+ format, so it should only be used
239
239
  # with trusted input.
240
240
  def self.load(filename)
241
- Marshal.load(File.open(filename, "r")) # rubocop:disable Security/MarshalLoad
241
+ Marshal.load(File.binread(filename)) # rubocop:disable Security/MarshalLoad
242
242
  end
243
243
 
244
244
  # Writes the filter to +filename+ using Ruby's +Marshal+ format.
245
245
  def save(filename)
246
- File.open(filename, "w") do |f|
247
- f << Marshal.dump(self)
248
- end
249
- end
250
-
251
- protected
252
-
253
- # Returns +true+ when +other+ has the same +size+ and +hashes+ values.
254
- def same_parameters?(other)
255
- bf.m == other.bf.m && bf.k == other.bf.k
246
+ File.binwrite(filename, Marshal.dump(self))
256
247
  end
257
248
  end
Binary file
@@ -124,11 +124,11 @@ class BloomFitTest < Minitest::Spec
124
124
  end
125
125
 
126
126
  describe "#clear" do
127
- it "zeroes the bits" do
127
+ it "zeroes the bits and returns self" do
128
128
  subject.add("test")
129
129
  assert_includes subject, "test"
130
130
  assert_includes subject.to_binary, "1"
131
- subject.clear
131
+ assert_equal subject, subject.clear
132
132
  refute_includes subject, "test"
133
133
  refute_includes subject.to_binary, "1"
134
134
  end
@@ -202,14 +202,14 @@ class BloomFitTest < Minitest::Spec
202
202
  end
203
203
 
204
204
  describe "#merge" do
205
- it "merges another BloomFit filter" do
205
+ it "merges another BloomFit filter and returns self" do
206
206
  bf1 = BloomFit.new(size: 100, hashes: 2)
207
207
  bf2 = BloomFit.new(size: 100, hashes: 2)
208
208
  bf1 << "mouse"
209
209
  bf2 << "cat" << "dog"
210
210
  refute_includes bf1, "cat"
211
211
  refute_includes bf1, "dog"
212
- bf1.merge(bf2)
212
+ assert_equal bf1, bf1.merge(bf2)
213
213
  assert_includes bf1, "mouse"
214
214
  assert_includes bf1, "cat"
215
215
  assert_includes bf1, "dog"
@@ -218,9 +218,9 @@ class BloomFitTest < Minitest::Spec
218
218
  assert_includes bf2, "dog"
219
219
  end
220
220
 
221
- it "merges an array" do
221
+ it "merges an array and returns self" do
222
222
  subject << "mouse"
223
- subject.merge %i[cat dog]
223
+ assert_equal subject, subject.merge(%i[cat dog])
224
224
  assert_includes subject, "mouse"
225
225
  assert_includes subject, "cat"
226
226
  assert_includes subject, "dog"
@@ -247,7 +247,7 @@ class BloomFitTest < Minitest::Spec
247
247
  it "raises when merge is between incompatible filters" do
248
248
  bf1 = BloomFit.new(size: 10)
249
249
  bf2 = BloomFit.new(size: 20)
250
- assert_raises(BloomFit::ConfigurationMismatch) { bf1.merge(bf2) }
250
+ assert_raises(ArgumentError) { bf1.merge(bf2) }
251
251
  end
252
252
  end
253
253
 
@@ -285,11 +285,11 @@ class BloomFitTest < Minitest::Spec
285
285
  it "raises when intersection is between incompatible filters" do
286
286
  bf1 = BloomFit.new(size: 10)
287
287
  bf2 = BloomFit.new(size: 20)
288
- assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
288
+ assert_raises(ArgumentError) { bf1 & bf2 }
289
289
 
290
290
  bf1 = BloomFit.new(size: 10, hashes: 2)
291
291
  bf2 = BloomFit.new(size: 10, hashes: 4)
292
- assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
292
+ assert_raises(ArgumentError) { bf1 & bf2 }
293
293
  end
294
294
  end
295
295
 
@@ -325,7 +325,7 @@ class BloomFitTest < Minitest::Spec
325
325
  it "raises when union is between incompatible filters" do
326
326
  bf1 = BloomFit.new(size: 10)
327
327
  bf2 = BloomFit.new(size: 20)
328
- assert_raises(BloomFit::ConfigurationMismatch) { bf1 | bf2 }
328
+ assert_raises(ArgumentError) { bf1 | bf2 }
329
329
  end
330
330
  end
331
331
 
@@ -340,16 +340,51 @@ class BloomFitTest < Minitest::Spec
340
340
  STATS
341
341
  assert_equal expected, bf.stats
342
342
  end
343
+
344
+ it "estimates false positives from the current fill level" do
345
+ bf = BloomFit.new(size: 10, hashes: 3)
346
+ bf.bf.load("\x07\x00\x00".b)
347
+
348
+ expected = <<~STATS
349
+ Number of filter buckets (m): 10
350
+ Number of set bits (n): 3
351
+ Number of filter hashes (k): 3
352
+ Predicted false positive rate: 2.70%
353
+ STATS
354
+ assert_equal expected, bf.stats
355
+ end
343
356
  end
344
357
 
345
358
  describe "serialization" do
346
- after { File.unlink("bf.out") }
359
+ after { FileUtils.rm_f("bf.out") }
347
360
 
348
361
  it "marshalls" do
349
362
  bf = BloomFit.new
350
363
  assert bf.save("bf.out")
351
364
  end
352
365
 
366
+ it "uses binary file io" do
367
+ dumped = Marshal.dump(subject)
368
+ writer = Minitest::Mock.new
369
+ writer.expect(:call, dumped.bytesize, ["bf.out", dumped])
370
+
371
+ reader = Minitest::Mock.new
372
+ reader.expect(:call, dumped, ["bf.out"])
373
+
374
+ File.stub(:binwrite, writer) do
375
+ assert_equal dumped.bytesize, subject.save("bf.out")
376
+ end
377
+
378
+ File.stub(:binread, reader) do
379
+ bf2 = BloomFit.load("bf.out")
380
+ assert_equal subject.size, bf2.size
381
+ assert_equal subject.hashes, bf2.hashes
382
+ end
383
+
384
+ writer.verify
385
+ reader.verify
386
+ end
387
+
353
388
  it "loads from marshalled" do
354
389
  subject.add("foo")
355
390
  subject.add("bar")
@@ -360,7 +395,8 @@ class BloomFitTest < Minitest::Spec
360
395
  assert_includes bf2, "bar"
361
396
  refute_includes bf2, "baz"
362
397
 
363
- assert subject.send(:same_parameters?, bf2)
398
+ assert_equal subject.size, bf2.size
399
+ assert_equal subject.hashes, bf2.hashes
364
400
  end
365
401
  end
366
402
  end
@@ -3,6 +3,13 @@ require "test_helper"
3
3
  class CBloomFilterTest < Minitest::Spec
4
4
  subject { CBloomFilter.new }
5
5
 
6
+ describe ".new" do
7
+ it "rejects more than two arguments" do
8
+ error = assert_raises(ArgumentError) { CBloomFilter.new(1, 2, 3) }
9
+ assert_equal "wrong number of arguments (given 3, expected 0..2)", error.message
10
+ end
11
+ end
12
+
6
13
  describe "#m" do
7
14
  it "defaults" do
8
15
  assert_equal 1000, subject.m
@@ -12,6 +19,16 @@ class CBloomFilterTest < Minitest::Spec
12
19
  bf = CBloomFilter.new(10_000)
13
20
  assert_equal 10_000, bf.m
14
21
  end
22
+
23
+ it "rejects values less than 1" do
24
+ error = assert_raises(ArgumentError) { CBloomFilter.new(-1) }
25
+ assert_equal "bit length must be >= 1", error.message
26
+ end
27
+
28
+ it "rejects values that overflow internal byte sizing" do
29
+ error = assert_raises(RangeError) { CBloomFilter.new((1 << 31) - 7) }
30
+ assert_equal "bit length is too large", error.message
31
+ end
15
32
  end
16
33
 
17
34
  describe "#k" do
@@ -23,6 +40,16 @@ class CBloomFilterTest < Minitest::Spec
23
40
  bf = CBloomFilter.new(10_000, 9)
24
41
  assert_equal 9, bf.k
25
42
  end
43
+
44
+ it "rejects values less than 1" do
45
+ error = assert_raises(ArgumentError) { CBloomFilter.new(1000, 0) }
46
+ assert_equal "hash length must be >= 1", error.message
47
+ end
48
+
49
+ it "rejects values larger than the salt table" do
50
+ error = assert_raises(ArgumentError) { CBloomFilter.new(10_000, 257) }
51
+ assert_equal "hash length must be <= 256", error.message
52
+ end
26
53
  end
27
54
 
28
55
  describe "#set_bits" do
@@ -44,6 +71,12 @@ class CBloomFilterTest < Minitest::Spec
44
71
  assert_includes subject, "bar"
45
72
  refute_includes subject, "baz"
46
73
  end
74
+
75
+ it "treats binary bytes as unsigned when hashing" do
76
+ bf = CBloomFilter.new(20, 4)
77
+ bf.add("\xFF".b)
78
+ assert_equal "\x00\x05\x05\x00".b, bf.bitmap
79
+ end
47
80
  end
48
81
 
49
82
  describe "#include?" do
@@ -82,6 +115,11 @@ class CBloomFilterTest < Minitest::Spec
82
115
  assert_includes subject, "bar"
83
116
  assert_includes subject, "baz"
84
117
  end
118
+
119
+ it "rejects incompatible filters" do
120
+ error = assert_raises(ArgumentError) { subject.merge(CBloomFilter.new(2000, 4)) }
121
+ assert_equal "bloom filters must have matching size and hash count", error.message
122
+ end
85
123
  end
86
124
 
87
125
  describe "#&" do
@@ -103,6 +141,11 @@ class CBloomFilterTest < Minitest::Spec
103
141
  assert_includes bf3, "bar"
104
142
  refute_includes bf3, "baz"
105
143
  end
144
+
145
+ it "rejects incompatible filters" do
146
+ error = assert_raises(ArgumentError) { subject & CBloomFilter.new(1000, 2) }
147
+ assert_equal "bloom filters must have matching size and hash count", error.message
148
+ end
106
149
  end
107
150
 
108
151
  describe "#|" do
@@ -124,6 +167,11 @@ class CBloomFilterTest < Minitest::Spec
124
167
  assert_includes bf3, "bar"
125
168
  assert_includes bf3, "baz"
126
169
  end
170
+
171
+ it "rejects incompatible filters" do
172
+ error = assert_raises(ArgumentError) { subject | CBloomFilter.new(2000, 4) }
173
+ assert_equal "bloom filters must have matching size and hash count", error.message
174
+ end
127
175
  end
128
176
 
129
177
  describe "#bitmap" do
@@ -154,5 +202,32 @@ class CBloomFilterTest < Minitest::Spec
154
202
  assert_includes subject, "foo"
155
203
  assert_includes subject, "bar"
156
204
  end
205
+
206
+ it "rejects a short bitmap" do
207
+ error = assert_raises(ArgumentError) { subject.load("\x00".b) }
208
+ assert_equal "bitmap length must be 126 bytes", error.message
209
+ end
210
+
211
+ it "rejects a long bitmap" do
212
+ error = assert_raises(ArgumentError) { subject.load("\x00".b * 127) }
213
+ assert_equal "bitmap length must be 126 bytes", error.message
214
+ end
215
+
216
+ it "coerces bitmap-like objects to strings before loading" do
217
+ bitmap_data = subject.bitmap
218
+ bitmap = Object.new
219
+ bitmap.define_singleton_method(:to_str) { bitmap_data }
220
+ subject.load(bitmap)
221
+ assert_equal 0, subject.set_bits
222
+ end
223
+
224
+ it "clears loaded padding bits beyond the configured size" do
225
+ bf = CBloomFilter.new(20, 4)
226
+
227
+ bf.load("\x00\x00\xF0\xFF".b)
228
+
229
+ assert_equal 0, bf.set_bits
230
+ assert_equal "\x00\x00\x00\x00".b, bf.bitmap
231
+ end
157
232
  end
158
233
  end
data/test/test_helper.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  require "minitest/autorun"
2
+ require "minitest/mock"
2
3
  require "minitest/reporters"
3
4
 
4
5
  Minitest::Reporters.use! # override with MINITEST_REPORTER env var
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bloom_fit
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ryan McGeary
@@ -24,10 +24,9 @@ extra_rdoc_files: []
24
24
  files:
25
25
  - README.md
26
26
  - ext/cbloomfilter/cbloomfilter.c
27
- - ext/cbloomfilter/crc32.h
28
27
  - ext/cbloomfilter/extconf.rb
28
+ - ext/cbloomfilter/salts.h
29
29
  - lib/bloom_fit.rb
30
- - lib/bloom_fit/configuration_mismatch.rb
31
30
  - lib/bloom_fit/version.rb
32
31
  - lib/cbloomfilter.bundle
33
32
  - test/bloom_fit_test.rb
@@ -1,76 +0,0 @@
1
- /* simple CRC32 code */
2
- /*
3
- * Copyright 2005 Aris Adamantiadis
4
- *
5
- * This file is part of the SSH Library
6
- *
7
- * The SSH Library is free software; you can redistribute it and/or modify
8
- * it under the terms of the GNU Lesser General Public License as published by
9
- * the Free Software Foundation; either version 2.1 of the License, or (at your
10
- * option) any later version.
11
- *
12
- *
13
- * The SSH Library is distributed in the hope that it will be useful, but
14
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15
- * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16
- * License for more details.
17
- *
18
- * You should have received a copy of the GNU Lesser General Public License
19
- * along with the SSH Library; see the file COPYING. If not, write to
20
- * the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21
- * MA 02111-1307, USA. */
22
-
23
- static unsigned int crc_table[] = {
24
- 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
25
- 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
26
- 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
27
- 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
28
- 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
29
- 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
30
- 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
31
- 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
32
- 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
33
- 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
34
- 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
35
- 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
36
- 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
37
- 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
38
- 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
39
- 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
40
- 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
41
- 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
42
- 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
43
- 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
44
- 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
45
- 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
46
- 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
47
- 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
48
- 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
49
- 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
50
- 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
51
- 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
52
- 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
53
- 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
54
- 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
55
- 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
56
- 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
57
- 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
58
- 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
59
- 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
60
- 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
61
- 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
62
- 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
63
- 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
64
- 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
65
- 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
66
- 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
67
- 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
68
- 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
69
- 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
70
- 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
71
- 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
72
- 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
73
- 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
74
- 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
75
- 0x2d02ef8dUL
76
- };
@@ -1,4 +0,0 @@
1
- class BloomFit
2
- class ConfigurationMismatch < ArgumentError
3
- end
4
- end