bloom_fit 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f542d198165a81ecdc9307e3d2b9a9168608197c117245cb89b087f5fde31081
4
- data.tar.gz: 60c9bed4dfbf8b6d5e8d4cb47350b9ec31ee22ce9eae3fbcc92628cc8e4aed53
3
+ metadata.gz: efa22c92049e3607485a8fcfe471b15cca6e85e6da0c7b19b65f74b9f6ad5fe9
4
+ data.tar.gz: 5e8432456b1258111671d536165217bc3e82e0e430c3bc63112abc4670f91e78
5
5
  SHA512:
6
- metadata.gz: 55e33f10d0c71aa77bece3ba974995144f44cfc644d7bbf773de9b5ea562078df4511905de6ae87f15a9b95c12975fd423c1fbe6fccfb22c4a0073b2cdf66362
7
- data.tar.gz: a2cca2d8c5c2ea66979ad93030b75fc7e64f5258650dc43b13e1cbf7080ee32ae44f77ac205b9055d3174d2e95dbe32fff61b949e27fc5f3306a3a332673bf57
6
+ metadata.gz: 72738a57ccb3a1a8989e86993490c3ba6a4f90925c834c1acd70ba104df8ef2bb318d5c66830786ba662e88df09f9ce46d7184810e3d4ec1c6b4cc0b41fcec44
7
+ data.tar.gz: 7472e370d1a66a6034ecb2f0d4720b9edd12f21e181a37cae2869e0e34c70a829366e7ee6caf880f4c4d8c789bc18bdbe2f83e6699617ccc77460320f2a2a1af
data/README.md CHANGED
@@ -4,7 +4,12 @@
4
4
  [![CI](https://github.com/rmm5t/bloom_fit/actions/workflows/ci.yml/badge.svg)](https://github.com/rmm5t/bloom_fit/actions/workflows/ci.yml)
5
5
  [![Gem Downloads](https://img.shields.io/gem/dt/bloom_fit.svg)](https://rubygems.org/gems/bloom_fit)
6
6
 
7
- BloomFit provides a MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but provides a better hashing distribution by using DJB2 over CRC32, avoids the need to supply a seed, removes counting abilities, improves performance for very large datasets, and will automatically calculate the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate.
7
+ BloomFit provides an MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but differs in the following ways:
8
+
9
+ - uses DJB2 over CRC32 yielding better hash distribution
10
+ - improves performance for very large datasets
11
+ - avoids the need to supply a seed
12
+ - automatically calculates the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate
8
13
 
9
14
  A [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter) is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positives are possible, but false negatives are not. Instead of using k different hash functions, this implementation uses a DJB2 hash with k seeds from the CRC table.
10
15
 
@@ -13,8 +18,6 @@ Performance of the Bloom filter depends on the following:
13
18
  - size of the bit array
14
19
  - number of hash functions
15
20
 
16
- BloomFit is a fork of [bloomfilter-rb].
17
-
18
21
  ## Resources
19
22
 
20
23
  - Background: [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter)
@@ -40,11 +43,11 @@ bf["bird"] = "bar"
40
43
  bf["bird"] # => true
41
44
  bf["mouse"] # => false
42
45
 
43
- bf.stats
44
- # => Number of filter bits (m): 3600
45
- # => Number of set bits (n): 20
46
- # => Number of filter hashes (k) : 10
47
- # => Predicted false positive rate = 0.00%
46
+ puts bf.stats
47
+ # Number of filter bits (m): 3600
48
+ # Number of set bits (n): 20
49
+ # Number of filter hashes (k) : 10
50
+ # Predicted false positive rate = 0.00%
48
51
  ```
49
52
 
50
53
  If you'd like more control over the traditional inputs like bit size and the number of hashes:
@@ -62,11 +65,11 @@ bf["bird"] = "bar"
62
65
  bf["bird"] # => true
63
66
  bf["mouse"] # => false
64
67
 
65
- bf.stats
66
- # => Number of filter bits (m): 100
67
- # => Number of set bits (n): 4
68
- # => Number of filter hashes (k) : 2
69
- # => Predicted false positive rate = 10.87%
68
+ puts bf.stats
69
+ # Number of filter bits (m): 100
70
+ # Number of set bits (n): 4
71
+ # Number of filter hashes (k) : 2
72
+ # Predicted false positive rate = 10.87%
70
73
  ```
71
74
 
72
75
  ## Credits
@@ -17,8 +17,7 @@ static unsigned int *salts = crc_table;
17
17
  static VALUE cBloomFilter;
18
18
 
19
19
  struct BloomFilter {
20
- int m; /* # of buckets in a bloom filter */
21
- int b; /* # of bits in a bloom filter bucket */
20
+ int m; /* # of bits in a bloom filter */
22
21
  int k; /* # of hash functions */
23
22
  unsigned char *ptr; /* bits data */
24
23
  int bytes; /* size of byte data */
@@ -72,7 +71,6 @@ static VALUE bf_alloc(VALUE klass) {
72
71
  VALUE obj = TypedData_Make_Struct(klass, struct BloomFilter, &bf_type, bf);
73
72
 
74
73
  bf->m = 0;
75
- bf->b = 0;
76
74
  bf->k = 0;
77
75
  bf->ptr = NULL;
78
76
  bf->bytes = 0;
@@ -80,52 +78,24 @@ static VALUE bf_alloc(VALUE klass) {
80
78
  return obj;
81
79
  }
82
80
 
83
- void bucket_unset(struct BloomFilter *bf, int index) {
84
- int byte_offset = (index * bf->b) / 8;
85
- int bit_offset = (index * bf->b) % 8;
86
- unsigned int c = bf->ptr[byte_offset];
87
- c += bf->ptr[byte_offset + 1] << 8;
88
- unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
89
- if ((c & mask) == 0) {
90
- // do nothing
91
- } else {
92
- // reduce the counter: 11 00 => 10 00 (suppose bf->b is 2)
93
- c -= (1 << bit_offset) & ((1 << 8) -1);
94
- // shift the bitmap right by 1 bit: 10 00 => 01 00
95
- c = (~mask & c) | ((c & mask) >> (bit_offset + 1) << bit_offset);
96
-
97
- bf->ptr[byte_offset] = c & ((1 << 8) - 1);
98
- bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
99
- }
100
- }
81
+ static void bucket_set(struct BloomFilter *bf, int index) {
82
+ int byte_offset = index / 8;
83
+ int bit_offset = index % 8;
101
84
 
102
- void bucket_set(struct BloomFilter *bf, int index) {
103
- int byte_offset = (index * bf->b) / 8;
104
- int bit_offset = (index * bf->b) % 8;
105
- unsigned int c = bf->ptr[byte_offset];
106
- c += bf->ptr[byte_offset + 1] << 8;
107
- unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
108
- if ((c & mask) != mask) {
109
- c = c + ((1 << bit_offset) & ((1 << 8) -1)) | c;
110
- bf->ptr[byte_offset] = c & ((1 << 8) - 1);
111
- bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
112
- }
85
+ bf->ptr[byte_offset] |= (unsigned char) (1U << bit_offset);
113
86
  }
114
87
 
115
- int bucket_check(struct BloomFilter *bf, int index) {
116
- int byte_offset = (index * bf->b) / 8;
117
- int bit_offset = (index * bf->b) % 8;
118
- unsigned int c = bf->ptr[byte_offset];
119
- c += bf->ptr[byte_offset + 1] << 8;
88
+ static int bucket_check(struct BloomFilter *bf, int index) {
89
+ int byte_offset = index / 8;
90
+ int bit_offset = index % 8;
120
91
 
121
- unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
122
- return (c & mask) >> bit_offset;
92
+ return (bf->ptr[byte_offset] >> bit_offset) & 1;
123
93
  }
124
94
 
125
95
  static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
126
96
  struct BloomFilter *bf;
127
97
  VALUE arg1, arg2;
128
- int m, k, b;
98
+ int m, k;
129
99
 
130
100
  bf = bf_ptr(self);
131
101
 
@@ -143,21 +113,20 @@ static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
143
113
 
144
114
  m = FIX2INT(arg1);
145
115
  k = FIX2INT(arg2);
146
- b = 1;
147
116
 
148
117
  if (m < 1)
149
118
  rb_raise(rb_eArgError, "array size");
150
119
  if (k < 1)
151
120
  rb_raise(rb_eArgError, "hash length");
152
121
 
153
- bf->b = b;
154
122
  bf->m = m;
155
123
  bf->k = k;
156
124
 
157
125
  ruby_xfree(bf->ptr);
158
126
  bf->ptr = NULL;
159
127
  bf->bytes = 0;
160
- bf->bytes = ((m * b) + 15) / 8;
128
+ /* Preserve the existing serialized bitmap length, including one padding byte. */
129
+ bf->bytes = (m + 15) / 8;
161
130
  bf->ptr = ALLOC_N(unsigned char, bf->bytes);
162
131
 
163
132
  /* initialize the bits with zeros */
@@ -194,7 +163,7 @@ static VALUE bf_set_bits(VALUE self){
194
163
  return INT2FIX(count);
195
164
  }
196
165
 
197
- static VALUE bf_insert(VALUE self, VALUE key) {
166
+ static VALUE bf_add(VALUE self, VALUE key) {
198
167
  VALUE skey;
199
168
  unsigned long hash;
200
169
  int index;
@@ -268,57 +237,34 @@ static VALUE bf_or(VALUE self, VALUE other) {
268
237
  return obj;
269
238
  }
270
239
 
271
- static VALUE bf_include(int argc, VALUE* argv, VALUE self) {
240
+ static VALUE bf_include(VALUE self, VALUE key) {
241
+ VALUE skey;
272
242
  unsigned long hash;
273
- int i, len, m, k;
274
243
  int index;
275
- long tests_idx, vlen;
244
+ int i, len, m, k;
276
245
  char *ckey;
277
- VALUE tests, key, skey;
278
- struct BloomFilter *bf;
246
+ struct BloomFilter *bf = bf_ptr(self);
279
247
 
280
- rb_scan_args(argc, argv, "*", &tests);
248
+ skey = rb_obj_as_string(key);
249
+ ckey = StringValuePtr(skey);
250
+ len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
281
251
 
282
- bf = bf_ptr(self);
283
- vlen = RARRAY_LEN(tests);
284
- for (tests_idx = 0; tests_idx < vlen; tests_idx++) {
285
- key = rb_ary_entry(tests, tests_idx);
286
- skey = rb_obj_as_string(key);
287
- ckey = StringValuePtr(skey);
288
- len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
289
-
290
- m = bf->m;
291
- k = bf->k;
292
-
293
- hash = (unsigned long) djb2(ckey, len);
294
- for (i = 0; i <= k - 1; i++) {
295
- index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
296
-
297
- /* check the bit at the index */
298
- if (!bucket_check(bf, index)) {
299
- return Qfalse; /* i.e., it is a new entry ; escape the loop */
300
- }
252
+ m = bf->m;
253
+ k = bf->k;
254
+
255
+ hash = (unsigned long) djb2(ckey, len);
256
+ for (i = 0; i <= k - 1; i++) {
257
+ index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
258
+
259
+ /* check the bit at the index */
260
+ if (!bucket_check(bf, index)) {
261
+ return Qfalse; /* i.e., it is a new entry ; escape the loop */
301
262
  }
302
263
  }
303
264
 
304
265
  return Qtrue;
305
266
  }
306
267
 
307
- static VALUE bf_to_s(VALUE self) {
308
- struct BloomFilter *bf = bf_ptr(self);
309
- unsigned char *ptr;
310
- int i;
311
- VALUE str;
312
-
313
- str = rb_str_new(0, bf->m);
314
-
315
- ptr = (unsigned char *) RSTRING_PTR(str);
316
- for (i = 0; i < bf->m; i++)
317
- *ptr++ = bucket_check(bf, i) ? '1' : '0';
318
-
319
- return str;
320
- }
321
-
322
268
  static VALUE bf_bitmap(VALUE self) {
323
269
  struct BloomFilter *bf = bf_ptr(self);
324
270
 
@@ -346,15 +292,13 @@ void Init_cbloomfilter(void) {
346
292
  rb_define_method(cBloomFilter, "m", bf_m, 0);
347
293
  rb_define_method(cBloomFilter, "k", bf_k, 0);
348
294
  rb_define_method(cBloomFilter, "set_bits", bf_set_bits, 0);
349
- /* rb_define_method(cBloomFilter, "s", bf_s, 0); */
350
- rb_define_method(cBloomFilter, "insert", bf_insert, 1);
351
- rb_define_method(cBloomFilter, "include?", bf_include, -1);
295
+ rb_define_method(cBloomFilter, "add", bf_add, 1);
296
+ rb_define_method(cBloomFilter, "include?", bf_include, 1);
352
297
  rb_define_method(cBloomFilter, "clear", bf_clear, 0);
353
- rb_define_method(cBloomFilter, "merge!", bf_merge, 1);
298
+ rb_define_method(cBloomFilter, "merge", bf_merge, 1);
354
299
  rb_define_method(cBloomFilter, "&", bf_and, 1);
355
300
  rb_define_method(cBloomFilter, "|", bf_or, 1);
356
301
 
357
- rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
358
302
  rb_define_method(cBloomFilter, "bitmap", bf_bitmap, 0);
359
303
  rb_define_method(cBloomFilter, "load", bf_load, 1);
360
304
 
@@ -0,0 +1,4 @@
1
+ class BloomFit
2
+ class ConfigurationMismatch < ArgumentError
3
+ end
4
+ end
@@ -1,3 +1,3 @@
1
1
  class BloomFit
2
- VERSION = "0.2.0".freeze
2
+ VERSION = "0.3.0".freeze
3
3
  end
data/lib/bloom_fit.rb CHANGED
@@ -1,63 +1,110 @@
1
+ require "forwardable"
2
+
1
3
  require "cbloomfilter"
4
+ require "bloom_fit/configuration_mismatch"
2
5
  require "bloom_fit/version"
3
6
 
4
7
  class BloomFit
5
- class ConfigurationMismatch < ArgumentError
6
- end
8
+ extend Forwardable
7
9
 
8
10
  attr_reader :bf
9
11
 
12
+ # @param size [Integer] number of buckets in a bloom filter
13
+ # @param hashes [Integer] number of hash functions
10
14
  def initialize(size: 1_000, hashes: 4)
11
- @size = size
12
- @hashes = hashes
15
+ @bf = CBloomFilter.new(size, hashes)
16
+ end
17
+
18
+ def_delegators :@bf, :m, :k, :bitmap, :include?, :clear, :set_bits
19
+
20
+ alias size m
21
+ alias hashes k
22
+ alias key? include?
23
+ alias [] include?
24
+ alias n set_bits
13
25
 
14
- # arg 1: m => size : number of buckets in a bloom filter
15
- # arg 2: k => hashes : number of hash functions
16
- @bf = CBloomFilter.new(@size, @hashes)
26
+ def empty?
27
+ set_bits.zero?
17
28
  end
18
29
 
19
- def insert(key)
20
- @bf.insert(key)
30
+ # Adds the given key to the set and returns +self+. Mimics the behavior of
31
+ # +Set#add+
32
+ def add(key)
33
+ @bf.add(key)
34
+ self
21
35
  end
22
- alias []= insert
36
+ alias << add
23
37
 
24
- def include?(*keys)
25
- @bf.include?(*keys)
38
+ # Adds the given key to the set if the value is truthy. Mimics the behavior of
39
+ # +Hash#[]=+
40
+ def []=(key, value)
41
+ @bf.add(key) if value
26
42
  end
27
- alias key? include?
28
- alias [] include?
29
43
 
30
- def clear = @bf.clear
31
- def size = @bf.set_bits
32
- def merge!(other) = @bf.merge!(other.bf)
44
+ # Adds the given key to the set and returns +self+. If the key is already
45
+ # in the set, returns +nil+. Mimics the behavior of +Set#add?+
46
+ def add?(key)
47
+ return nil if include?(key) # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
48
+ add(key)
49
+ end
50
+
51
+ # Returns a string of the set bits in hex format
52
+ def to_hex
53
+ length = ((size / 8.0).ceil * 8 / 4)
54
+ bitmap.unpack1("H*")[0...length]
55
+ end
33
56
 
34
- # Returns the number of bits that are set to 1 in the filter.
35
- def set_bits
36
- @bf.set_bits
57
+ # Returns a string of the set bits in binary format
58
+ def to_binary
59
+ bitmap.unpack1("B*")[0...size]
37
60
  end
38
61
 
39
- # Computes the intersection of two Bloom filters.
40
- # It assumes that both filters have the same size -
41
- # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
62
+ # Adds the set from another BloomFit filter or adds all the elements from an
63
+ # enumerable. Mimics the behavior of +Set#merge+
64
+ def merge(other)
65
+ if other.is_a?(BloomFit)
66
+ raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
67
+ @bf.merge(other.bf)
68
+ elsif other.respond_to?(:each_key)
69
+ other.each { |k, v| add(k) if v }
70
+ elsif other.is_a?(Enumerable)
71
+ other.each { |k| add(k) }
72
+ else
73
+ raise ArgumentError, "value must be enumerable or another BloomFit filter"
74
+ end
75
+ end
76
+
77
+ # Computes the intersection of two Bloom filters. It requires that both
78
+ # filters have the same size; otherwise, +BloomFit::ConfigurationMismatch+
79
+ # is raised.
42
80
  def &(other)
43
81
  raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
44
- result = self.class.new
45
- result.instance_variable_set(:@bf, @bf.&(other.bf))
46
- result
82
+ self.class.new(size:, hashes:).tap do |result|
83
+ result.instance_variable_set(:@bf, @bf.&(other.bf))
84
+ end
47
85
  end
86
+ alias intersection &
48
87
 
49
- # Computes the union of two Bloom filters.
50
- # It assumes that both filters have the same size -
51
- # if this is not true +BloomFit::ConfigurationMismatch+ is raised.
88
+ # Computes the union of two Bloom filters. It requires that both filters
89
+ # have the same size; otherwise, +BloomFit::ConfigurationMismatch+ is
90
+ # raised.
52
91
  def |(other)
53
92
  raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
54
- result = self.class.new
55
- result.instance_variable_set(:@bf, @bf.|(other.bf))
56
- result
93
+ self.class.new(size:, hashes:).tap do |result|
94
+ result.instance_variable_set(:@bf, @bf.|(other.bf))
95
+ end
57
96
  end
97
+ alias union |
58
98
 
59
- def bitmap
60
- @bf.bitmap
99
+ def stats
100
+ fpr = ((1.0 - Math.exp(-(k * n).to_f / m))**k) * 100
101
+
102
+ (+"").tap do |s|
103
+ s << format("Number of filter buckets (m): %d\n", m)
104
+ s << format("Number of set bits (n): %d\n", n)
105
+ s << format("Number of filter hashes (k): %d\n", k)
106
+ s << format("Predicted false positive rate: %.2f%%\n", fpr)
107
+ end
61
108
  end
62
109
 
63
110
  def marshal_load(ary)
@@ -68,11 +115,11 @@ class BloomFit
68
115
  end
69
116
 
70
117
  def marshal_dump
71
- [@size, @hashes, @bf.bitmap]
118
+ [size, hashes, bitmap]
72
119
  end
73
120
 
74
121
  def self.load(filename)
75
- Marshal.load(File.open(filename, "r"))
122
+ Marshal.load(File.open(filename, "r")) # rubocop:disable Security/MarshalLoad
76
123
  end
77
124
 
78
125
  def save(filename)
@@ -81,14 +128,6 @@ class BloomFit
81
128
  end
82
129
  end
83
130
 
84
- def stats
85
- fp = ((1.0 - Math.exp(-(@hashes * size).to_f / @size))**@hashes) * 100
86
- printf "Number of filter buckets (m): %d\n", @size
87
- printf "Number of set bits (n): %d\n", set_bits
88
- printf "Number of filter hashes (k) : %d\n", @hashes
89
- printf "Predicted false positive rate = %.2f%%\n", fp
90
- end
91
-
92
131
  protected
93
132
 
94
133
  # Returns true if parameters of the +other+ filter are
Binary file
@@ -0,0 +1,344 @@
1
+ require "test_helper"
2
+
3
+ class BloomFitTest < Minitest::Spec
4
+ subject { BloomFit.new(size: 100, hashes: 4) }
5
+
6
+ describe "#empty?" do
7
+ it "returns true when nothing set" do
8
+ assert_equal true, subject.empty? # rubocop:disable Minitest/AssertTruthy
9
+ assert_empty subject
10
+ end
11
+
12
+ it "returns false when something set" do
13
+ subject << "key"
14
+ assert_equal false, subject.empty? # rubocop:disable Minitest/RefuteFalse
15
+ refute_empty subject
16
+ end
17
+ end
18
+
19
+ describe "#add" do
20
+ it "adds the key and returns self" do
21
+ assert_equal subject, subject.add("test1")
22
+ assert_equal subject, subject.add("test2")
23
+ assert_includes subject, "test1"
24
+ assert_includes subject, "test2"
25
+ end
26
+
27
+ it "is aliased as #<<" do
28
+ subject << "test1" << "test2"
29
+ assert_includes subject, "test1"
30
+ assert_includes subject, "test2"
31
+ end
32
+
33
+ it "is aliased as #[]=, and handles truthy/falsey values" do
34
+ subject["dog"] = :bar
35
+ subject["cat"] = :foo
36
+ assert_includes subject, "dog"
37
+ assert_includes subject, "cat"
38
+
39
+ subject["bat"] = nil
40
+ subject["pig"] = false
41
+ refute_includes subject, "bat"
42
+ refute_includes subject, "pig"
43
+ end
44
+
45
+ it "casts using #to_s as necessary" do
46
+ subject << :symbol << true << 12_345
47
+
48
+ assert_includes subject, "symbol"
49
+ assert_includes subject, :symbol
50
+ assert_includes subject, "true"
51
+ assert_includes subject, "12345"
52
+ assert_includes subject, 12_345
53
+ end
54
+ end
55
+
56
+ describe "#add?" do
57
+ it "adds new key and returns self" do
58
+ assert_equal subject, subject.add("test1")
59
+ assert_equal subject, subject.add("test2")
60
+ assert_includes subject, "test1"
61
+ assert_includes subject, "test2"
62
+ end
63
+
64
+ it "return nil if the key already exists" do
65
+ subject << "test1"
66
+ subject << "test2"
67
+ assert_includes subject, "test1"
68
+ assert_includes subject, "test2"
69
+ assert_nil subject.add?("test1")
70
+ assert_nil subject.add?("test2")
71
+ end
72
+ end
73
+
74
+ describe "#include?" do
75
+ it "returns true when a key is in the set" do
76
+ subject << "test1"
77
+ subject << "test2"
78
+ assert_equal true, subject.include?("test1") # rubocop:disable Minitest/AssertTruthy
79
+ assert_equal true, subject.include?("test2") # rubocop:disable Minitest/AssertTruthy
80
+ end
81
+
82
+ it "returns false when a key is not in the set" do
83
+ assert_equal false, subject.include?("test") # rubocop:disable Minitest/RefuteFalse
84
+ assert_equal false, subject.include?("nada") # rubocop:disable Minitest/RefuteFalse
85
+ end
86
+
87
+ it "is aliased as #key?" do
88
+ subject << "test1"
89
+ subject << "test2"
90
+ assert subject.key?("test1")
91
+ assert subject.key?("test2")
92
+ refute subject.key?("test3")
93
+ end
94
+
95
+ it "is aliased as #[]" do
96
+ subject << "test1"
97
+ subject << "test2"
98
+ assert subject["test1"]
99
+ assert subject["test2"]
100
+ refute subject["test3"]
101
+ end
102
+ end
103
+
104
+ describe "#clear" do
105
+ it "zeroes the bits" do
106
+ subject.add("test")
107
+ assert_includes subject, "test"
108
+ assert_includes subject.to_binary, "1"
109
+ subject.clear
110
+ refute_includes subject, "test"
111
+ refute_includes subject.to_binary, "1"
112
+ end
113
+ end
114
+
115
+ describe "#set_bits" do
116
+ it "returns the number of bits set to 1" do
117
+ bf = BloomFit.new(size: 100, hashes: 4)
118
+ bf.add("bits")
119
+ assert_equal 4, bf.set_bits
120
+
121
+ bf = BloomFit.new(size: 100, hashes: 1)
122
+ bf.add("bits")
123
+ assert_equal 1, bf.set_bits
124
+ end
125
+ end
126
+
127
+ describe "#bitmap" do
128
+ it "returns a binary bitmap of all zeros when empty (including a terminating byte)" do
129
+ bf = BloomFit.new(size: 16)
130
+ assert_equal "\x00\x00\x00".b, bf.bitmap
131
+ end
132
+
133
+ it "returns a binary bitmap representing the set" do
134
+ bf = BloomFit.new(size: 16, hashes: 4)
135
+ bf.add("something")
136
+ assert_equal "(\x82\x00".b, bf.bitmap
137
+ end
138
+
139
+ it "returns a binary bitmap representing the set even if not a multiple of 8 bits" do
140
+ bf = BloomFit.new(size: 20, hashes: 4)
141
+ bf.add("wow")
142
+ assert_equal "\x04\x14\x00\x00".b, bf.bitmap
143
+ end
144
+ end
145
+
146
+ describe "#to_hex" do
147
+ it "returns a hex bitmap of all zeros when empty" do
148
+ bf = BloomFit.new(size: 16)
149
+ assert_equal "0000", bf.to_hex
150
+ end
151
+
152
+ it "returns a hex bitmap of all zeros when empty if not a multiple of 8 bits" do
153
+ bf = BloomFit.new(size: 18)
154
+ assert_equal "000000", bf.to_hex
155
+ end
156
+
157
+ it "returns a hex bitmap representing the set" do
158
+ bf = BloomFit.new(size: 16, hashes: 4)
159
+ bf.add("cool")
160
+ assert_equal "1441", bf.to_hex
161
+ end
162
+ end
163
+
164
+ describe "#to_binary" do
165
+ it "returns a binary bitmap of all zeros when empty" do
166
+ bf = BloomFit.new(size: 16)
167
+ assert_equal "0000000000000000", bf.to_binary
168
+ end
169
+
170
+ it "returns a binary bitmap of all zeros when empty if not a multiple of 8 bits" do
171
+ bf = BloomFit.new(size: 19)
172
+ assert_equal "0000000000000000000", bf.to_binary
173
+ end
174
+
175
+ it "returns a binary bitmap representing the set" do
176
+ bf = BloomFit.new(size: 16, hashes: 4)
177
+ bf << "cool" << "cat"
178
+ assert_equal "1001011001101001", bf.to_binary
179
+ end
180
+ end
181
+
182
+ describe "#merge" do
183
+ it "merges another BloomFit filter" do
184
+ bf1 = BloomFit.new(size: 100, hashes: 2)
185
+ bf2 = BloomFit.new(size: 100, hashes: 2)
186
+ bf1 << "mouse"
187
+ bf2 << "cat" << "dog"
188
+ refute_includes bf1, "cat"
189
+ refute_includes bf1, "dog"
190
+ bf1.merge(bf2)
191
+ assert_includes bf1, "mouse"
192
+ assert_includes bf1, "cat"
193
+ assert_includes bf1, "dog"
194
+ refute_includes bf2, "mouse"
195
+ assert_includes bf2, "cat"
196
+ assert_includes bf2, "dog"
197
+ end
198
+
199
+ it "merges an array" do
200
+ subject << "mouse"
201
+ subject.merge %i[cat dog]
202
+ assert_includes subject, "mouse"
203
+ assert_includes subject, "cat"
204
+ assert_includes subject, "dog"
205
+ end
206
+
207
+ it "merges a set" do
208
+ subject << "mouse"
209
+ subject.merge Set.new(%w[cat dog])
210
+ assert_includes subject, "mouse"
211
+ assert_includes subject, "cat"
212
+ assert_includes subject, "dog"
213
+ end
214
+
215
+ it "merges a hash ignoring falsey values" do
216
+ subject << "mouse"
217
+ subject.merge({ cat: 1, dog: 2, ant: false, bug: nil })
218
+ assert_includes subject, "mouse"
219
+ assert_includes subject, "cat"
220
+ assert_includes subject, "dog"
221
+ refute_includes subject, "ant"
222
+ refute_includes subject, "bug"
223
+ end
224
+
225
+ it "raises when merge is between incompatible filters" do
226
+ bf1 = BloomFit.new(size: 10)
227
+ bf2 = BloomFit.new(size: 20)
228
+ assert_raises(BloomFit::ConfigurationMismatch) { bf1.merge(bf2) }
229
+ end
230
+ end
231
+
232
+ describe "#&" do
233
+ it "returns intersection of both filters" do
234
+ bf1 = BloomFit.new(size: 35, hashes: 4)
235
+ bf1.add("test")
236
+ bf1.add("test1")
237
+
238
+ bf2 = BloomFit.new(size: 35, hashes: 4)
239
+ bf2.add("test")
240
+ bf2.add("test2")
241
+
242
+ bf3 = bf1 & bf2
243
+ assert_equal 35, bf3.size
244
+ assert_equal 4, bf3.hashes
245
+ assert_includes bf3, "test"
246
+ refute_includes bf3, "test1"
247
+ refute_includes bf3, "test2"
248
+ end
249
+
250
+ it "is aliased as #intersection" do
251
+ bf1 = BloomFit.new(size: 20, hashes: 4)
252
+ bf1.add("test")
253
+ bf1.add("test1")
254
+
255
+ bf2 = BloomFit.new(size: 20, hashes: 4)
256
+ bf2.add("test")
257
+
258
+ bf3 = bf1.intersection(bf2)
259
+ assert_includes bf3, "test"
260
+ refute_includes bf3, "test1"
261
+ end
262
+
263
+ it "raises when intersection is between incompatible filters" do
264
+ bf1 = BloomFit.new(size: 10)
265
+ bf2 = BloomFit.new(size: 20)
266
+ assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
267
+
268
+ bf1 = BloomFit.new(size: 10, hashes: 2)
269
+ bf2 = BloomFit.new(size: 10, hashes: 4)
270
+ assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
271
+ end
272
+ end
273
+
274
+ describe "#|" do
275
+ it "returns union with other filter" do
276
+ bf1 = BloomFit.new
277
+ bf1.add("test")
278
+ bf1.add("test1")
279
+
280
+ bf2 = BloomFit.new
281
+ bf2.add("test")
282
+ bf2.add("test2")
283
+
284
+ bf3 = bf1 | bf2
285
+ assert_includes bf3, "test"
286
+ assert_includes bf3, "test1"
287
+ assert_includes bf3, "test2"
288
+ end
289
+
290
+ it "is aliased as #union" do
291
+ bf1 = BloomFit.new(size: 20, hashes: 4)
292
+ bf1.add("test")
293
+ bf1.add("test1")
294
+
295
+ bf2 = BloomFit.new(size: 20, hashes: 4)
296
+ bf2.add("test")
297
+
298
+ bf3 = bf1.union(bf2)
299
+ assert_includes bf3, "test"
300
+ assert_includes bf3, "test1"
301
+ end
302
+
303
+ it "raises when union is between incompatible filters" do
304
+ bf1 = BloomFit.new(size: 10)
305
+ bf2 = BloomFit.new(size: 20)
306
+ assert_raises(BloomFit::ConfigurationMismatch) { bf1 | bf2 }
307
+ end
308
+ end
309
+
310
+ describe "#stats" do
311
+ it "returns current stats" do
312
+ bf = BloomFit.new(size: 10, hashes: 3)
313
+ expected = <<~STATS
314
+ Number of filter buckets (m): 10
315
+ Number of set bits (n): 0
316
+ Number of filter hashes (k): 3
317
+ Predicted false positive rate: 0.00%
318
+ STATS
319
+ assert_equal expected, bf.stats
320
+ end
321
+ end
322
+
323
+ describe "serialization" do
324
+ after { File.unlink("bf.out") }
325
+
326
+ it "marshalls" do
327
+ bf = BloomFit.new
328
+ assert bf.save("bf.out")
329
+ end
330
+
331
+ it "loads from marshalled" do
332
+ subject.add("foo")
333
+ subject.add("bar")
334
+ subject.save("bf.out")
335
+
336
+ bf2 = BloomFit.load("bf.out")
337
+ assert_includes bf2, "foo"
338
+ assert_includes bf2, "bar"
339
+ refute_includes bf2, "baz"
340
+
341
+ assert subject.send(:same_parameters?, bf2)
342
+ end
343
+ end
344
+ end
@@ -0,0 +1,6 @@
1
+ require "minitest/autorun"
2
+ require "minitest/reporters"
3
+
4
+ Minitest::Reporters.use! # override with MINITEST_REPORTER env var
5
+
6
+ require "bloom_fit"
metadata CHANGED
@@ -1,22 +1,22 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bloom_fit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
- - Ilya Grigorik
8
- - Tatsuya Mori
9
7
  - Ryan McGeary
10
8
  - Beshad Talayeminaei
9
+ - Ilya Grigorik
10
+ - Tatsuya Mori
11
11
  bindir: bin
12
12
  cert_chain: []
13
13
  date: 1980-01-02 00:00:00.000000000 Z
14
14
  dependencies: []
15
15
  email:
16
- - ilya@grigorik.com
17
- - valdzone@gmail.com
18
16
  - ryan@mcgeary.org
19
17
  - 'btalayeminaei@gmail.com '
18
+ - ilya@grigorik.com
19
+ - valdzone@gmail.com
20
20
  executables: []
21
21
  extensions:
22
22
  - ext/cbloomfilter/extconf.rb
@@ -27,10 +27,11 @@ files:
27
27
  - ext/cbloomfilter/crc32.h
28
28
  - ext/cbloomfilter/extconf.rb
29
29
  - lib/bloom_fit.rb
30
+ - lib/bloom_fit/configuration_mismatch.rb
30
31
  - lib/bloom_fit/version.rb
31
32
  - lib/cbloomfilter.bundle
32
- - spec/bloom_fit_spec.rb
33
- - spec/helper.rb
33
+ - test/bloom_fit_test.rb
34
+ - test/test_helper.rb
34
35
  homepage: https://github.com/rmm5t/bloom_fit
35
36
  licenses: []
36
37
  metadata:
@@ -1,129 +0,0 @@
1
- require "helper"
2
-
3
- describe BloomFit do
4
- it "clears" do
5
- bf = BloomFit.new(size: 100, hashes: 2)
6
- bf.insert("test")
7
- expect(bf.include?("test")).to be true
8
- bf.clear
9
- expect(bf.include?("test")).to be false
10
- end
11
-
12
- it "merges" do
13
- bf1 = BloomFit.new(size: 100, hashes: 2)
14
- bf2 = BloomFit.new(size: 100, hashes: 2)
15
- bf2.insert("test")
16
- expect(bf1.include?("test")).to be false
17
- bf1.merge!(bf2)
18
- expect(bf1.include?("test")).to be true
19
- expect(bf2.include?("test")).to be true
20
- end
21
-
22
- it "tests set membership" do
23
- bf = BloomFit.new(size: 100, hashes: 2)
24
- bf.insert("test")
25
- bf.insert("test1")
26
-
27
- expect(bf.include?("test")).to be true
28
- expect(bf.include?("abcd")).to be false
29
- expect(bf.include?("test", "test1")).to be true
30
- expect(bf.include?("test1", "abcd")).to be false
31
- end
32
-
33
- it "works with any object's to_s" do
34
- subject.insert(:test)
35
- subject.insert(:test1)
36
- subject.insert(12_345)
37
-
38
- expect(subject.include?("test")).to be true
39
- expect(subject.include?("abcd")).to be false
40
- expect(subject.include?("12345")).to be true
41
- end
42
-
43
- it "returns the number of bits set to 1" do
44
- bf = BloomFit.new(hashes: 4)
45
- bf.insert("test")
46
- expect(bf.set_bits).to eq 4
47
-
48
- bf = BloomFit.new(hashes: 1)
49
- bf.insert("test")
50
- expect(bf.set_bits).to eq 1
51
- end
52
-
53
- it "returns intersection with other filter" do
54
- bf1 = BloomFit.new
55
- bf1.insert("test")
56
- bf1.insert("test1")
57
-
58
- bf2 = BloomFit.new
59
- bf2.insert("test")
60
- bf2.insert("test2")
61
-
62
- bf3 = bf1 & bf2
63
- expect(bf3.include?("test")).to be true
64
- expect(bf3.include?("test1")).to be false
65
- expect(bf3.include?("test2")).to be false
66
- end
67
-
68
- it "raises an exception when intersection is to be computed for incompatible filters" do
69
- bf1 = BloomFit.new(size: 10)
70
- bf1.insert("test")
71
-
72
- bf2 = BloomFit.new(size: 20)
73
- bf2.insert("test")
74
-
75
- expect { bf1 & bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
76
- end
77
-
78
- it "returns union with other filter" do
79
- bf1 = BloomFit.new
80
- bf1.insert("test")
81
- bf1.insert("test1")
82
-
83
- bf2 = BloomFit.new
84
- bf2.insert("test")
85
- bf2.insert("test2")
86
-
87
- bf3 = bf1 | bf2
88
- expect(bf3.include?("test")).to be true
89
- expect(bf3.include?("test1")).to be true
90
- expect(bf3.include?("test2")).to be true
91
- end
92
-
93
- it "raises an exception when union is to be computed for incompatible filters" do
94
- bf1 = BloomFit.new(size: 10)
95
- bf1.insert("test")
96
-
97
- bf2 = BloomFit.new(size: 20)
98
- bf2.insert("test")
99
-
100
- expect { bf1 | bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
101
- end
102
-
103
- it "outputs current stats" do
104
- subject.insert("test")
105
- expect { subject.stats }.not_to raise_error
106
- end
107
-
108
- context "serialization" do
109
- after { File.unlink("bf.out") }
110
-
111
- it "marshalls" do
112
- bf = BloomFit.new
113
- expect { bf.save("bf.out") }.not_to raise_error
114
- end
115
-
116
- it "loads from marshalled" do
117
- subject.insert("foo")
118
- subject.insert("bar")
119
- subject.save("bf.out")
120
-
121
- bf2 = BloomFit.load("bf.out")
122
- expect(bf2.include?("foo")).to be true
123
- expect(bf2.include?("bar")).to be true
124
- expect(bf2.include?("baz")).to be false
125
-
126
- expect(subject.send(:same_parameters?, bf2)).to be true
127
- end
128
- end
129
- end
data/spec/helper.rb DELETED
@@ -1 +0,0 @@
1
- require "bloom_fit"