bloom_fit 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -13
- data/ext/cbloomfilter/cbloomfilter.c +33 -89
- data/lib/bloom_fit/configuration_mismatch.rb +4 -0
- data/lib/bloom_fit/version.rb +1 -1
- data/lib/bloom_fit.rb +193 -47
- data/lib/cbloomfilter.bundle +0 -0
- data/test/bloom_fit_test.rb +344 -0
- data/test/test_helper.rb +6 -0
- metadata +8 -7
- data/spec/bloom_fit_spec.rb +0 -129
- data/spec/helper.rb +0 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cd631cdb483e0a84fa05d56eb962fda0f7c7d7a0b002ea708024ce82505a9054
|
|
4
|
+
data.tar.gz: ee781997465d6f5b590828082e4fadd5b00768298bbdec7845b9f07c3d046549
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7862f2d0189bae865c6fc5e7c7ad24f5c7ab0420415a455a1a0b130835d639c536cb8925b08219eab7dd7a10db1e9299b2868019d3e2259db4dce96de01e50a2
|
|
7
|
+
data.tar.gz: 41cb7f2fcb8cf80f5345785ce0110e242a29fbe6177284b13b701973ec7b0e7010d788585e406f77712f7ee284ff308633fe060e492b0e153a4a5598658fd465
|
data/README.md
CHANGED
|
@@ -4,7 +4,12 @@
|
|
|
4
4
|
[](https://github.com/rmm5t/bloom_fit/actions/workflows/ci.yml)
|
|
5
5
|
[](https://rubygems.org/gems/bloom_fit)
|
|
6
6
|
|
|
7
|
-
BloomFit provides a MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but
|
|
7
|
+
BloomFit provides an MRI/C-based non-counting bloom filter for use in your Ruby projects. It is heavily based on [bloomfilter-rb]'s native implementation, but differs in the following ways:
|
|
8
|
+
|
|
9
|
+
- uses DJB2 over CRC32 yielding better hash distribution
|
|
10
|
+
- improves performance for very large datasets
|
|
11
|
+
- avoids the need to supply a seed
|
|
12
|
+
- automatically calculates the bit size (m) and the number of hashes (k) when given a capacity and false-positive-rate
|
|
8
13
|
|
|
9
14
|
A [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter) is a space-efficient probabilistic data structure that is used to test whether an element is a member of a set. False positives are possible, but false negatives are not. Instead of using k different hash functions, this implementation uses a DJB2 hash with k seeds from the CRC table.
|
|
10
15
|
|
|
@@ -13,8 +18,6 @@ Performance of the Bloom filter depends on the following:
|
|
|
13
18
|
- size of the bit array
|
|
14
19
|
- number of hash functions
|
|
15
20
|
|
|
16
|
-
BloomFit is a fork of [bloomfilter-rb].
|
|
17
|
-
|
|
18
21
|
## Resources
|
|
19
22
|
|
|
20
23
|
- Background: [Bloom filter](http://en.wikipedia.org/wiki/Bloom_filter)
|
|
@@ -40,11 +43,11 @@ bf["bird"] = "bar"
|
|
|
40
43
|
bf["bird"] # => true
|
|
41
44
|
bf["mouse"] # => false
|
|
42
45
|
|
|
43
|
-
bf.stats
|
|
44
|
-
#
|
|
45
|
-
#
|
|
46
|
-
#
|
|
47
|
-
#
|
|
46
|
+
puts bf.stats
|
|
47
|
+
# Number of filter buckets (m): 3600
|
|
48
|
+
# Number of set bits (n): 20
|
|
49
|
+
# Number of filter hashes (k): 10
|
|
50
|
+
# Predicted false positive rate: 0.00%
|
|
48
51
|
```
|
|
49
52
|
|
|
50
53
|
If you'd like more control over the traditional inputs like bit size and the number of hashes:
|
|
@@ -62,11 +65,11 @@ bf["bird"] = "bar"
|
|
|
62
65
|
bf["bird"] # => true
|
|
63
66
|
bf["mouse"] # => false
|
|
64
67
|
|
|
65
|
-
bf.stats
|
|
66
|
-
#
|
|
67
|
-
#
|
|
68
|
-
#
|
|
69
|
-
#
|
|
68
|
+
puts bf.stats
|
|
69
|
+
# Number of filter buckets (m): 100
|
|
70
|
+
# Number of set bits (n): 4
|
|
71
|
+
# Number of filter hashes (k): 2
|
|
72
|
+
# Predicted false positive rate: 10.87%
|
|
70
73
|
```
|
|
71
74
|
|
|
72
75
|
## Credits
|
|
@@ -17,8 +17,7 @@ static unsigned int *salts = crc_table;
|
|
|
17
17
|
static VALUE cBloomFilter;
|
|
18
18
|
|
|
19
19
|
struct BloomFilter {
|
|
20
|
-
int m; /* # of
|
|
21
|
-
int b; /* # of bits in a bloom filter bucket */
|
|
20
|
+
int m; /* # of bits in a bloom filter */
|
|
22
21
|
int k; /* # of hash functions */
|
|
23
22
|
unsigned char *ptr; /* bits data */
|
|
24
23
|
int bytes; /* size of byte data */
|
|
@@ -72,7 +71,6 @@ static VALUE bf_alloc(VALUE klass) {
|
|
|
72
71
|
VALUE obj = TypedData_Make_Struct(klass, struct BloomFilter, &bf_type, bf);
|
|
73
72
|
|
|
74
73
|
bf->m = 0;
|
|
75
|
-
bf->b = 0;
|
|
76
74
|
bf->k = 0;
|
|
77
75
|
bf->ptr = NULL;
|
|
78
76
|
bf->bytes = 0;
|
|
@@ -80,52 +78,24 @@ static VALUE bf_alloc(VALUE klass) {
|
|
|
80
78
|
return obj;
|
|
81
79
|
}
|
|
82
80
|
|
|
83
|
-
void
|
|
84
|
-
int byte_offset =
|
|
85
|
-
int bit_offset =
|
|
86
|
-
unsigned int c = bf->ptr[byte_offset];
|
|
87
|
-
c += bf->ptr[byte_offset + 1] << 8;
|
|
88
|
-
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
|
89
|
-
if ((c & mask) == 0) {
|
|
90
|
-
// do nothing
|
|
91
|
-
} else {
|
|
92
|
-
// reduce the counter: 11 00 => 10 00 (suppose bf->b is 2)
|
|
93
|
-
c -= (1 << bit_offset) & ((1 << 8) -1);
|
|
94
|
-
// shift the bitmap right by 1 bit: 10 00 => 01 00
|
|
95
|
-
c = (~mask & c) | ((c & mask) >> (bit_offset + 1) << bit_offset);
|
|
96
|
-
|
|
97
|
-
bf->ptr[byte_offset] = c & ((1 << 8) - 1);
|
|
98
|
-
bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
|
|
99
|
-
}
|
|
100
|
-
}
|
|
81
|
+
static void bucket_set(struct BloomFilter *bf, int index) {
|
|
82
|
+
int byte_offset = index / 8;
|
|
83
|
+
int bit_offset = index % 8;
|
|
101
84
|
|
|
102
|
-
|
|
103
|
-
int byte_offset = (index * bf->b) / 8;
|
|
104
|
-
int bit_offset = (index * bf->b) % 8;
|
|
105
|
-
unsigned int c = bf->ptr[byte_offset];
|
|
106
|
-
c += bf->ptr[byte_offset + 1] << 8;
|
|
107
|
-
unsigned int mask = ((1 << bf->b) - 1) << bit_offset;
|
|
108
|
-
if ((c & mask) != mask) {
|
|
109
|
-
c = c + ((1 << bit_offset) & ((1 << 8) -1)) | c;
|
|
110
|
-
bf->ptr[byte_offset] = c & ((1 << 8) - 1);
|
|
111
|
-
bf->ptr[byte_offset + 1] = (c & ((1 << 16) - 1)) >> 8;
|
|
112
|
-
}
|
|
85
|
+
bf->ptr[byte_offset] |= (unsigned char) (1U << bit_offset);
|
|
113
86
|
}
|
|
114
87
|
|
|
115
|
-
int bucket_check(struct BloomFilter *bf, int index) {
|
|
116
|
-
int byte_offset =
|
|
117
|
-
int bit_offset =
|
|
118
|
-
unsigned int c = bf->ptr[byte_offset];
|
|
119
|
-
c += bf->ptr[byte_offset + 1] << 8;
|
|
88
|
+
static int bucket_check(struct BloomFilter *bf, int index) {
|
|
89
|
+
int byte_offset = index / 8;
|
|
90
|
+
int bit_offset = index % 8;
|
|
120
91
|
|
|
121
|
-
|
|
122
|
-
return (c & mask) >> bit_offset;
|
|
92
|
+
return (bf->ptr[byte_offset] >> bit_offset) & 1;
|
|
123
93
|
}
|
|
124
94
|
|
|
125
95
|
static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
|
|
126
96
|
struct BloomFilter *bf;
|
|
127
97
|
VALUE arg1, arg2;
|
|
128
|
-
int m, k
|
|
98
|
+
int m, k;
|
|
129
99
|
|
|
130
100
|
bf = bf_ptr(self);
|
|
131
101
|
|
|
@@ -143,21 +113,20 @@ static VALUE bf_initialize(int argc, VALUE *argv, VALUE self) {
|
|
|
143
113
|
|
|
144
114
|
m = FIX2INT(arg1);
|
|
145
115
|
k = FIX2INT(arg2);
|
|
146
|
-
b = 1;
|
|
147
116
|
|
|
148
117
|
if (m < 1)
|
|
149
118
|
rb_raise(rb_eArgError, "array size");
|
|
150
119
|
if (k < 1)
|
|
151
120
|
rb_raise(rb_eArgError, "hash length");
|
|
152
121
|
|
|
153
|
-
bf->b = b;
|
|
154
122
|
bf->m = m;
|
|
155
123
|
bf->k = k;
|
|
156
124
|
|
|
157
125
|
ruby_xfree(bf->ptr);
|
|
158
126
|
bf->ptr = NULL;
|
|
159
127
|
bf->bytes = 0;
|
|
160
|
-
|
|
128
|
+
/* Preserve the existing serialized bitmap length, including one padding byte. */
|
|
129
|
+
bf->bytes = (m + 15) / 8;
|
|
161
130
|
bf->ptr = ALLOC_N(unsigned char, bf->bytes);
|
|
162
131
|
|
|
163
132
|
/* initialize the bits with zeros */
|
|
@@ -194,7 +163,7 @@ static VALUE bf_set_bits(VALUE self){
|
|
|
194
163
|
return INT2FIX(count);
|
|
195
164
|
}
|
|
196
165
|
|
|
197
|
-
static VALUE
|
|
166
|
+
static VALUE bf_add(VALUE self, VALUE key) {
|
|
198
167
|
VALUE skey;
|
|
199
168
|
unsigned long hash;
|
|
200
169
|
int index;
|
|
@@ -268,57 +237,34 @@ static VALUE bf_or(VALUE self, VALUE other) {
|
|
|
268
237
|
return obj;
|
|
269
238
|
}
|
|
270
239
|
|
|
271
|
-
static VALUE bf_include(
|
|
240
|
+
static VALUE bf_include(VALUE self, VALUE key) {
|
|
241
|
+
VALUE skey;
|
|
272
242
|
unsigned long hash;
|
|
273
|
-
int i, len, m, k;
|
|
274
243
|
int index;
|
|
275
|
-
|
|
244
|
+
int i, len, m, k;
|
|
276
245
|
char *ckey;
|
|
277
|
-
|
|
278
|
-
struct BloomFilter *bf;
|
|
246
|
+
struct BloomFilter *bf = bf_ptr(self);
|
|
279
247
|
|
|
280
|
-
|
|
248
|
+
skey = rb_obj_as_string(key);
|
|
249
|
+
ckey = StringValuePtr(skey);
|
|
250
|
+
len = (int) (RSTRING_LEN(skey)); /* length of the string in bytes */
|
|
281
251
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
hash = (unsigned long) djb2(ckey, len);
|
|
294
|
-
for (i = 0; i <= k - 1; i++) {
|
|
295
|
-
index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
|
|
296
|
-
|
|
297
|
-
/* check the bit at the index */
|
|
298
|
-
if (!bucket_check(bf, index)) {
|
|
299
|
-
return Qfalse; /* i.e., it is a new entry ; escape the loop */
|
|
300
|
-
}
|
|
252
|
+
m = bf->m;
|
|
253
|
+
k = bf->k;
|
|
254
|
+
|
|
255
|
+
hash = (unsigned long) djb2(ckey, len);
|
|
256
|
+
for (i = 0; i <= k - 1; i++) {
|
|
257
|
+
index = (int) ((hash ^ salts[i]) % (unsigned int) (m));
|
|
258
|
+
|
|
259
|
+
/* check the bit at the index */
|
|
260
|
+
if (!bucket_check(bf, index)) {
|
|
261
|
+
return Qfalse; /* i.e., it is a new entry ; escape the loop */
|
|
301
262
|
}
|
|
302
263
|
}
|
|
303
264
|
|
|
304
265
|
return Qtrue;
|
|
305
266
|
}
|
|
306
267
|
|
|
307
|
-
static VALUE bf_to_s(VALUE self) {
|
|
308
|
-
struct BloomFilter *bf = bf_ptr(self);
|
|
309
|
-
unsigned char *ptr;
|
|
310
|
-
int i;
|
|
311
|
-
VALUE str;
|
|
312
|
-
|
|
313
|
-
str = rb_str_new(0, bf->m);
|
|
314
|
-
|
|
315
|
-
ptr = (unsigned char *) RSTRING_PTR(str);
|
|
316
|
-
for (i = 0; i < bf->m; i++)
|
|
317
|
-
*ptr++ = bucket_check(bf, i) ? '1' : '0';
|
|
318
|
-
|
|
319
|
-
return str;
|
|
320
|
-
}
|
|
321
|
-
|
|
322
268
|
static VALUE bf_bitmap(VALUE self) {
|
|
323
269
|
struct BloomFilter *bf = bf_ptr(self);
|
|
324
270
|
|
|
@@ -346,15 +292,13 @@ void Init_cbloomfilter(void) {
|
|
|
346
292
|
rb_define_method(cBloomFilter, "m", bf_m, 0);
|
|
347
293
|
rb_define_method(cBloomFilter, "k", bf_k, 0);
|
|
348
294
|
rb_define_method(cBloomFilter, "set_bits", bf_set_bits, 0);
|
|
349
|
-
|
|
350
|
-
rb_define_method(cBloomFilter, "
|
|
351
|
-
rb_define_method(cBloomFilter, "include?", bf_include, -1);
|
|
295
|
+
rb_define_method(cBloomFilter, "add", bf_add, 1);
|
|
296
|
+
rb_define_method(cBloomFilter, "include?", bf_include, 1);
|
|
352
297
|
rb_define_method(cBloomFilter, "clear", bf_clear, 0);
|
|
353
|
-
rb_define_method(cBloomFilter, "merge
|
|
298
|
+
rb_define_method(cBloomFilter, "merge", bf_merge, 1);
|
|
354
299
|
rb_define_method(cBloomFilter, "&", bf_and, 1);
|
|
355
300
|
rb_define_method(cBloomFilter, "|", bf_or, 1);
|
|
356
301
|
|
|
357
|
-
rb_define_method(cBloomFilter, "to_s", bf_to_s, 0);
|
|
358
302
|
rb_define_method(cBloomFilter, "bitmap", bf_bitmap, 0);
|
|
359
303
|
rb_define_method(cBloomFilter, "load", bf_load, 1);
|
|
360
304
|
|
data/lib/bloom_fit/version.rb
CHANGED
data/lib/bloom_fit.rb
CHANGED
|
@@ -1,65 +1,214 @@
|
|
|
1
|
+
require "forwardable"
|
|
2
|
+
|
|
1
3
|
require "cbloomfilter"
|
|
4
|
+
require "bloom_fit/configuration_mismatch"
|
|
2
5
|
require "bloom_fit/version"
|
|
3
6
|
|
|
7
|
+
# BloomFit is an in-memory Bloom filter with a small, Set-like API.
|
|
8
|
+
#
|
|
9
|
+
# Bloom filters are probabilistic membership structures: they can report false
|
|
10
|
+
# positives, but they do not report false negatives for values that have been
|
|
11
|
+
# added. That makes BloomFit useful for cheaply ruling out missing values
|
|
12
|
+
# before doing more expensive work, while keeping memory usage low.
|
|
13
|
+
#
|
|
14
|
+
# The class wraps the native +CBloomFilter+ implementation in Ruby-friendly
|
|
15
|
+
# methods such as +add+, +include?+, +merge+, +&+, and +|+. Instances can be
|
|
16
|
+
# serialized with +save+ and reloaded with +BloomFit.load+.
|
|
17
|
+
#
|
|
18
|
+
# Filters can only be combined when they were created with the same +size+ and
|
|
19
|
+
# +hashes+ values; otherwise +BloomFit::ConfigurationMismatch+ is raised.
|
|
20
|
+
#
|
|
21
|
+
# filter = BloomFit.new(size: 10_000, hashes: 6)
|
|
22
|
+
# filter.add("cat")
|
|
23
|
+
# filter.include?("cat") # => true
|
|
24
|
+
# filter.include?("dog") # => false
|
|
25
|
+
#
|
|
26
|
+
# Choose +size+ and +hashes+ based on the expected number of inserts and the
|
|
27
|
+
# false-positive rate you can tolerate.
|
|
4
28
|
class BloomFit
|
|
5
|
-
|
|
6
|
-
end
|
|
29
|
+
extend Forwardable
|
|
7
30
|
|
|
31
|
+
# The wrapped native +CBloomFilter+ instance.
|
|
32
|
+
#
|
|
33
|
+
# This is mostly useful for low-level integrations and internal filter
|
|
34
|
+
# operations such as merge, union, and intersection.
|
|
8
35
|
attr_reader :bf
|
|
9
36
|
|
|
37
|
+
# Creates an empty Bloom filter.
|
|
38
|
+
#
|
|
39
|
+
# The defaults are a reasonable starting point for small in-memory filters,
|
|
40
|
+
# but the best values depend on how many keys you expect to insert and how
|
|
41
|
+
# many false positives you can tolerate.
|
|
42
|
+
#
|
|
43
|
+
# @param size [Integer] number of buckets in a bloom filter
|
|
44
|
+
# @param hashes [Integer] number of hash functions
|
|
10
45
|
def initialize(size: 1_000, hashes: 4)
|
|
11
|
-
@
|
|
12
|
-
|
|
46
|
+
@bf = CBloomFilter.new(size, hashes)
|
|
47
|
+
end
|
|
13
48
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
49
|
+
# :method: m
|
|
50
|
+
#
|
|
51
|
+
# Returns the configured filter width.
|
|
52
|
+
|
|
53
|
+
# :method: k
|
|
54
|
+
#
|
|
55
|
+
# Returns the number of hash functions applied to each key.
|
|
56
|
+
|
|
57
|
+
# :method: bitmap
|
|
58
|
+
#
|
|
59
|
+
# Returns the raw bitmap as a binary string.
|
|
60
|
+
#
|
|
61
|
+
# The returned bytes reflect the native representation, so the string may
|
|
62
|
+
# include padding beyond the configured filter size.
|
|
63
|
+
|
|
64
|
+
# :method: include?
|
|
65
|
+
#
|
|
66
|
+
# Returns +true+ when +key+ may be present and +false+ when it is definitely
|
|
67
|
+
# absent.
|
|
68
|
+
#
|
|
69
|
+
# Positive results are probabilistic and may be false positives.
|
|
70
|
+
|
|
71
|
+
# :method: clear
|
|
72
|
+
#
|
|
73
|
+
# Clears the filter by resetting all bits to +0+.
|
|
74
|
+
|
|
75
|
+
# :method: set_bits
|
|
76
|
+
#
|
|
77
|
+
# Returns the number of bits currently set to +1+.
|
|
78
|
+
|
|
79
|
+
def_delegators :@bf, :m, :k, :bitmap, :include?, :clear, :set_bits
|
|
80
|
+
|
|
81
|
+
# Returns the configured filter width.
|
|
82
|
+
alias size m
|
|
83
|
+
# Returns the number of hash functions used for each inserted key.
|
|
84
|
+
alias hashes k
|
|
85
|
+
alias key? include?
|
|
86
|
+
alias [] include?
|
|
87
|
+
alias n set_bits
|
|
88
|
+
|
|
89
|
+
# Returns +true+ when no bits are set.
|
|
90
|
+
#
|
|
91
|
+
# This is an exact check on the filter state, unlike +include?+, which is
|
|
92
|
+
# probabilistic for positive matches.
|
|
93
|
+
def empty?
|
|
94
|
+
set_bits.zero?
|
|
17
95
|
end
|
|
18
96
|
|
|
19
|
-
|
|
20
|
-
|
|
97
|
+
# Adds +key+ to the filter and returns +self+.
|
|
98
|
+
#
|
|
99
|
+
# This mimics the behavior of Set#add and allows chaining with #<<.
|
|
100
|
+
def add(key)
|
|
101
|
+
@bf.add(key)
|
|
102
|
+
self
|
|
103
|
+
end
|
|
104
|
+
alias << add
|
|
105
|
+
|
|
106
|
+
# Adds +key+ to the filter when +value+ is truthy.
|
|
107
|
+
#
|
|
108
|
+
# This makes BloomFit behave like a write-only membership hash: truthy values
|
|
109
|
+
# add the key, while +false+ and +nil+ are ignored.
|
|
110
|
+
def []=(key, value)
|
|
111
|
+
@bf.add(key) if value
|
|
21
112
|
end
|
|
22
|
-
alias []= insert
|
|
23
113
|
|
|
24
|
-
|
|
25
|
-
|
|
114
|
+
# Adds +key+ only if it does not already appear to be present.
|
|
115
|
+
#
|
|
116
|
+
# Returns +self+ when the key is added and +nil+ when +include?+ is already
|
|
117
|
+
# true. This mimics Set#add?.
|
|
118
|
+
#
|
|
119
|
+
# Because Bloom filters can return false positives, +add?+ may occasionally
|
|
120
|
+
# return +nil+ for a key that has not actually been inserted before.
|
|
121
|
+
def add?(key)
|
|
122
|
+
return nil if include?(key) # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
|
|
123
|
+
add(key)
|
|
26
124
|
end
|
|
27
|
-
alias key? include?
|
|
28
|
-
alias [] include?
|
|
29
125
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
126
|
+
# Returns the bitmap as a hexadecimal string.
|
|
127
|
+
#
|
|
128
|
+
# This is useful for debugging, logging, or comparing filter state in a more
|
|
129
|
+
# compact form than +to_binary+.
|
|
130
|
+
def to_hex
|
|
131
|
+
length = ((size / 8.0).ceil * 8 / 4)
|
|
132
|
+
bitmap.unpack1("H*")[0...length]
|
|
133
|
+
end
|
|
33
134
|
|
|
34
|
-
# Returns the
|
|
35
|
-
|
|
36
|
-
|
|
135
|
+
# Returns the bitmap as a binary string of +0+ and +1+ characters.
|
|
136
|
+
#
|
|
137
|
+
# The output is truncated to the configured filter width, so it omits any
|
|
138
|
+
# trailing padding present in the native bitmap.
|
|
139
|
+
def to_binary
|
|
140
|
+
bitmap.unpack1("B*")[0...size]
|
|
37
141
|
end
|
|
38
142
|
|
|
39
|
-
#
|
|
40
|
-
#
|
|
41
|
-
#
|
|
143
|
+
# Merges another filter or collection of keys into this filter.
|
|
144
|
+
#
|
|
145
|
+
# When +other+ is a +BloomFit+, the merge is performed bitwise and both
|
|
146
|
+
# filters must have the same +size+ and +hashes+ values. When +other+
|
|
147
|
+
# behaves like a hash, only keys with truthy values are added. Any other
|
|
148
|
+
# enumerable is treated as a list of keys.
|
|
149
|
+
#
|
|
150
|
+
# This method mutates the receiver and mimics Set#merge.
|
|
151
|
+
def merge(other)
|
|
152
|
+
if other.is_a?(BloomFit)
|
|
153
|
+
raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
|
|
154
|
+
@bf.merge(other.bf)
|
|
155
|
+
elsif other.respond_to?(:each_key)
|
|
156
|
+
other.each { |k, v| add(k) if v }
|
|
157
|
+
elsif other.is_a?(Enumerable)
|
|
158
|
+
other.each { |k| add(k) }
|
|
159
|
+
else
|
|
160
|
+
raise ArgumentError, "value must be enumerable or another BloomFit filter"
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
# Returns a new filter containing the bitwise intersection of two filters.
|
|
165
|
+
#
|
|
166
|
+
# Both filters must have the same +size+ and +hashes+ values or
|
|
167
|
+
# +BloomFit::ConfigurationMismatch+ is raised.
|
|
168
|
+
#
|
|
169
|
+
# Like all Bloom filter operations, membership checks on the result remain
|
|
170
|
+
# probabilistic and may still produce false positives.
|
|
42
171
|
def &(other)
|
|
43
172
|
raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
173
|
+
self.class.new(size:, hashes:).tap do |result|
|
|
174
|
+
result.instance_variable_set(:@bf, @bf.&(other.bf))
|
|
175
|
+
end
|
|
47
176
|
end
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
#
|
|
51
|
-
#
|
|
177
|
+
alias intersection &
|
|
178
|
+
|
|
179
|
+
# Returns a new filter containing the bitwise union of two filters.
|
|
180
|
+
#
|
|
181
|
+
# Both filters must have the same +size+ and +hashes+ values or
|
|
182
|
+
# +BloomFit::ConfigurationMismatch+ is raised.
|
|
183
|
+
#
|
|
184
|
+
# The receiver and +other+ are left unchanged.
|
|
52
185
|
def |(other)
|
|
53
186
|
raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
187
|
+
self.class.new(size:, hashes:).tap do |result|
|
|
188
|
+
result.instance_variable_set(:@bf, @bf.|(other.bf))
|
|
189
|
+
end
|
|
57
190
|
end
|
|
191
|
+
alias union |
|
|
192
|
+
|
|
193
|
+
# Returns a human-readable summary of the filter's current state.
|
|
194
|
+
#
|
|
195
|
+
# The report includes the configured width (+m+), the current number of set
|
|
196
|
+
# bits (+n+), the hash count (+k+), and the predicted false-positive rate
|
|
197
|
+
# based on the current fill level.
|
|
198
|
+
def stats
|
|
199
|
+
fpr = ((1.0 - Math.exp(-(k * n).to_f / m))**k) * 100
|
|
58
200
|
|
|
59
|
-
|
|
60
|
-
|
|
201
|
+
(+"").tap do |s|
|
|
202
|
+
s << format("Number of filter buckets (m): %d\n", m)
|
|
203
|
+
s << format("Number of set bits (n): %d\n", n)
|
|
204
|
+
s << format("Number of filter hashes (k): %d\n", k)
|
|
205
|
+
s << format("Predicted false positive rate: %.2f%%\n", fpr)
|
|
206
|
+
end
|
|
61
207
|
end
|
|
62
208
|
|
|
209
|
+
# Rebuilds the filter from the serialized data returned by +marshal_dump+.
|
|
210
|
+
#
|
|
211
|
+
# This hook is used by Ruby's +Marshal+ support.
|
|
63
212
|
def marshal_load(ary)
|
|
64
213
|
size, hashes, bitmap = *ary
|
|
65
214
|
|
|
@@ -67,32 +216,29 @@ class BloomFit
|
|
|
67
216
|
@bf.load(bitmap) if bitmap
|
|
68
217
|
end
|
|
69
218
|
|
|
219
|
+
# Returns the data Ruby's +Marshal+ uses to serialize this filter.
|
|
70
220
|
def marshal_dump
|
|
71
|
-
[
|
|
221
|
+
[size, hashes, bitmap]
|
|
72
222
|
end
|
|
73
223
|
|
|
224
|
+
# Loads a filter from a file previously written by +save+.
|
|
225
|
+
#
|
|
226
|
+
# The file is read using Ruby's +Marshal+ format, so it should only be used
|
|
227
|
+
# with trusted input.
|
|
74
228
|
def self.load(filename)
|
|
75
|
-
Marshal.load(File.open(filename, "r"))
|
|
229
|
+
Marshal.load(File.open(filename, "r")) # rubocop:disable Security/MarshalLoad
|
|
76
230
|
end
|
|
77
231
|
|
|
232
|
+
# Writes the filter to +filename+ using Ruby's +Marshal+ format.
|
|
78
233
|
def save(filename)
|
|
79
234
|
File.open(filename, "w") do |f|
|
|
80
235
|
f << Marshal.dump(self)
|
|
81
236
|
end
|
|
82
237
|
end
|
|
83
238
|
|
|
84
|
-
def stats
|
|
85
|
-
fp = ((1.0 - Math.exp(-(@hashes * size).to_f / @size))**@hashes) * 100
|
|
86
|
-
printf "Number of filter buckets (m): %d\n", @size
|
|
87
|
-
printf "Number of set bits (n): %d\n", set_bits
|
|
88
|
-
printf "Number of filter hashes (k) : %d\n", @hashes
|
|
89
|
-
printf "Predicted false positive rate = %.2f%%\n", fp
|
|
90
|
-
end
|
|
91
|
-
|
|
92
239
|
protected
|
|
93
240
|
|
|
94
|
-
# Returns true
|
|
95
|
-
# the same.
|
|
241
|
+
# Returns +true+ when +other+ has the same +size+ and +hashes+ values.
|
|
96
242
|
def same_parameters?(other)
|
|
97
243
|
bf.m == other.bf.m && bf.k == other.bf.k
|
|
98
244
|
end
|
data/lib/cbloomfilter.bundle
CHANGED
|
Binary file
|
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
require "test_helper"
|
|
2
|
+
|
|
3
|
+
class BloomFitTest < Minitest::Spec
|
|
4
|
+
subject { BloomFit.new(size: 100, hashes: 4) }
|
|
5
|
+
|
|
6
|
+
describe "#empty?" do
|
|
7
|
+
it "returns true when nothing set" do
|
|
8
|
+
assert_equal true, subject.empty? # rubocop:disable Minitest/AssertTruthy
|
|
9
|
+
assert_empty subject
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
it "returns false when something set" do
|
|
13
|
+
subject << "key"
|
|
14
|
+
assert_equal false, subject.empty? # rubocop:disable Minitest/RefuteFalse
|
|
15
|
+
refute_empty subject
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
describe "#add" do
|
|
20
|
+
it "adds the key and returns self" do
|
|
21
|
+
assert_equal subject, subject.add("test1")
|
|
22
|
+
assert_equal subject, subject.add("test2")
|
|
23
|
+
assert_includes subject, "test1"
|
|
24
|
+
assert_includes subject, "test2"
|
|
25
|
+
end
|
|
26
|
+
|
|
27
|
+
it "is aliased as #<<" do
|
|
28
|
+
subject << "test1" << "test2"
|
|
29
|
+
assert_includes subject, "test1"
|
|
30
|
+
assert_includes subject, "test2"
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it "is aliased as #[]=, and handles truthy/falsey values" do
|
|
34
|
+
subject["dog"] = :bar
|
|
35
|
+
subject["cat"] = :foo
|
|
36
|
+
assert_includes subject, "dog"
|
|
37
|
+
assert_includes subject, "cat"
|
|
38
|
+
|
|
39
|
+
subject["bat"] = nil
|
|
40
|
+
subject["pig"] = false
|
|
41
|
+
refute_includes subject, "bat"
|
|
42
|
+
refute_includes subject, "pig"
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
it "casts using #to_s as necessary" do
|
|
46
|
+
subject << :symbol << true << 12_345
|
|
47
|
+
|
|
48
|
+
assert_includes subject, "symbol"
|
|
49
|
+
assert_includes subject, :symbol
|
|
50
|
+
assert_includes subject, "true"
|
|
51
|
+
assert_includes subject, "12345"
|
|
52
|
+
assert_includes subject, 12_345
|
|
53
|
+
end
|
|
54
|
+
end
|
|
55
|
+
|
|
56
|
+
describe "#add?" do
|
|
57
|
+
it "adds new key and returns self" do
|
|
58
|
+
assert_equal subject, subject.add("test1")
|
|
59
|
+
assert_equal subject, subject.add("test2")
|
|
60
|
+
assert_includes subject, "test1"
|
|
61
|
+
assert_includes subject, "test2"
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
it "return nil if the key already exists" do
|
|
65
|
+
subject << "test1"
|
|
66
|
+
subject << "test2"
|
|
67
|
+
assert_includes subject, "test1"
|
|
68
|
+
assert_includes subject, "test2"
|
|
69
|
+
assert_nil subject.add?("test1")
|
|
70
|
+
assert_nil subject.add?("test2")
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
74
|
+
describe "#include?" do
|
|
75
|
+
it "returns true when a key is in the set" do
|
|
76
|
+
subject << "test1"
|
|
77
|
+
subject << "test2"
|
|
78
|
+
assert_equal true, subject.include?("test1") # rubocop:disable Minitest/AssertTruthy
|
|
79
|
+
assert_equal true, subject.include?("test2") # rubocop:disable Minitest/AssertTruthy
|
|
80
|
+
end
|
|
81
|
+
|
|
82
|
+
it "returns false when a key is not in the set" do
|
|
83
|
+
assert_equal false, subject.include?("test") # rubocop:disable Minitest/RefuteFalse
|
|
84
|
+
assert_equal false, subject.include?("nada") # rubocop:disable Minitest/RefuteFalse
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
it "is aliased as #key?" do
|
|
88
|
+
subject << "test1"
|
|
89
|
+
subject << "test2"
|
|
90
|
+
assert subject.key?("test1")
|
|
91
|
+
assert subject.key?("test2")
|
|
92
|
+
refute subject.key?("test3")
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
it "is aliased as #[]" do
|
|
96
|
+
subject << "test1"
|
|
97
|
+
subject << "test2"
|
|
98
|
+
assert subject["test1"]
|
|
99
|
+
assert subject["test2"]
|
|
100
|
+
refute subject["test3"]
|
|
101
|
+
end
|
|
102
|
+
end
|
|
103
|
+
|
|
104
|
+
describe "#clear" do
|
|
105
|
+
it "zeroes the bits" do
|
|
106
|
+
subject.add("test")
|
|
107
|
+
assert_includes subject, "test"
|
|
108
|
+
assert_includes subject.to_binary, "1"
|
|
109
|
+
subject.clear
|
|
110
|
+
refute_includes subject, "test"
|
|
111
|
+
refute_includes subject.to_binary, "1"
|
|
112
|
+
end
|
|
113
|
+
end
|
|
114
|
+
|
|
115
|
+
describe "#set_bits" do
|
|
116
|
+
it "returns the number of bits set to 1" do
|
|
117
|
+
bf = BloomFit.new(size: 100, hashes: 4)
|
|
118
|
+
bf.add("bits")
|
|
119
|
+
assert_equal 4, bf.set_bits
|
|
120
|
+
|
|
121
|
+
bf = BloomFit.new(size: 100, hashes: 1)
|
|
122
|
+
bf.add("bits")
|
|
123
|
+
assert_equal 1, bf.set_bits
|
|
124
|
+
end
|
|
125
|
+
end
|
|
126
|
+
|
|
127
|
+
describe "#bitmap" do
|
|
128
|
+
it "returns a binary bitmap of all zeros when empty (including a terminating byte)" do
|
|
129
|
+
bf = BloomFit.new(size: 16)
|
|
130
|
+
assert_equal "\x00\x00\x00".b, bf.bitmap
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
it "returns a binary bitmap representing the set" do
|
|
134
|
+
bf = BloomFit.new(size: 16, hashes: 4)
|
|
135
|
+
bf.add("something")
|
|
136
|
+
assert_equal "(\x82\x00".b, bf.bitmap
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
it "returns a binary bitmap representing the set even if not a multiple of 8 bits" do
|
|
140
|
+
bf = BloomFit.new(size: 20, hashes: 4)
|
|
141
|
+
bf.add("wow")
|
|
142
|
+
assert_equal "\x04\x14\x00\x00".b, bf.bitmap
|
|
143
|
+
end
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
describe "#to_hex" do
|
|
147
|
+
it "returns a hex bitmap of all zeros when empty" do
|
|
148
|
+
bf = BloomFit.new(size: 16)
|
|
149
|
+
assert_equal "0000", bf.to_hex
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
it "returns a hex bitmap of all zeros when empty if not a multiple of 8 bits" do
|
|
153
|
+
bf = BloomFit.new(size: 18)
|
|
154
|
+
assert_equal "000000", bf.to_hex
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
it "returns a hex bitmap representing the set" do
|
|
158
|
+
bf = BloomFit.new(size: 16, hashes: 4)
|
|
159
|
+
bf.add("cool")
|
|
160
|
+
assert_equal "1441", bf.to_hex
|
|
161
|
+
end
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
describe "#to_binary" do
|
|
165
|
+
it "returns a binary bitmap of all zeros when empty" do
|
|
166
|
+
bf = BloomFit.new(size: 16)
|
|
167
|
+
assert_equal "0000000000000000", bf.to_binary
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
it "returns a binary bitmap of all zeros when empty if not a multiple of 8 bits" do
|
|
171
|
+
bf = BloomFit.new(size: 19)
|
|
172
|
+
assert_equal "0000000000000000000", bf.to_binary
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
it "returns a binary bitmap representing the set" do
|
|
176
|
+
bf = BloomFit.new(size: 16, hashes: 4)
|
|
177
|
+
bf << "cool" << "cat"
|
|
178
|
+
assert_equal "1001011001101001", bf.to_binary
|
|
179
|
+
end
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
describe "#merge" do
|
|
183
|
+
it "merges another BloomFit filter" do
|
|
184
|
+
bf1 = BloomFit.new(size: 100, hashes: 2)
|
|
185
|
+
bf2 = BloomFit.new(size: 100, hashes: 2)
|
|
186
|
+
bf1 << "mouse"
|
|
187
|
+
bf2 << "cat" << "dog"
|
|
188
|
+
refute_includes bf1, "cat"
|
|
189
|
+
refute_includes bf1, "dog"
|
|
190
|
+
bf1.merge(bf2)
|
|
191
|
+
assert_includes bf1, "mouse"
|
|
192
|
+
assert_includes bf1, "cat"
|
|
193
|
+
assert_includes bf1, "dog"
|
|
194
|
+
refute_includes bf2, "mouse"
|
|
195
|
+
assert_includes bf2, "cat"
|
|
196
|
+
assert_includes bf2, "dog"
|
|
197
|
+
end
|
|
198
|
+
|
|
199
|
+
it "merges an array" do
|
|
200
|
+
subject << "mouse"
|
|
201
|
+
subject.merge %i[cat dog]
|
|
202
|
+
assert_includes subject, "mouse"
|
|
203
|
+
assert_includes subject, "cat"
|
|
204
|
+
assert_includes subject, "dog"
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
it "merges a set" do
|
|
208
|
+
subject << "mouse"
|
|
209
|
+
subject.merge Set.new(%w[cat dog])
|
|
210
|
+
assert_includes subject, "mouse"
|
|
211
|
+
assert_includes subject, "cat"
|
|
212
|
+
assert_includes subject, "dog"
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
it "merges a hash ignoring falsey values" do
|
|
216
|
+
subject << "mouse"
|
|
217
|
+
subject.merge({ cat: 1, dog: 2, ant: false, bug: nil })
|
|
218
|
+
assert_includes subject, "mouse"
|
|
219
|
+
assert_includes subject, "cat"
|
|
220
|
+
assert_includes subject, "dog"
|
|
221
|
+
refute_includes subject, "ant"
|
|
222
|
+
refute_includes subject, "bug"
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
it "raises when merge is between incompatible filters" do
|
|
226
|
+
bf1 = BloomFit.new(size: 10)
|
|
227
|
+
bf2 = BloomFit.new(size: 20)
|
|
228
|
+
assert_raises(BloomFit::ConfigurationMismatch) { bf1.merge(bf2) }
|
|
229
|
+
end
|
|
230
|
+
end
|
|
231
|
+
|
|
232
|
+
describe "#&" do
|
|
233
|
+
it "returns intersection of both filters" do
|
|
234
|
+
bf1 = BloomFit.new(size: 35, hashes: 4)
|
|
235
|
+
bf1.add("test")
|
|
236
|
+
bf1.add("test1")
|
|
237
|
+
|
|
238
|
+
bf2 = BloomFit.new(size: 35, hashes: 4)
|
|
239
|
+
bf2.add("test")
|
|
240
|
+
bf2.add("test2")
|
|
241
|
+
|
|
242
|
+
bf3 = bf1 & bf2
|
|
243
|
+
assert_equal 35, bf3.size
|
|
244
|
+
assert_equal 4, bf3.hashes
|
|
245
|
+
assert_includes bf3, "test"
|
|
246
|
+
refute_includes bf3, "test1"
|
|
247
|
+
refute_includes bf3, "test2"
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
it "is aliased as #intersection" do
|
|
251
|
+
bf1 = BloomFit.new(size: 20, hashes: 4)
|
|
252
|
+
bf1.add("test")
|
|
253
|
+
bf1.add("test1")
|
|
254
|
+
|
|
255
|
+
bf2 = BloomFit.new(size: 20, hashes: 4)
|
|
256
|
+
bf2.add("test")
|
|
257
|
+
|
|
258
|
+
bf3 = bf1.intersection(bf2)
|
|
259
|
+
assert_includes bf3, "test"
|
|
260
|
+
refute_includes bf3, "test1"
|
|
261
|
+
end
|
|
262
|
+
|
|
263
|
+
it "raises when intersection is between incompatible filters" do
|
|
264
|
+
bf1 = BloomFit.new(size: 10)
|
|
265
|
+
bf2 = BloomFit.new(size: 20)
|
|
266
|
+
assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
|
|
267
|
+
|
|
268
|
+
bf1 = BloomFit.new(size: 10, hashes: 2)
|
|
269
|
+
bf2 = BloomFit.new(size: 10, hashes: 4)
|
|
270
|
+
assert_raises(BloomFit::ConfigurationMismatch) { bf1 & bf2 }
|
|
271
|
+
end
|
|
272
|
+
end
|
|
273
|
+
|
|
274
|
+
describe "#|" do
|
|
275
|
+
it "returns union with other filter" do
|
|
276
|
+
bf1 = BloomFit.new
|
|
277
|
+
bf1.add("test")
|
|
278
|
+
bf1.add("test1")
|
|
279
|
+
|
|
280
|
+
bf2 = BloomFit.new
|
|
281
|
+
bf2.add("test")
|
|
282
|
+
bf2.add("test2")
|
|
283
|
+
|
|
284
|
+
bf3 = bf1 | bf2
|
|
285
|
+
assert_includes bf3, "test"
|
|
286
|
+
assert_includes bf3, "test1"
|
|
287
|
+
assert_includes bf3, "test2"
|
|
288
|
+
end
|
|
289
|
+
|
|
290
|
+
it "is aliased as #union" do
|
|
291
|
+
bf1 = BloomFit.new(size: 20, hashes: 4)
|
|
292
|
+
bf1.add("test")
|
|
293
|
+
bf1.add("test1")
|
|
294
|
+
|
|
295
|
+
bf2 = BloomFit.new(size: 20, hashes: 4)
|
|
296
|
+
bf2.add("test")
|
|
297
|
+
|
|
298
|
+
bf3 = bf1.union(bf2)
|
|
299
|
+
assert_includes bf3, "test"
|
|
300
|
+
assert_includes bf3, "test1"
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
it "raises when union is between incompatible filters" do
|
|
304
|
+
bf1 = BloomFit.new(size: 10)
|
|
305
|
+
bf2 = BloomFit.new(size: 20)
|
|
306
|
+
assert_raises(BloomFit::ConfigurationMismatch) { bf1 | bf2 }
|
|
307
|
+
end
|
|
308
|
+
end
|
|
309
|
+
|
|
310
|
+
describe "#stats" do
|
|
311
|
+
it "returns current stats" do
|
|
312
|
+
bf = BloomFit.new(size: 10, hashes: 3)
|
|
313
|
+
expected = <<~STATS
|
|
314
|
+
Number of filter buckets (m): 10
|
|
315
|
+
Number of set bits (n): 0
|
|
316
|
+
Number of filter hashes (k): 3
|
|
317
|
+
Predicted false positive rate: 0.00%
|
|
318
|
+
STATS
|
|
319
|
+
assert_equal expected, bf.stats
|
|
320
|
+
end
|
|
321
|
+
end
|
|
322
|
+
|
|
323
|
+
describe "serialization" do
|
|
324
|
+
after { File.unlink("bf.out") }
|
|
325
|
+
|
|
326
|
+
it "marshalls" do
|
|
327
|
+
bf = BloomFit.new
|
|
328
|
+
assert bf.save("bf.out")
|
|
329
|
+
end
|
|
330
|
+
|
|
331
|
+
it "loads from marshalled" do
|
|
332
|
+
subject.add("foo")
|
|
333
|
+
subject.add("bar")
|
|
334
|
+
subject.save("bf.out")
|
|
335
|
+
|
|
336
|
+
bf2 = BloomFit.load("bf.out")
|
|
337
|
+
assert_includes bf2, "foo"
|
|
338
|
+
assert_includes bf2, "bar"
|
|
339
|
+
refute_includes bf2, "baz"
|
|
340
|
+
|
|
341
|
+
assert subject.send(:same_parameters?, bf2)
|
|
342
|
+
end
|
|
343
|
+
end
|
|
344
|
+
end
|
data/test/test_helper.rb
ADDED
metadata
CHANGED
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: bloom_fit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.0
|
|
4
|
+
version: 0.3.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
|
-
- Ilya Grigorik
|
|
8
|
-
- Tatsuya Mori
|
|
9
7
|
- Ryan McGeary
|
|
10
8
|
- Beshad Talayeminaei
|
|
9
|
+
- Ilya Grigorik
|
|
10
|
+
- Tatsuya Mori
|
|
11
11
|
bindir: bin
|
|
12
12
|
cert_chain: []
|
|
13
13
|
date: 1980-01-02 00:00:00.000000000 Z
|
|
14
14
|
dependencies: []
|
|
15
15
|
email:
|
|
16
|
-
- ilya@grigorik.com
|
|
17
|
-
- valdzone@gmail.com
|
|
18
16
|
- ryan@mcgeary.org
|
|
19
17
|
- 'btalayeminaei@gmail.com '
|
|
18
|
+
- ilya@grigorik.com
|
|
19
|
+
- valdzone@gmail.com
|
|
20
20
|
executables: []
|
|
21
21
|
extensions:
|
|
22
22
|
- ext/cbloomfilter/extconf.rb
|
|
@@ -27,10 +27,11 @@ files:
|
|
|
27
27
|
- ext/cbloomfilter/crc32.h
|
|
28
28
|
- ext/cbloomfilter/extconf.rb
|
|
29
29
|
- lib/bloom_fit.rb
|
|
30
|
+
- lib/bloom_fit/configuration_mismatch.rb
|
|
30
31
|
- lib/bloom_fit/version.rb
|
|
31
32
|
- lib/cbloomfilter.bundle
|
|
32
|
-
- spec/bloom_fit_spec.rb
|
|
33
|
-
- spec/helper.rb
|
|
33
|
+
- test/bloom_fit_test.rb
|
|
34
|
+
- test/test_helper.rb
|
|
34
35
|
homepage: https://github.com/rmm5t/bloom_fit
|
|
35
36
|
licenses: []
|
|
36
37
|
metadata:
|
data/spec/bloom_fit_spec.rb
DELETED
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
require "helper"
|
|
2
|
-
|
|
3
|
-
describe BloomFit do
|
|
4
|
-
it "clears" do
|
|
5
|
-
bf = BloomFit.new(size: 100, hashes: 2)
|
|
6
|
-
bf.insert("test")
|
|
7
|
-
expect(bf.include?("test")).to be true
|
|
8
|
-
bf.clear
|
|
9
|
-
expect(bf.include?("test")).to be false
|
|
10
|
-
end
|
|
11
|
-
|
|
12
|
-
it "merges" do
|
|
13
|
-
bf1 = BloomFit.new(size: 100, hashes: 2)
|
|
14
|
-
bf2 = BloomFit.new(size: 100, hashes: 2)
|
|
15
|
-
bf2.insert("test")
|
|
16
|
-
expect(bf1.include?("test")).to be false
|
|
17
|
-
bf1.merge!(bf2)
|
|
18
|
-
expect(bf1.include?("test")).to be true
|
|
19
|
-
expect(bf2.include?("test")).to be true
|
|
20
|
-
end
|
|
21
|
-
|
|
22
|
-
it "tests set membership" do
|
|
23
|
-
bf = BloomFit.new(size: 100, hashes: 2)
|
|
24
|
-
bf.insert("test")
|
|
25
|
-
bf.insert("test1")
|
|
26
|
-
|
|
27
|
-
expect(bf.include?("test")).to be true
|
|
28
|
-
expect(bf.include?("abcd")).to be false
|
|
29
|
-
expect(bf.include?("test", "test1")).to be true
|
|
30
|
-
expect(bf.include?("test1", "abcd")).to be false
|
|
31
|
-
end
|
|
32
|
-
|
|
33
|
-
it "works with any object's to_s" do
|
|
34
|
-
subject.insert(:test)
|
|
35
|
-
subject.insert(:test1)
|
|
36
|
-
subject.insert(12_345)
|
|
37
|
-
|
|
38
|
-
expect(subject.include?("test")).to be true
|
|
39
|
-
expect(subject.include?("abcd")).to be false
|
|
40
|
-
expect(subject.include?("12345")).to be true
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
it "returns the number of bits set to 1" do
|
|
44
|
-
bf = BloomFit.new(hashes: 4)
|
|
45
|
-
bf.insert("test")
|
|
46
|
-
expect(bf.set_bits).to eq 4
|
|
47
|
-
|
|
48
|
-
bf = BloomFit.new(hashes: 1)
|
|
49
|
-
bf.insert("test")
|
|
50
|
-
expect(bf.set_bits).to eq 1
|
|
51
|
-
end
|
|
52
|
-
|
|
53
|
-
it "returns intersection with other filter" do
|
|
54
|
-
bf1 = BloomFit.new
|
|
55
|
-
bf1.insert("test")
|
|
56
|
-
bf1.insert("test1")
|
|
57
|
-
|
|
58
|
-
bf2 = BloomFit.new
|
|
59
|
-
bf2.insert("test")
|
|
60
|
-
bf2.insert("test2")
|
|
61
|
-
|
|
62
|
-
bf3 = bf1 & bf2
|
|
63
|
-
expect(bf3.include?("test")).to be true
|
|
64
|
-
expect(bf3.include?("test1")).to be false
|
|
65
|
-
expect(bf3.include?("test2")).to be false
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
it "raises an exception when intersection is to be computed for incompatible filters" do
|
|
69
|
-
bf1 = BloomFit.new(size: 10)
|
|
70
|
-
bf1.insert("test")
|
|
71
|
-
|
|
72
|
-
bf2 = BloomFit.new(size: 20)
|
|
73
|
-
bf2.insert("test")
|
|
74
|
-
|
|
75
|
-
expect { bf1 & bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
it "returns union with other filter" do
|
|
79
|
-
bf1 = BloomFit.new
|
|
80
|
-
bf1.insert("test")
|
|
81
|
-
bf1.insert("test1")
|
|
82
|
-
|
|
83
|
-
bf2 = BloomFit.new
|
|
84
|
-
bf2.insert("test")
|
|
85
|
-
bf2.insert("test2")
|
|
86
|
-
|
|
87
|
-
bf3 = bf1 | bf2
|
|
88
|
-
expect(bf3.include?("test")).to be true
|
|
89
|
-
expect(bf3.include?("test1")).to be true
|
|
90
|
-
expect(bf3.include?("test2")).to be true
|
|
91
|
-
end
|
|
92
|
-
|
|
93
|
-
it "raises an exception when union is to be computed for incompatible filters" do
|
|
94
|
-
bf1 = BloomFit.new(size: 10)
|
|
95
|
-
bf1.insert("test")
|
|
96
|
-
|
|
97
|
-
bf2 = BloomFit.new(size: 20)
|
|
98
|
-
bf2.insert("test")
|
|
99
|
-
|
|
100
|
-
expect { bf1 | bf2 }.to raise_error(BloomFit::ConfigurationMismatch)
|
|
101
|
-
end
|
|
102
|
-
|
|
103
|
-
it "outputs current stats" do
|
|
104
|
-
subject.insert("test")
|
|
105
|
-
expect { subject.stats }.not_to raise_error
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
context "serialization" do
|
|
109
|
-
after { File.unlink("bf.out") }
|
|
110
|
-
|
|
111
|
-
it "marshalls" do
|
|
112
|
-
bf = BloomFit.new
|
|
113
|
-
expect { bf.save("bf.out") }.not_to raise_error
|
|
114
|
-
end
|
|
115
|
-
|
|
116
|
-
it "loads from marshalled" do
|
|
117
|
-
subject.insert("foo")
|
|
118
|
-
subject.insert("bar")
|
|
119
|
-
subject.save("bf.out")
|
|
120
|
-
|
|
121
|
-
bf2 = BloomFit.load("bf.out")
|
|
122
|
-
expect(bf2.include?("foo")).to be true
|
|
123
|
-
expect(bf2.include?("bar")).to be true
|
|
124
|
-
expect(bf2.include?("baz")).to be false
|
|
125
|
-
|
|
126
|
-
expect(subject.send(:same_parameters?, bf2)).to be true
|
|
127
|
-
end
|
|
128
|
-
end
|
|
129
|
-
end
|
data/spec/helper.rb
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
require "bloom_fit"
|