bloom_fit 0.3.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +220 -47
- data/ext/cbloomfilter/cbloomfilter.c +1 -1
- data/lib/bloom_fit/version.rb +1 -1
- data/lib/bloom_fit.rb +138 -19
- data/lib/cbloomfilter.bundle +0 -0
- data/test/bloom_fit_test.rb +22 -0
- data/test/c_bloom_filter_test.rb +158 -0
- metadata +2 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 54da887424b56d9c09e4d351125c22873bc24be3e32e96cf3716d044a0864957
|
|
4
|
+
data.tar.gz: 50780ab65355bc42c075586888f4f09ee6ce6849b16c01264d83887dc83f71a3
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 53511030706f900e42050938ff80eaaaa5290c609dcd40e6b809bed6c6d491fe63bc57d4d2c1e494c0081642f85e6e29e8c5bc46cbe9cc342d8700990d910043
|
|
7
|
+
data.tar.gz: f5da69e7acebde88b41649f6dfac9925e4f021c5fb7687f442a0b61b78efd1c30423f013851c2982048ef2f3c374b0e583cca7f92352475a31ca5cddfb67fd46
|
data/README.md
CHANGED
|
@@ -1,77 +1,250 @@
|
|
|
1
|
-
# BloomFit
|
|
1
|
+
# BloomFit
|
|
2
2
|
|
|
3
|
-
[](https://rubygems.org/gems/bloom_fit)
|
|
4
4
|
[](https://github.com/rmm5t/bloom_fit/actions/workflows/ci.yml)
|
|
5
5
|
[](https://rubygems.org/gems/bloom_fit)
|
|
6
6
|
|
|
7
|
-
BloomFit
|
|
7
|
+
BloomFit is an in-memory, non-counting Bloom filter for Ruby backed by a small C extension.
|
|
8
|
+
|
|
9
|
+
It gives you a compact, Set-like API for probabilistic membership checks:
|
|
10
|
+
|
|
11
|
+
- false positives are possible
|
|
12
|
+
- false negatives are not, as long as a value was added to the same filter
|
|
13
|
+
- individual values cannot be deleted safely because the filter is non-counting
|
|
14
|
+
|
|
15
|
+
BloomFit is heavily inspired by [bloomfilter-rb]'s native implementation and the original C implementation by Tatsuya Mori. This version uses a DJB2 hash with salts from the CRC table and wraps the native filter in a Ruby-friendly API. The most common way to use it is to pass an expected `capacity` and optional `false_positive_rate`, then let BloomFit calculate `size` and `hashes` for you.
|
|
16
|
+
|
|
17
|
+
Compared with bloomfilter-rb, BloomFit:
|
|
8
18
|
|
|
9
19
|
- uses DJB2 over CRC32 yielding better hash distribution
|
|
10
20
|
- improves performance for very large datasets
|
|
11
21
|
- avoids the need to supply a seed
|
|
12
|
-
- automatically calculates the
|
|
22
|
+
- automatically calculates the filter size (`m`) and hash count (`k`) from capacity and false-positive rate
|
|
13
23
|
|
|
14
|
-
|
|
24
|
+
## Features
|
|
15
25
|
|
|
16
|
-
|
|
26
|
+
- native `CBloomFilter` implementation for MRI Ruby
|
|
27
|
+
- automatic sizing from `capacity` and `false_positive_rate`
|
|
28
|
+
- small Ruby API with familiar methods like `add`, `include?`, `merge`, `|`, and `&`
|
|
29
|
+
- supports strings, symbols, integers, booleans, and other values that can be converted with `to_s`
|
|
30
|
+
- manual `size` / `hashes` overrides when you want control
|
|
31
|
+
- save and reload filters with Ruby `Marshal`
|
|
32
|
+
- inspect filter state with `stats`, `to_hex`, `to_binary`, and `bitmap`
|
|
17
33
|
|
|
18
|
-
|
|
19
|
-
- number of hash functions
|
|
34
|
+
## Requirements
|
|
20
35
|
|
|
21
|
-
|
|
36
|
+
- Ruby `>= 3.2.0`
|
|
22
37
|
|
|
23
|
-
|
|
24
|
-
- Determining parameters: [Scalable Datasets: Bloom Filters in Ruby](http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/)
|
|
25
|
-
- Applications & reasons behind bloom filter: [Flow analysis: Time based bloom filter](http://www.igvita.com/2010/01/06/flow-analysis-time-based-bloom-filters/)
|
|
38
|
+
## Installation
|
|
26
39
|
|
|
27
|
-
|
|
40
|
+
```bash
|
|
41
|
+
gem install bloom_fit
|
|
42
|
+
```
|
|
28
43
|
|
|
29
|
-
|
|
44
|
+
```ruby
|
|
45
|
+
require "bloom_fit"
|
|
46
|
+
```
|
|
30
47
|
|
|
31
|
-
|
|
48
|
+
## Quick Start
|
|
32
49
|
|
|
33
50
|
```ruby
|
|
34
51
|
require "bloom_fit"
|
|
35
52
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
#
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
#
|
|
49
|
-
#
|
|
50
|
-
|
|
53
|
+
filter = BloomFit.new(capacity: 250, false_positive_rate: 0.001)
|
|
54
|
+
|
|
55
|
+
filter.add("cat")
|
|
56
|
+
filter << :dog
|
|
57
|
+
|
|
58
|
+
filter.include?("cat") # => true
|
|
59
|
+
filter.key?("dog") # => true
|
|
60
|
+
filter["bird"] # => false
|
|
61
|
+
|
|
62
|
+
filter["owl"] = true
|
|
63
|
+
filter["ant"] = false
|
|
64
|
+
|
|
65
|
+
filter["owl"] # => true
|
|
66
|
+
filter["ant"] # => false
|
|
67
|
+
|
|
68
|
+
filter.empty? # => false
|
|
69
|
+
|
|
70
|
+
filter.size # => 3595
|
|
71
|
+
filter.hashes # => 10
|
|
72
|
+
|
|
73
|
+
filter.clear
|
|
74
|
+
filter.empty? # => true
|
|
51
75
|
```
|
|
52
76
|
|
|
53
|
-
|
|
77
|
+
`#include?`, `#key?`, and `#[]` are aliases. `#add` and `#<<` are also aliases.
|
|
78
|
+
|
|
79
|
+
## Automatic Sizing
|
|
80
|
+
|
|
81
|
+
BloomFit now calculates `size` and `hashes` for you when you initialize it with an expected capacity:
|
|
54
82
|
|
|
55
83
|
```ruby
|
|
56
|
-
|
|
84
|
+
filter = BloomFit.new(capacity: 10_000, false_positive_rate: 0.01)
|
|
85
|
+
|
|
86
|
+
filter.size # => 95851
|
|
87
|
+
filter.hashes # => 7
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
The defaults are a good starting point for many small filters:
|
|
91
|
+
|
|
92
|
+
```ruby
|
|
93
|
+
filter = BloomFit.new
|
|
94
|
+
|
|
95
|
+
filter.size # => 1438
|
|
96
|
+
filter.hashes # => 10
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
That is equivalent to:
|
|
100
|
+
|
|
101
|
+
```ruby
|
|
102
|
+
filter = BloomFit.new(capacity: 100, false_positive_rate: 0.001)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Internally BloomFit uses the standard Bloom filter formulas:
|
|
106
|
+
|
|
107
|
+
```text
|
|
108
|
+
m = -(n * ln(p)) / (ln(2)^2)
|
|
109
|
+
k = (m / n) * ln(2)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
- `n`: expected number of inserted values
|
|
113
|
+
- `p`: target false-positive rate
|
|
114
|
+
- `m`: number of filter buckets (`size`)
|
|
115
|
+
- `k`: number of hash functions (`hashes`)
|
|
116
|
+
|
|
117
|
+
For example, if you expect about `10_000` inserts and can tolerate a `1%` false-positive rate, BloomFit will calculate `size: 95_851` and `hashes: 7` for you.
|
|
118
|
+
|
|
119
|
+
If you prefer a calculator, see [Bloom Filter Calculator](https://hur.st/bloomfilter/).
|
|
120
|
+
|
|
121
|
+
## Manual Sizing
|
|
122
|
+
|
|
123
|
+
If you already know the exact filter width and hash count you want, you can still pass them directly:
|
|
124
|
+
|
|
125
|
+
```ruby
|
|
126
|
+
filter = BloomFit.new(size: 95_851, hashes: 7)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
This bypasses automatic sizing.
|
|
130
|
+
|
|
131
|
+
## Common Operations
|
|
57
132
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
133
|
+
### Add and check membership
|
|
134
|
+
|
|
135
|
+
```ruby
|
|
136
|
+
filter = BloomFit.new(capacity: 100)
|
|
137
|
+
|
|
138
|
+
filter << "cat"
|
|
139
|
+
filter << "dog"
|
|
140
|
+
|
|
141
|
+
filter.include?("cat") # => true
|
|
142
|
+
filter.include?("bird") # => false
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Use hash-like syntax for truthy values
|
|
146
|
+
|
|
147
|
+
```ruby
|
|
148
|
+
filter = BloomFit.new(capacity: 64)
|
|
149
|
+
|
|
150
|
+
filter[:cat] = true
|
|
151
|
+
filter[:dog] = false
|
|
152
|
+
|
|
153
|
+
filter[:cat] # => true
|
|
154
|
+
filter[:dog] # => false
|
|
155
|
+
|
|
156
|
+
filter.merge({ bird: true, ant: nil })
|
|
157
|
+
|
|
158
|
+
filter.include?(:bird) # => true
|
|
159
|
+
filter.include?(:ant) # => false
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
When merging a hash, only keys with truthy values are added.
|
|
163
|
+
|
|
164
|
+
### Merge, union, and intersection
|
|
165
|
+
|
|
166
|
+
```ruby
|
|
167
|
+
pets = BloomFit.new(capacity: 50)
|
|
168
|
+
pets << "cat" << "dog"
|
|
169
|
+
|
|
170
|
+
more_pets = BloomFit.new(capacity: 50)
|
|
171
|
+
more_pets << "dog" << "bird"
|
|
172
|
+
|
|
173
|
+
combined = pets | more_pets
|
|
174
|
+
overlap = pets & more_pets
|
|
175
|
+
|
|
176
|
+
combined.include?("bird") # => true
|
|
177
|
+
overlap.include?("dog") # => true
|
|
178
|
+
overlap.include?("cat") # => false
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
`#merge` also accepts arrays, sets, and other enumerables:
|
|
182
|
+
|
|
183
|
+
```ruby
|
|
184
|
+
filter = BloomFit.new(capacity: 100)
|
|
185
|
+
filter.merge(%w[cat dog bird])
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
Filters can only be combined when they have the same `size` and `hashes`. Otherwise BloomFit raises `BloomFit::ConfigurationMismatch`.
|
|
189
|
+
|
|
190
|
+
When you create filters with automatic sizing, use the same `capacity` and `false_positive_rate` for filters you plan to merge, union, or intersect.
|
|
191
|
+
|
|
192
|
+
### Save and load filters
|
|
193
|
+
|
|
194
|
+
```ruby
|
|
195
|
+
filter = BloomFit.new(capacity: 100)
|
|
196
|
+
filter << "cat" << "dog"
|
|
197
|
+
filter.save("pets.bloom")
|
|
198
|
+
|
|
199
|
+
reloaded = BloomFit.load("pets.bloom")
|
|
200
|
+
reloaded.include?("cat") # => true
|
|
201
|
+
reloaded.include?("dog") # => true
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
Persistence uses Ruby `Marshal`. Only load files you trust.
|
|
205
|
+
|
|
206
|
+
### Inspect the bitmap
|
|
207
|
+
|
|
208
|
+
```ruby
|
|
209
|
+
filter = BloomFit.new(size: 16, hashes: 4)
|
|
210
|
+
filter << "cool"
|
|
211
|
+
|
|
212
|
+
filter.to_hex # => "1441"
|
|
213
|
+
filter.to_binary # => "0001010001000001"
|
|
214
|
+
filter.bitmap # => raw bytes from the native filter
|
|
73
215
|
```
|
|
74
216
|
|
|
217
|
+
`#bitmap` returns the native byte representation, which may include padding bytes beyond the configured filter width. `#to_binary` trims the result to exactly `size` bits.
|
|
218
|
+
|
|
219
|
+
## API Overview
|
|
220
|
+
|
|
221
|
+
| Method | Notes |
|
|
222
|
+
| --- | --- |
|
|
223
|
+
| `BloomFit.new` or `BloomFit.new(capacity:, false_positive_rate:)` | Creates a filter and calculates `size` and `hashes` automatically. Defaults to `capacity: 100`, `false_positive_rate: 0.001`. |
|
|
224
|
+
| `BloomFit.new(size:, hashes:)` | Creates a filter with explicit sizing when you want fixed parameters. |
|
|
225
|
+
| `add`, `<<` | Adds a value and returns the filter. |
|
|
226
|
+
| `add?` | Adds only when the value does not already appear present. |
|
|
227
|
+
| `include?`, `key?`, `[]` | Probabilistic membership check. |
|
|
228
|
+
| `[]=` | Adds a key only when the assigned value is truthy. |
|
|
229
|
+
| `merge` | Merges another filter or an enumerable into the receiver. |
|
|
230
|
+
| `\|`, `union` | Returns a new filter containing the union. |
|
|
231
|
+
| `&`, `intersection` | Returns a new filter containing the intersection. |
|
|
232
|
+
| `clear` | Resets all bits to `0`. |
|
|
233
|
+
| `empty?` | Exact check for whether any bits are set. |
|
|
234
|
+
| `size`, `m` | Returns the configured filter width. |
|
|
235
|
+
| `hashes`, `k` | Returns the number of hash functions. |
|
|
236
|
+
| `set_bits`, `n` | Returns the number of bits currently set. |
|
|
237
|
+
| `stats` | Returns a human-readable summary including predicted false-positive rate. |
|
|
238
|
+
| `to_hex`, `to_binary`, `bitmap` | Returns the filter bitmap in different representations. |
|
|
239
|
+
| `save`, `BloomFit.load` | Serializes and restores a filter with Ruby `Marshal`. |
|
|
240
|
+
|
|
241
|
+
## Resources
|
|
242
|
+
|
|
243
|
+
- Background: [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter)
|
|
244
|
+
- Determining parameters: [Scalable Datasets: Bloom Filters in Ruby](http://www.igvita.com/2008/12/27/scalable-datasets-bloom-filters-in-ruby/)
|
|
245
|
+
- Applications and motivation: [Flow analysis: Time based bloom filter](http://www.igvita.com/2010/01/06/flow-analysis-time-based-bloom-filters/)
|
|
246
|
+
- Calculator: [Bloom Filter Calculator](https://hur.st/bloomfilter/)
|
|
247
|
+
|
|
75
248
|
## Credits
|
|
76
249
|
|
|
77
250
|
- Tatsuya Mori <valdzone@gmail.com> (Original C implementation)
|
data/lib/bloom_fit/version.rb
CHANGED
data/lib/bloom_fit.rb
CHANGED
|
@@ -4,63 +4,162 @@ require "cbloomfilter"
|
|
|
4
4
|
require "bloom_fit/configuration_mismatch"
|
|
5
5
|
require "bloom_fit/version"
|
|
6
6
|
|
|
7
|
+
# BloomFit is an in-memory Bloom filter with a small, Set-like API.
|
|
8
|
+
#
|
|
9
|
+
# Bloom filters are probabilistic membership structures: they can report false
|
|
10
|
+
# positives, but they do not report false negatives for values that have been
|
|
11
|
+
# added. That makes BloomFit useful for cheaply ruling out missing values
|
|
12
|
+
# before doing more expensive work, while keeping memory usage low.
|
|
13
|
+
#
|
|
14
|
+
# The class wraps the native +CBloomFilter+ implementation in Ruby-friendly
|
|
15
|
+
# methods such as +add+, +include?+, +merge+, +&+, and +|+. Instances can be
|
|
16
|
+
# serialized with +save+ and reloaded with +BloomFit.load+.
|
|
17
|
+
#
|
|
18
|
+
# Filters can only be combined when they were created with the same +size+ and
|
|
19
|
+
# +hashes+ values; otherwise +BloomFit::ConfigurationMismatch+ is raised.
|
|
20
|
+
#
|
|
21
|
+
# filter = BloomFit.new(size: 10_000, hashes: 6)
|
|
22
|
+
# filter.add("cat")
|
|
23
|
+
# filter.include?("cat") # => true
|
|
24
|
+
# filter.include?("dog") # => false
|
|
25
|
+
#
|
|
26
|
+
# Choose +size+ and +hashes+ based on the expected number of inserts and the
|
|
27
|
+
# false-positive rate you can tolerate.
|
|
7
28
|
class BloomFit
|
|
8
29
|
extend Forwardable
|
|
9
30
|
|
|
31
|
+
LN2 = Math.log(2.0).freeze
|
|
32
|
+
|
|
33
|
+
# The wrapped native +CBloomFilter+ instance.
|
|
34
|
+
#
|
|
35
|
+
# This is mostly useful for low-level integrations and internal filter
|
|
36
|
+
# operations such as merge, union, and intersection.
|
|
10
37
|
attr_reader :bf
|
|
11
38
|
|
|
39
|
+
# Creates an empty Bloom filter.
|
|
40
|
+
#
|
|
41
|
+
# The defaults are a reasonable starting point for small in-memory filters,
|
|
42
|
+
# but the best values depend on how many keys you expect to insert and how
|
|
43
|
+
# many false positives you can tolerate.
|
|
44
|
+
#
|
|
45
|
+
# @param capacity [Integer] expected number of elements to store in the set
|
|
46
|
+
# @param false_positive_rate [Float] target false-positive rate, strictly between 0 and 1
|
|
12
47
|
# @param size [Integer] number of buckets in a bloom filter
|
|
13
48
|
# @param hashes [Integer] number of hash functions
|
|
14
|
-
def initialize(size:
|
|
49
|
+
def initialize(capacity: 100, false_positive_rate: 0.001, size: nil, hashes: 4)
|
|
50
|
+
if size.nil? || hashes.nil?
|
|
51
|
+
raise ArgumentError, "capacity must be > 0" unless capacity.positive?
|
|
52
|
+
raise ArgumentError, "false_positive_rate must be between 0 and 1" if false_positive_rate <= 0.0 || false_positive_rate >= 1.0
|
|
53
|
+
|
|
54
|
+
size = (-capacity.to_f * Math.log(false_positive_rate) / (LN2**2)).ceil
|
|
55
|
+
hashes = (size / capacity * LN2).ceil
|
|
56
|
+
end
|
|
57
|
+
|
|
15
58
|
@bf = CBloomFilter.new(size, hashes)
|
|
16
59
|
end
|
|
17
60
|
|
|
61
|
+
# :method: m
|
|
62
|
+
#
|
|
63
|
+
# Returns the configured filter width.
|
|
64
|
+
|
|
65
|
+
# :method: k
|
|
66
|
+
#
|
|
67
|
+
# Returns the number of hash functions applied to each key.
|
|
68
|
+
|
|
69
|
+
# :method: bitmap
|
|
70
|
+
#
|
|
71
|
+
# Returns the raw bitmap as a binary string.
|
|
72
|
+
#
|
|
73
|
+
# The returned bytes reflect the native representation, so the string may
|
|
74
|
+
# include padding beyond the configured filter size.
|
|
75
|
+
|
|
76
|
+
# :method: include?
|
|
77
|
+
#
|
|
78
|
+
# Returns +true+ when +key+ may be present and +false+ when it is definitely
|
|
79
|
+
# absent.
|
|
80
|
+
#
|
|
81
|
+
# Positive results are probabilistic and may be false positives.
|
|
82
|
+
|
|
83
|
+
# :method: clear
|
|
84
|
+
#
|
|
85
|
+
# Clears the filter by resetting all bits to +0+.
|
|
86
|
+
|
|
87
|
+
# :method: set_bits
|
|
88
|
+
#
|
|
89
|
+
# Returns the number of bits currently set to +1+.
|
|
90
|
+
|
|
18
91
|
def_delegators :@bf, :m, :k, :bitmap, :include?, :clear, :set_bits
|
|
19
92
|
|
|
93
|
+
# Returns the configured filter width.
|
|
20
94
|
alias size m
|
|
95
|
+
# Returns the number of hash functions used for each inserted key.
|
|
21
96
|
alias hashes k
|
|
22
97
|
alias key? include?
|
|
23
98
|
alias [] include?
|
|
24
99
|
alias n set_bits
|
|
25
100
|
|
|
101
|
+
# Returns +true+ when no bits are set.
|
|
102
|
+
#
|
|
103
|
+
# This is an exact check on the filter state, unlike +include?+, which is
|
|
104
|
+
# probabilistic for positive matches.
|
|
26
105
|
def empty?
|
|
27
106
|
set_bits.zero?
|
|
28
107
|
end
|
|
29
108
|
|
|
30
|
-
# Adds
|
|
31
|
-
#
|
|
109
|
+
# Adds +key+ to the filter and returns +self+.
|
|
110
|
+
#
|
|
111
|
+
# This mimics the behavior of Set#add and allows chaining with #<<.
|
|
32
112
|
def add(key)
|
|
33
113
|
@bf.add(key)
|
|
34
114
|
self
|
|
35
115
|
end
|
|
36
116
|
alias << add
|
|
37
117
|
|
|
38
|
-
# Adds
|
|
39
|
-
#
|
|
118
|
+
# Adds +key+ to the filter when +value+ is truthy.
|
|
119
|
+
#
|
|
120
|
+
# This makes BloomFit behave like a write-only membership hash: truthy values
|
|
121
|
+
# add the key, while +false+ and +nil+ are ignored.
|
|
40
122
|
def []=(key, value)
|
|
41
123
|
@bf.add(key) if value
|
|
42
124
|
end
|
|
43
125
|
|
|
44
|
-
# Adds
|
|
45
|
-
#
|
|
126
|
+
# Adds +key+ only if it does not already appear to be present.
|
|
127
|
+
#
|
|
128
|
+
# Returns +self+ when the key is added and +nil+ when +include?+ is already
|
|
129
|
+
# true. This mimics Set#add?.
|
|
130
|
+
#
|
|
131
|
+
# Because Bloom filters can return false positives, +add?+ may occasionally
|
|
132
|
+
# return +nil+ for a key that has not actually been inserted before.
|
|
46
133
|
def add?(key)
|
|
47
134
|
return nil if include?(key) # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
|
|
48
135
|
add(key)
|
|
49
136
|
end
|
|
50
137
|
|
|
51
|
-
# Returns
|
|
138
|
+
# Returns the bitmap as a hexadecimal string.
|
|
139
|
+
#
|
|
140
|
+
# This is useful for debugging, logging, or comparing filter state in a more
|
|
141
|
+
# compact form than +to_binary+.
|
|
52
142
|
def to_hex
|
|
53
143
|
length = ((size / 8.0).ceil * 8 / 4)
|
|
54
144
|
bitmap.unpack1("H*")[0...length]
|
|
55
145
|
end
|
|
56
146
|
|
|
57
|
-
# Returns a string of
|
|
147
|
+
# Returns the bitmap as a binary string of +0+ and +1+ characters.
|
|
148
|
+
#
|
|
149
|
+
# The output is truncated to the configured filter width, so it omits any
|
|
150
|
+
# trailing padding present in the native bitmap.
|
|
58
151
|
def to_binary
|
|
59
152
|
bitmap.unpack1("B*")[0...size]
|
|
60
153
|
end
|
|
61
154
|
|
|
62
|
-
#
|
|
63
|
-
#
|
|
155
|
+
# Merges another filter or collection of keys into this filter.
|
|
156
|
+
#
|
|
157
|
+
# When +other+ is a +BloomFit+, the merge is performed bitwise and both
|
|
158
|
+
# filters must have the same +size+ and +hashes+ values. When +other+
|
|
159
|
+
# behaves like a hash, only keys with truthy values are added. Any other
|
|
160
|
+
# enumerable is treated as a list of keys.
|
|
161
|
+
#
|
|
162
|
+
# This method mutates the receiver and mimics Set#merge.
|
|
64
163
|
def merge(other)
|
|
65
164
|
if other.is_a?(BloomFit)
|
|
66
165
|
raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
|
|
@@ -74,9 +173,13 @@ class BloomFit
|
|
|
74
173
|
end
|
|
75
174
|
end
|
|
76
175
|
|
|
77
|
-
#
|
|
78
|
-
#
|
|
79
|
-
#
|
|
176
|
+
# Returns a new filter containing the bitwise intersection of two filters.
|
|
177
|
+
#
|
|
178
|
+
# Both filters must have the same +size+ and +hashes+ values or
|
|
179
|
+
# +BloomFit::ConfigurationMismatch+ is raised.
|
|
180
|
+
#
|
|
181
|
+
# Like all Bloom filter operations, membership checks on the result remain
|
|
182
|
+
# probabilistic and may still produce false positives.
|
|
80
183
|
def &(other)
|
|
81
184
|
raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
|
|
82
185
|
self.class.new(size:, hashes:).tap do |result|
|
|
@@ -85,9 +188,12 @@ class BloomFit
|
|
|
85
188
|
end
|
|
86
189
|
alias intersection &
|
|
87
190
|
|
|
88
|
-
#
|
|
89
|
-
#
|
|
90
|
-
#
|
|
191
|
+
# Returns a new filter containing the bitwise union of two filters.
|
|
192
|
+
#
|
|
193
|
+
# Both filters must have the same +size+ and +hashes+ values or
|
|
194
|
+
# +BloomFit::ConfigurationMismatch+ is raised.
|
|
195
|
+
#
|
|
196
|
+
# The receiver and +other+ are left unchanged.
|
|
91
197
|
def |(other)
|
|
92
198
|
raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
|
|
93
199
|
self.class.new(size:, hashes:).tap do |result|
|
|
@@ -96,6 +202,11 @@ class BloomFit
|
|
|
96
202
|
end
|
|
97
203
|
alias union |
|
|
98
204
|
|
|
205
|
+
# Returns a human-readable summary of the filter's current state.
|
|
206
|
+
#
|
|
207
|
+
# The report includes the configured width (+m+), the current number of set
|
|
208
|
+
# bits (+n+), the hash count (+k+), and the predicted false-positive rate
|
|
209
|
+
# based on the current fill level.
|
|
99
210
|
def stats
|
|
100
211
|
fpr = ((1.0 - Math.exp(-(k * n).to_f / m))**k) * 100
|
|
101
212
|
|
|
@@ -107,6 +218,9 @@ class BloomFit
|
|
|
107
218
|
end
|
|
108
219
|
end
|
|
109
220
|
|
|
221
|
+
# Rebuilds the filter from the serialized data returned by +marshal_dump+.
|
|
222
|
+
#
|
|
223
|
+
# This hook is used by Ruby's +Marshal+ support.
|
|
110
224
|
def marshal_load(ary)
|
|
111
225
|
size, hashes, bitmap = *ary
|
|
112
226
|
|
|
@@ -114,14 +228,20 @@ class BloomFit
|
|
|
114
228
|
@bf.load(bitmap) if bitmap
|
|
115
229
|
end
|
|
116
230
|
|
|
231
|
+
# Returns the data Ruby's +Marshal+ uses to serialize this filter.
|
|
117
232
|
def marshal_dump
|
|
118
233
|
[size, hashes, bitmap]
|
|
119
234
|
end
|
|
120
235
|
|
|
236
|
+
# Loads a filter from a file previously written by +save+.
|
|
237
|
+
#
|
|
238
|
+
# The file is read using Ruby's +Marshal+ format, so it should only be used
|
|
239
|
+
# with trusted input.
|
|
121
240
|
def self.load(filename)
|
|
122
241
|
Marshal.load(File.open(filename, "r")) # rubocop:disable Security/MarshalLoad
|
|
123
242
|
end
|
|
124
243
|
|
|
244
|
+
# Writes the filter to +filename+ using Ruby's +Marshal+ format.
|
|
125
245
|
def save(filename)
|
|
126
246
|
File.open(filename, "w") do |f|
|
|
127
247
|
f << Marshal.dump(self)
|
|
@@ -130,8 +250,7 @@ class BloomFit
|
|
|
130
250
|
|
|
131
251
|
protected
|
|
132
252
|
|
|
133
|
-
# Returns true
|
|
134
|
-
# the same.
|
|
253
|
+
# Returns +true+ when +other+ has the same +size+ and +hashes+ values.
|
|
135
254
|
def same_parameters?(other)
|
|
136
255
|
bf.m == other.bf.m && bf.k == other.bf.k
|
|
137
256
|
end
|
data/lib/cbloomfilter.bundle
CHANGED
|
Binary file
|
data/test/bloom_fit_test.rb
CHANGED
|
@@ -3,6 +3,28 @@ require "test_helper"
|
|
|
3
3
|
class BloomFitTest < Minitest::Spec
|
|
4
4
|
subject { BloomFit.new(size: 100, hashes: 4) }
|
|
5
5
|
|
|
6
|
+
describe ".new" do
|
|
7
|
+
it "accepts size and hashes override" do
|
|
8
|
+
bf = BloomFit.new(size: 10, hashes: 1)
|
|
9
|
+
assert_equal 10, bf.size
|
|
10
|
+
assert_equal 1, bf.hashes
|
|
11
|
+
end
|
|
12
|
+
|
|
13
|
+
it "has default capacity and false positive-rate" do
|
|
14
|
+
bf = BloomFit.new
|
|
15
|
+
# https://hur.st/bloomfilter/?n=100&p=0.001&m=&k=
|
|
16
|
+
assert_equal 1438, bf.size
|
|
17
|
+
assert_equal 10, bf.hashes
|
|
18
|
+
end
|
|
19
|
+
|
|
20
|
+
it "calculates size and hashes given a capacity and false postiive rate" do
|
|
21
|
+
bf = BloomFit.new(capacity: 10_000, false_positive_rate: 0.0001)
|
|
22
|
+
# https://hur.st/bloomfilter/?n=10000&p=0.0001&m=&k=
|
|
23
|
+
assert_equal 191_702, bf.size
|
|
24
|
+
assert_equal 14, bf.hashes
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
|
|
6
28
|
describe "#empty?" do
|
|
7
29
|
it "returns true when nothing set" do
|
|
8
30
|
assert_equal true, subject.empty? # rubocop:disable Minitest/AssertTruthy
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
require "test_helper"
|
|
2
|
+
|
|
3
|
+
class CBloomFilterTest < Minitest::Spec
|
|
4
|
+
subject { CBloomFilter.new }
|
|
5
|
+
|
|
6
|
+
describe "#m" do
|
|
7
|
+
it "defaults" do
|
|
8
|
+
assert_equal 1000, subject.m
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
it "is set by the 1st arg of the contructor" do
|
|
12
|
+
bf = CBloomFilter.new(10_000)
|
|
13
|
+
assert_equal 10_000, bf.m
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
describe "#k" do
|
|
18
|
+
it "defaults" do
|
|
19
|
+
assert_equal 4, subject.k
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
it "is set by the 2nd arg of the contructor" do
|
|
23
|
+
bf = CBloomFilter.new(10_000, 9)
|
|
24
|
+
assert_equal 9, bf.k
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
describe "#set_bits" do
|
|
29
|
+
it "initializes to zero" do
|
|
30
|
+
assert_equal 0, subject.set_bits
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it "counts the bits when active" do
|
|
34
|
+
subject.add("foo")
|
|
35
|
+
assert_equal 4, subject.set_bits
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
describe "#add" do
|
|
40
|
+
it "adds keys to the filter set" do
|
|
41
|
+
subject.add("foo")
|
|
42
|
+
subject.add("bar")
|
|
43
|
+
assert_includes subject, "foo"
|
|
44
|
+
assert_includes subject, "bar"
|
|
45
|
+
refute_includes subject, "baz"
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
describe "#include?" do
|
|
50
|
+
it "returns true when a key is in the set" do
|
|
51
|
+
subject.add("foo")
|
|
52
|
+
assert_equal true, subject.include?("foo") # rubocop:disable Minitest/AssertTruthy
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
it "returns false when a key is not in the set" do
|
|
56
|
+
subject.add("foo")
|
|
57
|
+
assert_equal false, subject.include?("bar") # rubocop:disable Minitest/RefuteFalse
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
describe "#clear" do
|
|
62
|
+
it "clears a set" do
|
|
63
|
+
subject.add("foo")
|
|
64
|
+
subject.add("bar")
|
|
65
|
+
subject.add("baz")
|
|
66
|
+
assert subject.set_bits.positive?
|
|
67
|
+
subject.clear
|
|
68
|
+
assert subject.set_bits.zero?
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
|
|
72
|
+
describe "#merge" do
|
|
73
|
+
it "adds keys from another set" do
|
|
74
|
+
subject.add("foo")
|
|
75
|
+
|
|
76
|
+
bf = CBloomFilter.new
|
|
77
|
+
bf.add("bar")
|
|
78
|
+
bf.add("baz")
|
|
79
|
+
|
|
80
|
+
subject.merge(bf)
|
|
81
|
+
assert_includes subject, "foo"
|
|
82
|
+
assert_includes subject, "bar"
|
|
83
|
+
assert_includes subject, "baz"
|
|
84
|
+
end
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
describe "#&" do
|
|
88
|
+
it "intersects keys from another set" do
|
|
89
|
+
subject.add("foo")
|
|
90
|
+
subject.add("bar")
|
|
91
|
+
|
|
92
|
+
bf = CBloomFilter.new
|
|
93
|
+
bf.add("bar")
|
|
94
|
+
bf.add("baz")
|
|
95
|
+
|
|
96
|
+
bf2 = subject & bf
|
|
97
|
+
refute_includes bf2, "foo"
|
|
98
|
+
assert_includes bf2, "bar"
|
|
99
|
+
refute_includes bf2, "baz"
|
|
100
|
+
|
|
101
|
+
bf3 = bf & subject
|
|
102
|
+
refute_includes bf3, "foo"
|
|
103
|
+
assert_includes bf3, "bar"
|
|
104
|
+
refute_includes bf3, "baz"
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
|
|
108
|
+
describe "#|" do
|
|
109
|
+
it "unions keys from another set" do
|
|
110
|
+
subject.add("foo")
|
|
111
|
+
subject.add("bar")
|
|
112
|
+
|
|
113
|
+
bf = CBloomFilter.new
|
|
114
|
+
bf.add("bar")
|
|
115
|
+
bf.add("baz")
|
|
116
|
+
|
|
117
|
+
bf2 = subject | bf
|
|
118
|
+
assert_includes bf2, "foo"
|
|
119
|
+
assert_includes bf2, "bar"
|
|
120
|
+
assert_includes bf2, "baz"
|
|
121
|
+
|
|
122
|
+
bf3 = bf | subject
|
|
123
|
+
assert_includes bf3, "foo"
|
|
124
|
+
assert_includes bf3, "bar"
|
|
125
|
+
assert_includes bf3, "baz"
|
|
126
|
+
end
|
|
127
|
+
end
|
|
128
|
+
|
|
129
|
+
describe "#bitmap" do
|
|
130
|
+
it "returns a binary bitmap of all zeros when empty (including a terminating byte)" do
|
|
131
|
+
bf = CBloomFilter.new(16)
|
|
132
|
+
assert_equal "\x00\x00\x00".b, bf.bitmap
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
it "returns a binary bitmap representing the set" do
|
|
136
|
+
bf = CBloomFilter.new(16, 4)
|
|
137
|
+
bf.add("something")
|
|
138
|
+
assert_equal "(\x82\x00".b, bf.bitmap
|
|
139
|
+
end
|
|
140
|
+
|
|
141
|
+
it "returns a binary bitmap representing the set even if not a multiple of 8 bits (includes padding)" do
|
|
142
|
+
bf = CBloomFilter.new(20, 4)
|
|
143
|
+
bf.add("wow")
|
|
144
|
+
assert_equal "\x04\x14\x00\x00".b, bf.bitmap
|
|
145
|
+
end
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
describe "#load" do
|
|
149
|
+
it "overwrites the bitmap" do
|
|
150
|
+
bf = CBloomFilter.new(1000, 4)
|
|
151
|
+
bf.add("foo")
|
|
152
|
+
bf.add("bar")
|
|
153
|
+
subject.load(bf.bitmap)
|
|
154
|
+
assert_includes subject, "foo"
|
|
155
|
+
assert_includes subject, "bar"
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: bloom_fit
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 1.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ryan McGeary
|
|
@@ -31,6 +31,7 @@ files:
|
|
|
31
31
|
- lib/bloom_fit/version.rb
|
|
32
32
|
- lib/cbloomfilter.bundle
|
|
33
33
|
- test/bloom_fit_test.rb
|
|
34
|
+
- test/c_bloom_filter_test.rb
|
|
34
35
|
- test/test_helper.rb
|
|
35
36
|
homepage: https://github.com/rmm5t/bloom_fit
|
|
36
37
|
licenses: []
|