bloom_fit 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/bloom_fit/version.rb +1 -1
- data/lib/bloom_fit.rb +125 -18
- data/lib/cbloomfilter.bundle +0 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: cd631cdb483e0a84fa05d56eb962fda0f7c7d7a0b002ea708024ce82505a9054
|
|
4
|
+
data.tar.gz: ee781997465d6f5b590828082e4fadd5b00768298bbdec7845b9f07c3d046549
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7862f2d0189bae865c6fc5e7c7ad24f5c7ab0420415a455a1a0b130835d639c536cb8925b08219eab7dd7a10db1e9299b2868019d3e2259db4dce96de01e50a2
|
|
7
|
+
data.tar.gz: 41cb7f2fcb8cf80f5345785ce0110e242a29fbe6177284b13b701973ec7b0e7010d788585e406f77712f7ee284ff308633fe060e492b0e153a4a5598658fd465
|
data/lib/bloom_fit/version.rb
CHANGED
data/lib/bloom_fit.rb
CHANGED
|
@@ -4,63 +4,150 @@ require "cbloomfilter"
|
|
|
4
4
|
require "bloom_fit/configuration_mismatch"
|
|
5
5
|
require "bloom_fit/version"
|
|
6
6
|
|
|
7
|
+
# BloomFit is an in-memory Bloom filter with a small, Set-like API.
|
|
8
|
+
#
|
|
9
|
+
# Bloom filters are probabilistic membership structures: they can report false
|
|
10
|
+
# positives, but they do not report false negatives for values that have been
|
|
11
|
+
# added. That makes BloomFit useful for cheaply ruling out missing values
|
|
12
|
+
# before doing more expensive work, while keeping memory usage low.
|
|
13
|
+
#
|
|
14
|
+
# The class wraps the native +CBloomFilter+ implementation in Ruby-friendly
|
|
15
|
+
# methods such as +add+, +include?+, +merge+, +&+, and +|+. Instances can be
|
|
16
|
+
# serialized with +save+ and reloaded with +BloomFit.load+.
|
|
17
|
+
#
|
|
18
|
+
# Filters can only be combined when they were created with the same +size+ and
|
|
19
|
+
# +hashes+ values; otherwise +BloomFit::ConfigurationMismatch+ is raised.
|
|
20
|
+
#
|
|
21
|
+
# filter = BloomFit.new(size: 10_000, hashes: 6)
|
|
22
|
+
# filter.add("cat")
|
|
23
|
+
# filter.include?("cat") # => true
|
|
24
|
+
# filter.include?("dog") # => false
|
|
25
|
+
#
|
|
26
|
+
# Choose +size+ and +hashes+ based on the expected number of inserts and the
|
|
27
|
+
# false-positive rate you can tolerate.
|
|
7
28
|
class BloomFit
|
|
8
29
|
extend Forwardable
|
|
9
30
|
|
|
31
|
+
# The wrapped native +CBloomFilter+ instance.
|
|
32
|
+
#
|
|
33
|
+
# This is mostly useful for low-level integrations and internal filter
|
|
34
|
+
# operations such as merge, union, and intersection.
|
|
10
35
|
attr_reader :bf
|
|
11
36
|
|
|
37
|
+
# Creates an empty Bloom filter.
|
|
38
|
+
#
|
|
39
|
+
# The defaults are a reasonable starting point for small in-memory filters,
|
|
40
|
+
# but the best values depend on how many keys you expect to insert and how
|
|
41
|
+
# many false positives you can tolerate.
|
|
42
|
+
#
|
|
12
43
|
# @param size [Integer] number of buckets in a bloom filter
|
|
13
44
|
# @param hashes [Integer] number of hash functions
|
|
14
45
|
def initialize(size: 1_000, hashes: 4)
|
|
15
46
|
@bf = CBloomFilter.new(size, hashes)
|
|
16
47
|
end
|
|
17
48
|
|
|
49
|
+
# :method: m
|
|
50
|
+
#
|
|
51
|
+
# Returns the configured filter width.
|
|
52
|
+
|
|
53
|
+
# :method: k
|
|
54
|
+
#
|
|
55
|
+
# Returns the number of hash functions applied to each key.
|
|
56
|
+
|
|
57
|
+
# :method: bitmap
|
|
58
|
+
#
|
|
59
|
+
# Returns the raw bitmap as a binary string.
|
|
60
|
+
#
|
|
61
|
+
# The returned bytes reflect the native representation, so the string may
|
|
62
|
+
# include padding beyond the configured filter size.
|
|
63
|
+
|
|
64
|
+
# :method: include?
|
|
65
|
+
#
|
|
66
|
+
# Returns +true+ when +key+ may be present and +false+ when it is definitely
|
|
67
|
+
# absent.
|
|
68
|
+
#
|
|
69
|
+
# Positive results are probabilistic and may be false positives.
|
|
70
|
+
|
|
71
|
+
# :method: clear
|
|
72
|
+
#
|
|
73
|
+
# Clears the filter by resetting all bits to +0+.
|
|
74
|
+
|
|
75
|
+
# :method: set_bits
|
|
76
|
+
#
|
|
77
|
+
# Returns the number of bits currently set to +1+.
|
|
78
|
+
|
|
18
79
|
def_delegators :@bf, :m, :k, :bitmap, :include?, :clear, :set_bits
|
|
19
80
|
|
|
81
|
+
# Returns the configured filter width.
|
|
20
82
|
alias size m
|
|
83
|
+
# Returns the number of hash functions used for each inserted key.
|
|
21
84
|
alias hashes k
|
|
22
85
|
alias key? include?
|
|
23
86
|
alias [] include?
|
|
24
87
|
alias n set_bits
|
|
25
88
|
|
|
89
|
+
# Returns +true+ when no bits are set.
|
|
90
|
+
#
|
|
91
|
+
# This is an exact check on the filter state, unlike +include?+, which is
|
|
92
|
+
# probabilistic for positive matches.
|
|
26
93
|
def empty?
|
|
27
94
|
set_bits.zero?
|
|
28
95
|
end
|
|
29
96
|
|
|
30
|
-
# Adds
|
|
31
|
-
#
|
|
97
|
+
# Adds +key+ to the filter and returns +self+.
|
|
98
|
+
#
|
|
99
|
+
# This mimics the behavior of Set#add and allows chaining with #<<.
|
|
32
100
|
def add(key)
|
|
33
101
|
@bf.add(key)
|
|
34
102
|
self
|
|
35
103
|
end
|
|
36
104
|
alias << add
|
|
37
105
|
|
|
38
|
-
# Adds
|
|
39
|
-
#
|
|
106
|
+
# Adds +key+ to the filter when +value+ is truthy.
|
|
107
|
+
#
|
|
108
|
+
# This makes BloomFit behave like a write-only membership hash: truthy values
|
|
109
|
+
# add the key, while +false+ and +nil+ are ignored.
|
|
40
110
|
def []=(key, value)
|
|
41
111
|
@bf.add(key) if value
|
|
42
112
|
end
|
|
43
113
|
|
|
44
|
-
# Adds
|
|
45
|
-
#
|
|
114
|
+
# Adds +key+ only if it does not already appear to be present.
|
|
115
|
+
#
|
|
116
|
+
# Returns +self+ when the key is added and +nil+ when +include?+ is already
|
|
117
|
+
# true. This mimics Set#add?.
|
|
118
|
+
#
|
|
119
|
+
# Because Bloom filters can return false positives, +add?+ may occasionally
|
|
120
|
+
# return +nil+ for a key that has not actually been inserted before.
|
|
46
121
|
def add?(key)
|
|
47
122
|
return nil if include?(key) # rubocop:disable Style/ReturnNilInPredicateMethodDefinition
|
|
48
123
|
add(key)
|
|
49
124
|
end
|
|
50
125
|
|
|
51
|
-
# Returns
|
|
126
|
+
# Returns the bitmap as a hexadecimal string.
|
|
127
|
+
#
|
|
128
|
+
# This is useful for debugging, logging, or comparing filter state in a more
|
|
129
|
+
# compact form than +to_binary+.
|
|
52
130
|
def to_hex
|
|
53
131
|
length = ((size / 8.0).ceil * 8 / 4)
|
|
54
132
|
bitmap.unpack1("H*")[0...length]
|
|
55
133
|
end
|
|
56
134
|
|
|
57
|
-
# Returns a string of
|
|
135
|
+
# Returns the bitmap as a binary string of +0+ and +1+ characters.
|
|
136
|
+
#
|
|
137
|
+
# The output is truncated to the configured filter width, so it omits any
|
|
138
|
+
# trailing padding present in the native bitmap.
|
|
58
139
|
def to_binary
|
|
59
140
|
bitmap.unpack1("B*")[0...size]
|
|
60
141
|
end
|
|
61
142
|
|
|
62
|
-
#
|
|
63
|
-
#
|
|
143
|
+
# Merges another filter or collection of keys into this filter.
|
|
144
|
+
#
|
|
145
|
+
# When +other+ is a +BloomFit+, the merge is performed bitwise and both
|
|
146
|
+
# filters must have the same +size+ and +hashes+ values. When +other+
|
|
147
|
+
# behaves like a hash, only keys with truthy values are added. Any other
|
|
148
|
+
# enumerable is treated as a list of keys.
|
|
149
|
+
#
|
|
150
|
+
# This method mutates the receiver and mimics Set#merge.
|
|
64
151
|
def merge(other)
|
|
65
152
|
if other.is_a?(BloomFit)
|
|
66
153
|
raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
|
|
@@ -74,9 +161,13 @@ class BloomFit
|
|
|
74
161
|
end
|
|
75
162
|
end
|
|
76
163
|
|
|
77
|
-
#
|
|
78
|
-
#
|
|
79
|
-
#
|
|
164
|
+
# Returns a new filter containing the bitwise intersection of two filters.
|
|
165
|
+
#
|
|
166
|
+
# Both filters must have the same +size+ and +hashes+ values or
|
|
167
|
+
# +BloomFit::ConfigurationMismatch+ is raised.
|
|
168
|
+
#
|
|
169
|
+
# Like all Bloom filter operations, membership checks on the result remain
|
|
170
|
+
# probabilistic and may still produce false positives.
|
|
80
171
|
def &(other)
|
|
81
172
|
raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
|
|
82
173
|
self.class.new(size:, hashes:).tap do |result|
|
|
@@ -85,9 +176,12 @@ class BloomFit
|
|
|
85
176
|
end
|
|
86
177
|
alias intersection &
|
|
87
178
|
|
|
88
|
-
#
|
|
89
|
-
#
|
|
90
|
-
#
|
|
179
|
+
# Returns a new filter containing the bitwise union of two filters.
|
|
180
|
+
#
|
|
181
|
+
# Both filters must have the same +size+ and +hashes+ values or
|
|
182
|
+
# +BloomFit::ConfigurationMismatch+ is raised.
|
|
183
|
+
#
|
|
184
|
+
# The receiver and +other+ are left unchanged.
|
|
91
185
|
def |(other)
|
|
92
186
|
raise BloomFit::ConfigurationMismatch unless same_parameters?(other)
|
|
93
187
|
self.class.new(size:, hashes:).tap do |result|
|
|
@@ -96,6 +190,11 @@ class BloomFit
|
|
|
96
190
|
end
|
|
97
191
|
alias union |
|
|
98
192
|
|
|
193
|
+
# Returns a human-readable summary of the filter's current state.
|
|
194
|
+
#
|
|
195
|
+
# The report includes the configured width (+m+), the current number of set
|
|
196
|
+
# bits (+n+), the hash count (+k+), and the predicted false-positive rate
|
|
197
|
+
# based on the current fill level.
|
|
99
198
|
def stats
|
|
100
199
|
fpr = ((1.0 - Math.exp(-(k * n).to_f / m))**k) * 100
|
|
101
200
|
|
|
@@ -107,6 +206,9 @@ class BloomFit
|
|
|
107
206
|
end
|
|
108
207
|
end
|
|
109
208
|
|
|
209
|
+
# Rebuilds the filter from the serialized data returned by +marshal_dump+.
|
|
210
|
+
#
|
|
211
|
+
# This hook is used by Ruby's +Marshal+ support.
|
|
110
212
|
def marshal_load(ary)
|
|
111
213
|
size, hashes, bitmap = *ary
|
|
112
214
|
|
|
@@ -114,14 +216,20 @@ class BloomFit
|
|
|
114
216
|
@bf.load(bitmap) if bitmap
|
|
115
217
|
end
|
|
116
218
|
|
|
219
|
+
# Returns the data Ruby's +Marshal+ uses to serialize this filter.
|
|
117
220
|
def marshal_dump
|
|
118
221
|
[size, hashes, bitmap]
|
|
119
222
|
end
|
|
120
223
|
|
|
224
|
+
# Loads a filter from a file previously written by +save+.
|
|
225
|
+
#
|
|
226
|
+
# The file is read using Ruby's +Marshal+ format, so it should only be used
|
|
227
|
+
# with trusted input.
|
|
121
228
|
def self.load(filename)
|
|
122
229
|
Marshal.load(File.open(filename, "r")) # rubocop:disable Security/MarshalLoad
|
|
123
230
|
end
|
|
124
231
|
|
|
232
|
+
# Writes the filter to +filename+ using Ruby's +Marshal+ format.
|
|
125
233
|
def save(filename)
|
|
126
234
|
File.open(filename, "w") do |f|
|
|
127
235
|
f << Marshal.dump(self)
|
|
@@ -130,8 +238,7 @@ class BloomFit
|
|
|
130
238
|
|
|
131
239
|
protected
|
|
132
240
|
|
|
133
|
-
# Returns true
|
|
134
|
-
# the same.
|
|
241
|
+
# Returns +true+ when +other+ has the same +size+ and +hashes+ values.
|
|
135
242
|
def same_parameters?(other)
|
|
136
243
|
bf.m == other.bf.m && bf.k == other.bf.k
|
|
137
244
|
end
|
data/lib/cbloomfilter.bundle
CHANGED
|
Binary file
|