datasketches 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +18 -10
@@ -0,0 +1,753 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
|
7
|
+
* to you under the Apache License, Version 2.0 (the
|
8
|
+
* "License"); you may not use this file except in compliance
|
9
|
+
* with the License. You may obtain a copy of the License at
|
10
|
+
*
|
11
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
*
|
13
|
+
* Unless required by applicable law or agreed to in writing,
|
14
|
+
* software distributed under the License is distributed on an
|
15
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
16
|
+
* KIND, either express or implied. See the License for the
|
17
|
+
* specific language governing permissions and limitations
|
18
|
+
* under the License.
|
19
|
+
*/
|
20
|
+
|
21
|
+
#ifndef _BLOOM_FILTER_HPP_
|
22
|
+
#define _BLOOM_FILTER_HPP_
|
23
|
+
|
24
|
+
#include <cstdint>
|
25
|
+
#include <memory>
|
26
|
+
#include <vector>
|
27
|
+
|
28
|
+
#include "common_defs.hpp"
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
// forward declarations
|
33
|
+
template<typename A> class bloom_filter_alloc;
|
34
|
+
|
35
|
+
// aliases with default allocator
|
36
|
+
using bloom_filter = bloom_filter_alloc<std::allocator<uint8_t>>;
|
37
|
+
|
38
|
+
/**
|
39
|
+
* <p>A Bloom filter is a data structure that can be used for probabilistic
|
40
|
+
* set membership.</p>
|
41
|
+
*
|
42
|
+
* <p>When querying a Bloom filter, there are no false negatives. Specifically:
|
43
|
+
* When querying an item that has already been inserted to the filter, the filter will
|
44
|
+
* always indicate that the item is present. There is a chance of false positives, where
|
45
|
+
* querying an item that has never been presented to the filter will indicate that the
|
46
|
+
* item has already been seen. Consequently, any query should be interpreted as
|
47
|
+
* "might have seen."</p>
|
48
|
+
*
|
49
|
+
* <p>A standard Bloom filter is unlike typical sketches in that it is not sub-linear
|
50
|
+
* in size and does not resize itself. A Bloom filter will work up to a target number of
|
51
|
+
* distinct items, beyond which it will saturate and the false positive rate will start to
|
52
|
+
* increase. The size of a Bloom filter will be linear in the expected number of
|
53
|
+
* distinct items.</p>
|
54
|
+
*
|
55
|
+
* <p>See the bloom_filter_alloc::builder class for methods to create a filter, especially
|
56
|
+
* one sized correctly for a target number of distinct elements and a target
|
57
|
+
* false positive probability.</p>
|
58
|
+
*
|
59
|
+
* <p>This implementation uses xxHash64 and follows the approach in Kirsch and Mitzenmacher,
|
60
|
+
* "Less Hashing, Same Performance: Building a Better Bloom Filter," Wiley Interscience, 2008, pp. 187-218.</p>
|
61
|
+
*/
|
62
|
+
|
63
|
+
template<typename Allocator = std::allocator<uint8_t>>
|
64
|
+
class bloom_filter_alloc {
|
65
|
+
public:
|
66
|
+
|
67
|
+
// no public constructor; use builder or deserialize/wrap methods
|
68
|
+
class builder;
|
69
|
+
|
70
|
+
/**
|
71
|
+
* This method deserializes a Bloom filter from a given array of bytes.
|
72
|
+
* @param bytes pointer to the array of bytes
|
73
|
+
* @param length_bytes the size of the array in bytes
|
74
|
+
* @param allocator instance of an Allocator
|
75
|
+
* @return an instance of a Bloom filter
|
76
|
+
*/
|
77
|
+
static bloom_filter_alloc deserialize(const void* bytes, size_t length_bytes, const Allocator& allocator = Allocator());
|
78
|
+
|
79
|
+
/**
|
80
|
+
* This method deserializes a Bloom filter from a given stream.
|
81
|
+
* @param is input stream
|
82
|
+
* @param allocator instance of an Allocator
|
83
|
+
* @return an instance of a Bloom filter
|
84
|
+
*/
|
85
|
+
static bloom_filter_alloc deserialize(std::istream& is, const Allocator& allocator = Allocator());
|
86
|
+
|
87
|
+
/**
|
88
|
+
* @brief Wraps the provided memory as a read-only Bloom filter. Reads the data in-place and does
|
89
|
+
* not take ownership of the underlying memory. Does not allow modifying the filter.
|
90
|
+
*
|
91
|
+
* @param data The memory to wrap
|
92
|
+
* @param length_bytes The length of the memory in bytes
|
93
|
+
* @param allocator instance of an Allocator
|
94
|
+
* @return a const (read-only) Bloom filter wrapping the provided memory
|
95
|
+
*/
|
96
|
+
static const bloom_filter_alloc wrap(const void* data, size_t length_bytes, const Allocator& allocator = Allocator());
|
97
|
+
|
98
|
+
/**
|
99
|
+
* @brief Wraps the provided memory as a writable Bloom filter. Reads the data in-place and does
|
100
|
+
* not take ownership of the underlying memory. Allows modifying the filter.
|
101
|
+
*
|
102
|
+
* @param data the memory to wrap
|
103
|
+
* @param length_bytes the length of the memory in bytes
|
104
|
+
* @param allocator instance of an Allocator
|
105
|
+
* @return a Bloom filter wrapping the provided memory
|
106
|
+
*/
|
107
|
+
static bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, const Allocator& allocator = Allocator());
|
108
|
+
|
109
|
+
/**
|
110
|
+
* Copy constructor
|
111
|
+
* @param other filter to be copied
|
112
|
+
*/
|
113
|
+
bloom_filter_alloc(const bloom_filter_alloc& other);
|
114
|
+
|
115
|
+
/** Move constructor
|
116
|
+
* @param other filter to be moved
|
117
|
+
*/
|
118
|
+
bloom_filter_alloc(bloom_filter_alloc&& other) noexcept;
|
119
|
+
|
120
|
+
/**
|
121
|
+
* Copy assignment
|
122
|
+
* @param other filter to be copied
|
123
|
+
* @return reference to this filter
|
124
|
+
*/
|
125
|
+
bloom_filter_alloc& operator=(const bloom_filter_alloc& other);
|
126
|
+
|
127
|
+
/**
|
128
|
+
* Move assignment
|
129
|
+
* @param other filter to be moved
|
130
|
+
* @return reference to this filter
|
131
|
+
*/
|
132
|
+
bloom_filter_alloc& operator=(bloom_filter_alloc&& other);
|
133
|
+
|
134
|
+
/**
|
135
|
+
* @brief Destroy the bloom filter object
|
136
|
+
*/
|
137
|
+
~bloom_filter_alloc();
|
138
|
+
|
139
|
+
// This is a convenience alias for users
|
140
|
+
// The type returned by the following serialize method
|
141
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
|
142
|
+
|
143
|
+
/**
|
144
|
+
* This method serializes the filter as a vector of bytes.
|
145
|
+
* An optional header can be reserved in front of the filter.
|
146
|
+
* It is a blank space of a given size.
|
147
|
+
* Some integrations such as PostgreSQL may need this header space.
|
148
|
+
* @param header_size_bytes space to reserve in front of the filter
|
149
|
+
* @return serialized filter as a vector of bytes
|
150
|
+
*/
|
151
|
+
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
152
|
+
|
153
|
+
/**
|
154
|
+
* This method serializes the filter into a given stream in a binary form
|
155
|
+
* @param os output stream
|
156
|
+
*/
|
157
|
+
void serialize(std::ostream& os) const;
|
158
|
+
|
159
|
+
/**
|
160
|
+
* Checks if the Bloom Filter has processed any items
|
161
|
+
* @return True if the BloomFilter is empty, otherwise False
|
162
|
+
*/
|
163
|
+
bool is_empty() const;
|
164
|
+
|
165
|
+
/**
|
166
|
+
* Returns the number of bits in the Bloom Filter that are set to 1.
|
167
|
+
* @return The number of bits in use in this filter
|
168
|
+
*/
|
169
|
+
uint64_t get_bits_used();
|
170
|
+
|
171
|
+
/**
|
172
|
+
* Returns the total number of bits in the Bloom Filter.
|
173
|
+
* @return The total size of the Bloom Filter
|
174
|
+
*/
|
175
|
+
uint64_t get_capacity() const;
|
176
|
+
|
177
|
+
/**
|
178
|
+
* Returns the configured number of hash functions for this Bloom Filter
|
179
|
+
* @return The number of hash functions to apply to inputs
|
180
|
+
*/
|
181
|
+
uint16_t get_num_hashes() const;
|
182
|
+
|
183
|
+
/**
|
184
|
+
* Returns the hash seed for this Bloom Filter.
|
185
|
+
* @return The hash seed for this filter
|
186
|
+
*/
|
187
|
+
uint64_t get_seed() const;
|
188
|
+
|
189
|
+
/**
|
190
|
+
* Resets the Bloom Filter to its original state.
|
191
|
+
*/
|
192
|
+
void reset();
|
193
|
+
|
194
|
+
// UPDATE METHODS
|
195
|
+
|
196
|
+
/**
|
197
|
+
* Updates the filter with the given std::string.
|
198
|
+
* The string is converted to a byte array using UTF8 encoding.
|
199
|
+
* If the string is null or empty no update attempt is made and the method returns.
|
200
|
+
* @param item The given string.
|
201
|
+
*/
|
202
|
+
void update(const std::string& item);
|
203
|
+
|
204
|
+
/**
|
205
|
+
* Updates the filter with the given unsigned 64-bit integer.
|
206
|
+
* @param item The given integer.
|
207
|
+
*/
|
208
|
+
void update(uint64_t item);
|
209
|
+
|
210
|
+
/**
|
211
|
+
* Updates the filter with the given unsigned 32-bit integer.
|
212
|
+
* @param item The given integer.
|
213
|
+
*/
|
214
|
+
void update(uint32_t item);
|
215
|
+
|
216
|
+
/**
|
217
|
+
* Updates the filter with the given unsigned 16-bit integer.
|
218
|
+
* @param item The given integer.
|
219
|
+
*/
|
220
|
+
void update(uint16_t item);
|
221
|
+
|
222
|
+
/**
|
223
|
+
* Updates the filter with the given unsigned 8-bit integer.
|
224
|
+
* @param item The given integer.
|
225
|
+
*/
|
226
|
+
void update(uint8_t item);
|
227
|
+
|
228
|
+
/**
|
229
|
+
* Updates the filter with the given signed 64-bit integer.
|
230
|
+
* @param item The given integer.
|
231
|
+
*/
|
232
|
+
void update(int64_t item);
|
233
|
+
|
234
|
+
/**
|
235
|
+
* Updates the filter with the given signed 32-bit integer.
|
236
|
+
* @param item The given integer.
|
237
|
+
*/
|
238
|
+
void update(int32_t item);
|
239
|
+
|
240
|
+
/**
|
241
|
+
* Updates the filter with the given signed 16-bit integer.
|
242
|
+
* @param item The given integer.
|
243
|
+
*/
|
244
|
+
void update(int16_t item);
|
245
|
+
|
246
|
+
/**
|
247
|
+
* Updates the filter with the given signed 8-bit integer.
|
248
|
+
* @param item The given integer.
|
249
|
+
*/
|
250
|
+
void update(int8_t item);
|
251
|
+
|
252
|
+
/**
|
253
|
+
* Updates the filter with the given 64-bit floating point value.
|
254
|
+
* @param item The given double.
|
255
|
+
*/
|
256
|
+
void update(double item);
|
257
|
+
|
258
|
+
/**
|
259
|
+
* Updates the filter with the given 32-bit floating point value.
|
260
|
+
* @param item The given float.
|
261
|
+
*/
|
262
|
+
void update(float item);
|
263
|
+
|
264
|
+
/**
|
265
|
+
* Updates the filter with the given data array.
|
266
|
+
* @param data The given array.
|
267
|
+
* @param length_bytes The array length in bytes.
|
268
|
+
*/
|
269
|
+
void update(const void* data, size_t length_bytes);
|
270
|
+
|
271
|
+
// QUERY-AND-UPDATE METHODS
|
272
|
+
|
273
|
+
/**
|
274
|
+
* Updates the filter with the given std::string and returns the result from
|
275
|
+
* querying the filter prior to the update.
|
276
|
+
* The string is converted to a byte array using UTF8 encoding.
|
277
|
+
* If the string is null or empty no update attempt is made and the method returns false.
|
278
|
+
* @param item The given string.
|
279
|
+
* @return The result from querying the filter prior to the update.
|
280
|
+
*/
|
281
|
+
bool query_and_update(const std::string& item);
|
282
|
+
|
283
|
+
/**
|
284
|
+
* Updates the filter with the given unsigned 64-bit integer and returns the result from
|
285
|
+
* querying the filter prior to the update.
|
286
|
+
* @param item The given integer.
|
287
|
+
* @return The result from querying the filter prior to the update.
|
288
|
+
*/
|
289
|
+
bool query_and_update(uint64_t item);
|
290
|
+
|
291
|
+
/**
|
292
|
+
* Updates the filter with the given unsigned 32-bit integer and returns the result from
|
293
|
+
* querying the filter prior to the update.
|
294
|
+
* @param item The given integer.
|
295
|
+
* @return The result from querying the filter prior to the update.
|
296
|
+
*/
|
297
|
+
bool query_and_update(uint32_t item);
|
298
|
+
|
299
|
+
/**
|
300
|
+
* Updates the filter with the given unsigned 16-bit integer and returns the result from
|
301
|
+
* querying the filter prior to the update.
|
302
|
+
* @param item The given integer.
|
303
|
+
* @return The result from querying the filter prior to the update.
|
304
|
+
*/
|
305
|
+
bool query_and_update(uint16_t item);
|
306
|
+
|
307
|
+
/**
|
308
|
+
* Updates the filter with the given unsigned 8-bit integer and returns the result from
|
309
|
+
* querying the filter prior to the update.
|
310
|
+
* @param item The given integer.
|
311
|
+
* @return The result from querying the filter prior to the update.
|
312
|
+
*/
|
313
|
+
bool query_and_update(uint8_t item);
|
314
|
+
|
315
|
+
/**
|
316
|
+
* Updates the filter with the given signed 64-bit integer and returns the result from
|
317
|
+
* querying the filter prior to the update.
|
318
|
+
* @param item The given integer.
|
319
|
+
* @return The result from querying the filter prior to the update.
|
320
|
+
*/
|
321
|
+
bool query_and_update(int64_t item);
|
322
|
+
|
323
|
+
/**
|
324
|
+
* Updates the filter with the given signed 32-bit integer and returns the result from
|
325
|
+
* querying the filter prior to the update.
|
326
|
+
* @param item The given integer.
|
327
|
+
* @return The result from querying the filter prior to the update.
|
328
|
+
*/
|
329
|
+
bool query_and_update(int32_t item);
|
330
|
+
|
331
|
+
/**
|
332
|
+
* Updates the filter with the given signed 16-bit integer and returns the result from
|
333
|
+
* querying the filter prior to the update.
|
334
|
+
* @param item The given integer.
|
335
|
+
* @return The result from querying the filter prior to the update.
|
336
|
+
*/
|
337
|
+
bool query_and_update(int16_t item);
|
338
|
+
|
339
|
+
/**
|
340
|
+
* Updates the filter with the given signed 8-bit integer and returns the result from
|
341
|
+
* querying the filter prior to the update.
|
342
|
+
* @param item The given integer.
|
343
|
+
* @return The result from querying the filter prior to the update.
|
344
|
+
*/
|
345
|
+
bool query_and_update(int8_t item);
|
346
|
+
|
347
|
+
/**
|
348
|
+
* Updates the filter with the given 64-bit floating point value and returns the result from
|
349
|
+
* querying the filter prior to the update.
|
350
|
+
* @param item The given double.
|
351
|
+
* @return The result from querying the filter prior to the update.
|
352
|
+
*/
|
353
|
+
bool query_and_update(double item);
|
354
|
+
|
355
|
+
/**
|
356
|
+
* Updates the filter with the given 32-bit floating point value and returns the result from
|
357
|
+
* querying the filter prior to the update.
|
358
|
+
* @param item The given float.
|
359
|
+
* @return The result from querying the filter prior to the update.
|
360
|
+
*/
|
361
|
+
bool query_and_update(float item);
|
362
|
+
|
363
|
+
/**
|
364
|
+
* Updates the filter with the given data array and returns the result from
|
365
|
+
* querying the filter prior to the update.
|
366
|
+
* @param data The given array.
|
367
|
+
* @param length_bytes The array length in bytes.
|
368
|
+
* @return The result from querying the filter prior to the update.
|
369
|
+
*/
|
370
|
+
bool query_and_update(const void* data, size_t length_bytes);
|
371
|
+
|
372
|
+
// QUERY METHODS
|
373
|
+
|
374
|
+
/**
|
375
|
+
* Queries the filter with the given std::string and returns whether the value
|
376
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
377
|
+
* determines the chances of a true result being a false positive. False negatives are
|
378
|
+
* never possible.
|
379
|
+
* The string is converted to a byte array using UTF8 encoding.
|
380
|
+
* If the string is null or empty the method always returns false.
|
381
|
+
* @param item The given string.
|
382
|
+
* @return The result from querying the filter with the given item.
|
383
|
+
*/
|
384
|
+
bool query(const std::string& item) const;
|
385
|
+
|
386
|
+
/**
|
387
|
+
* Queries the filter with the given unsigned 64-bit integer and returns whether the value
|
388
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
389
|
+
* determines the chances of a true result being a false positive. False negatives are
|
390
|
+
* never possible.
|
391
|
+
* @param item The given integer.
|
392
|
+
* @return The result from querying the filter with the given item.
|
393
|
+
*/
|
394
|
+
bool query(uint64_t item) const;
|
395
|
+
|
396
|
+
/**
|
397
|
+
* Queries the filter with the given unsigned 32-bit integer and returns whether the value
|
398
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
399
|
+
* determines the chances of a true result being a false positive. False negatives are
|
400
|
+
* never possible.
|
401
|
+
* @param item The given integer.
|
402
|
+
* @return The result from querying the filter with the given item.
|
403
|
+
*/
|
404
|
+
bool query(uint32_t item) const;
|
405
|
+
|
406
|
+
/**
|
407
|
+
* Queries the filter with the given unsigned 16-bit integer and returns whether the value
|
408
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
409
|
+
* determines the chances of a true result being a false positive. False negatives are
|
410
|
+
* never possible.
|
411
|
+
* @param item The given integer.
|
412
|
+
* @return The result from querying the filter with the given item.
|
413
|
+
*/
|
414
|
+
bool query(uint16_t item) const;
|
415
|
+
|
416
|
+
/**
|
417
|
+
* Queries the filter with the given unsigned 8-bit integer and returns whether the value
|
418
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
419
|
+
* determines the chances of a true result being a false positive. False negatives are
|
420
|
+
* never possible.
|
421
|
+
* @param item The given integer.
|
422
|
+
* @return The result from querying the filter with the given item.
|
423
|
+
*/
|
424
|
+
bool query(uint8_t item) const;
|
425
|
+
|
426
|
+
/**
|
427
|
+
* Queries the filter with the given signed 64-bit integer and returns whether the value
|
428
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
429
|
+
* determines the chances of a true result being a false positive. False negatives are
|
430
|
+
* never possible.
|
431
|
+
* @param item The given integer.
|
432
|
+
* @return The result from querying the filter with the given item.
|
433
|
+
*/
|
434
|
+
bool query(int64_t item) const;
|
435
|
+
|
436
|
+
/**
|
437
|
+
* Queries the filter with the given signed 32-bit integer and returns whether the value
|
438
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
439
|
+
* determines the chances of a true result being a false positive. False negatives are
|
440
|
+
* never possible.
|
441
|
+
* @param item The given integer.
|
442
|
+
* @return The result from querying the filter with the given item.
|
443
|
+
*/
|
444
|
+
bool query(int32_t item) const;
|
445
|
+
|
446
|
+
/**
|
447
|
+
* Queries the filter with the given signed 16-bit integer and returns whether the value
|
448
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
449
|
+
* determines the chances of a true result being a false positive. False negatives are
|
450
|
+
* never possible.
|
451
|
+
* @param item The given integer.
|
452
|
+
* @return The result from querying the filter with the given item.
|
453
|
+
*/
|
454
|
+
bool query(int16_t item) const;
|
455
|
+
|
456
|
+
/**
|
457
|
+
* Queries the filter with the given signed 8-bit integer and returns whether the value
|
458
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
459
|
+
* determines the chances of a true result being a false positive. False negatives are
|
460
|
+
* never possible.
|
461
|
+
* @param item The given integer.
|
462
|
+
* @return The result from querying the filter with the given item.
|
463
|
+
*/
|
464
|
+
bool query(int8_t item) const;
|
465
|
+
|
466
|
+
/**
|
467
|
+
* Queries the filter with the given 64-bit floating point value and returns whether the value
|
468
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
469
|
+
* determines the chances of a true result being a false positive. False negatives are
|
470
|
+
* never possible.
|
471
|
+
* @param item The given double.
|
472
|
+
* @return The result from querying the filter with the given item.
|
473
|
+
*/
|
474
|
+
bool query(double item) const;
|
475
|
+
|
476
|
+
/**
|
477
|
+
* Queries the filter with the given 32-bit floating point value and returns whether the value
|
478
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
479
|
+
* determines the chances of a true result being a false positive. False negatives are
|
480
|
+
* never possible.
|
481
|
+
* @param item The given float.
|
482
|
+
* @return The result from querying the filter with the given item.
|
483
|
+
*/
|
484
|
+
bool query(float item) const;
|
485
|
+
|
486
|
+
/**
|
487
|
+
* Queries the filter with the given data array
|
488
|
+
* (length_bytes bytes starting at data) and returns whether the value
|
489
|
+
* might have been seen previously. The filter's expected False Positive Probability
|
490
|
+
* determines the chances of a true result being a false positive. False negatives are
|
491
|
+
* never possible.
|
492
|
+
* @param data The given array.
|
493
|
+
* @param length_bytes The array length in bytes.
|
494
|
+
* @return The result from querying the filter with the given item.
|
495
|
+
*/
|
496
|
+
bool query(const void* data, size_t length_bytes) const;
|
497
|
+
|
498
|
+
// OTHER OPERATIONS
|
499
|
+
|
500
|
+
/**
|
501
|
+
* Unions two Bloom Filters by applying a logical OR. The result will recognize
|
502
|
+
* any values seen by either filter (as well as false positives).
|
503
|
+
* @param other A BloomFilter to union with this one
|
504
|
+
*/
|
505
|
+
void union_with(const bloom_filter_alloc& other);
|
506
|
+
|
507
|
+
/**
|
508
|
+
* Intersects two Bloom Filters by applying a logical AND. The result will recognize
|
509
|
+
* only values seen by both filters (as well as false positives).
|
510
|
+
* @param other A Bloom Filter to intersect with this one
|
511
|
+
*/
|
512
|
+
void intersect(const bloom_filter_alloc& other);
|
513
|
+
|
514
|
+
/**
|
515
|
+
* Inverts all the bits of the BloomFilter. Approximately inverts the notion of set-membership.
|
516
|
+
*/
|
517
|
+
void invert();
|
518
|
+
|
519
|
+
/**
|
520
|
+
* Helps identify if two Bloom Filters may be unioned or intersected.
|
521
|
+
* @param other A Bloom Filter to check for compatibility with this one
|
522
|
+
* @return True if the filters are compatible, otherwise false
|
523
|
+
*/
|
524
|
+
bool is_compatible(const bloom_filter_alloc& other) const;
|
525
|
+
|
526
|
+
/**
|
527
|
+
* @brief Checks if the Bloom Filter is read-only.
|
528
|
+
*
|
529
|
+
* @return True if the filter is read-only, otherwise false.
|
530
|
+
*/
|
531
|
+
bool is_read_only() const;
|
532
|
+
|
533
|
+
/**
|
534
|
+
* @brief Returns whether the filter owns its underlying memory
|
535
|
+
* @return True if the filter owns its memory, otherwise false
|
536
|
+
*/
|
537
|
+
bool is_memory_owned() const;
|
538
|
+
|
539
|
+
/**
|
540
|
+
* @brief Checks if the Bloom Filter was created by a call to wrap().
|
541
|
+
*
|
542
|
+
* @return True if the filter was created by wrapping memory, otherwise false.
|
543
|
+
*/
|
544
|
+
bool is_wrapped() const;
|
545
|
+
|
546
|
+
/**
|
547
|
+
* @brief Returns a pointer to the memory this filter wraps, if it exists.
|
548
|
+
* @return A pointer to the wrapped memory, or nullptr if is_wrapped() is false.
|
549
|
+
*/
|
550
|
+
const uint8_t* get_wrapped_memory() const;
|
551
|
+
|
552
|
+
/**
|
553
|
+
* @brief Gets the serialized size of the Bloom Filter in bytes
|
554
|
+
* @return The serialized size of the Bloom Filter in bytes
|
555
|
+
*/
|
556
|
+
size_t get_serialized_size_bytes() const;
|
557
|
+
|
558
|
+
/**
|
559
|
+
* @brief Gets the serialized size of the Bloom Filter with the given number of bits, in bytes
|
560
|
+
* @param num_bits The number of bits in the Bloom Filter for the size calculation
|
561
|
+
* @return The serialized size of a Bloom Filter with a capacity of num_bits, in bytes
|
562
|
+
*/
|
563
|
+
static size_t get_serialized_size_bytes(uint64_t num_bits);
|
564
|
+
|
565
|
+
/**
|
566
|
+
* @brief Returns a human-readable string representation of the Bloom Filter.
|
567
|
+
* @param print_filter If true, the filter bits will be printed as well.
|
568
|
+
* @return A human-readable string representation of the Bloom Filter.
|
569
|
+
*/
|
570
|
+
string<Allocator> to_string(bool print_filter = false) const;
|
571
|
+
|
572
|
+
private:
|
573
|
+
using A = Allocator;
|
574
|
+
using AllocUint8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
575
|
+
|
576
|
+
static const uint64_t DIRTY_BITS_VALUE = static_cast<uint64_t>(-1LL);
|
577
|
+
static const uint64_t MAX_HEADER_SIZE_BYTES = 32; // 4 Java Longs
|
578
|
+
static const uint64_t BIT_ARRAY_LENGTH_OFFSET_BYTES = 16;
|
579
|
+
static const uint64_t NUM_BITS_SET_OFFSET_BYTES = 24;
|
580
|
+
static const uint64_t BIT_ARRAY_OFFSET_BYTES = 32;
|
581
|
+
static const uint64_t MAX_FILTER_SIZE_BITS = (INT32_MAX - MAX_HEADER_SIZE_BYTES) * sizeof(uint64_t);
|
582
|
+
|
583
|
+
static const uint8_t PREAMBLE_LONGS_EMPTY = 3;
|
584
|
+
static const uint8_t PREAMBLE_LONGS_STANDARD = 4;
|
585
|
+
static const uint8_t FAMILY_ID = 21;
|
586
|
+
static const uint8_t SER_VER = 1;
|
587
|
+
static const uint8_t EMPTY_FLAG_MASK = 4;
|
588
|
+
|
589
|
+
// used by builder methods
|
590
|
+
bloom_filter_alloc(uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator);
|
591
|
+
bloom_filter_alloc(uint8_t* memory, size_t length_bytes, uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator);
|
592
|
+
|
593
|
+
// used by deserialize and wrap
|
594
|
+
bloom_filter_alloc(uint64_t seed,
|
595
|
+
uint16_t num_hashes,
|
596
|
+
bool is_dirty,
|
597
|
+
bool is_owned,
|
598
|
+
bool is_read_only,
|
599
|
+
uint64_t capacity_bits,
|
600
|
+
uint64_t num_bits_set,
|
601
|
+
uint8_t* bit_array,
|
602
|
+
uint8_t* memory,
|
603
|
+
const A& allocator);
|
604
|
+
|
605
|
+
static bloom_filter_alloc internal_deserialize_or_wrap(void* bytes,
|
606
|
+
size_t length_bytes,
|
607
|
+
bool read_only,
|
608
|
+
bool wrap,
|
609
|
+
const A& allocator);
|
610
|
+
|
611
|
+
// internal query/update methods
|
612
|
+
void internal_update(uint64_t h0, uint64_t h1);
|
613
|
+
bool internal_query_and_update(uint64_t h0, uint64_t h1);
|
614
|
+
bool internal_query(uint64_t h0, uint64_t h1) const;
|
615
|
+
|
616
|
+
void update_num_bits_set(uint64_t num_bits_set);
|
617
|
+
|
618
|
+
Allocator allocator_;
|
619
|
+
uint64_t seed_;
|
620
|
+
uint16_t num_hashes_;
|
621
|
+
bool is_dirty_;
|
622
|
+
bool is_owned_; // if false, data is not owned by filter AND memory_ holds the entire filter not just the bit array (NOTE(review): original comment said "if true"; confirm against the implementation)
|
623
|
+
bool is_read_only_; // if true, filter is read-only
|
624
|
+
uint64_t capacity_bits_;
|
625
|
+
uint64_t num_bits_set_;
|
626
|
+
uint8_t* bit_array_; // data backing bit_array_, regardless of ownership
|
627
|
+
uint8_t* memory_; // if wrapped, pointer to the start of the filter, otherwise nullptr
|
628
|
+
};
|
629
|
+
|
630
|
+
/**
|
631
|
+
* <p>This class provides methods to help estimate the correct parameters when
|
632
|
+
* creating a Bloom filter, and methods to create the filter using those values.</p>
|
633
|
+
*
|
634
|
+
* <p>The underlying math is described in the
|
635
|
+
* <a href='https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions'>
|
636
|
+
* Wikipedia article on Bloom filters</a>.</p>
|
637
|
+
*/
|
638
|
+
template<typename Allocator>
|
639
|
+
class bloom_filter_alloc<Allocator>::builder {
|
640
|
+
public:
|
641
|
+
/**
|
642
|
+
* Returns the optimal number of hash functions for a given target number of distinct items
|
643
|
+
* and the Bloom filter size in bits. This function will provide a result even if the input
|
644
|
+
* values exceed the capacity of a single Bloom filter.
|
645
|
+
* @param max_distinct_items The maximum expected number of distinct items to add to the filter
|
646
|
+
* @param num_filter_bits The intended size of the Bloom Filter in bits
|
647
|
+
* @return The suggested number of hash functions to use with the filter
|
648
|
+
*/
|
649
|
+
static uint16_t suggest_num_hashes(uint64_t max_distinct_items, uint64_t num_filter_bits);
|
650
|
+
|
651
|
+
/**
|
652
|
+
* Returns the optimal number of hash functions to achieve a target false positive probability.
|
653
|
+
* @param target_false_positive_prob A desired false positive probability per item
|
654
|
+
* @return The suggested number of hash functions to use with the filter.
|
655
|
+
*/
|
656
|
+
static uint16_t suggest_num_hashes(double target_false_positive_prob);
|
657
|
+
|
658
|
+
/**
|
659
|
+
* Returns the optimal number of bits to use in a Bloom filter given a target number of distinct
|
660
|
+
* items and a target false positive probability.
|
661
|
+
* @param max_distinct_items The maximum expected number of distinct items to add to the filter
|
662
|
+
* @param target_false_positive_prob A desired false positive probability per item
|
663
|
+
* @return The suggested number of bits to use with the filter
|
664
|
+
*/
|
665
|
+
static uint64_t suggest_num_filter_bits(uint64_t max_distinct_items, double target_false_positive_prob);
|
666
|
+
|
667
|
+
/**
|
668
|
+
* Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs,
|
669
|
+
* using the provided base seed for the hash function (random by default).
|
670
|
+
* @param max_distinct_items The maximum expected number of distinct items to add to the filter
|
671
|
+
* @param target_false_positive_prob A desired false positive probability per item
|
672
|
+
* @param seed A base hash seed (default: random)
|
673
|
+
* @param allocator The allocator to use for the filter (default: standard allocator)
|
674
|
+
* @return A new Bloom filter configured for the given input parameters
|
675
|
+
*/
|
676
|
+
static bloom_filter_alloc<Allocator> create_by_accuracy(uint64_t max_distinct_items,
|
677
|
+
double target_false_positive_prob,
|
678
|
+
uint64_t seed = generate_random_seed(),
|
679
|
+
const Allocator& allocator = Allocator());
|
680
|
+
|
681
|
+
/**
|
682
|
+
* Creates a Bloom filter with given number of bits and number of hash functions,
|
683
|
+
* using the provided base seed for the hash function.
|
684
|
+
*
|
685
|
+
* @param num_bits The size of the BloomFilter, in bits
|
686
|
+
* @param num_hashes The number of hash functions to apply to items
|
687
|
+
* @param seed A base hash seed (default: random)
|
688
|
+
* @param allocator The allocator to use for the filter (default: standard allocator)
|
689
|
+
* @return A new Bloom filter configured for the given input parameters
|
690
|
+
*/
|
691
|
+
static bloom_filter_alloc<Allocator> create_by_size(uint64_t num_bits,
|
692
|
+
uint16_t num_hashes,
|
693
|
+
uint64_t seed = generate_random_seed(),
|
694
|
+
const Allocator& allocator = Allocator());
|
695
|
+
|
696
|
+
/**
|
697
|
+
* Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs,
|
698
|
+
* using the provided base seed (random by default) for the hash function and writing into the provided memory. The filter does
|
699
|
+
* not take ownership of the memory but does overwrite the full contents.
|
700
|
+
*
|
701
|
+
* @param memory A pointer to the memory to use for the filter
|
702
|
+
* @param length_bytes The length of the memory in bytes
|
703
|
+
* @param max_distinct_items The maximum expected number of distinct items to add to the filter
|
704
|
+
* @param target_false_positive_prob A desired false positive probability per item
|
705
|
+
* @param seed A base hash seed (default: random)
|
706
|
+
* @param allocator The allocator to use for the filter (default: standard allocator)
|
707
|
+
* @return A new Bloom filter configured for the given input parameters in the provided memory
|
708
|
+
*/
|
709
|
+
static bloom_filter_alloc<Allocator> initialize_by_accuracy(void* memory,
|
710
|
+
size_t length_bytes,
|
711
|
+
uint64_t max_distinct_items,
|
712
|
+
double target_false_positive_prob,
|
713
|
+
uint64_t seed = generate_random_seed(),
|
714
|
+
const Allocator& allocator = Allocator());
|
715
|
+
|
716
|
+
/**
|
717
|
+
* Initializes a Bloom filter with given number of bits and number of hash functions,
|
718
|
+
* using the provided base seed for the hash function and writing into the provided memory. The filter does
|
719
|
+
* not take ownership of the memory but does overwrite the full contents.
|
720
|
+
*
|
721
|
+
* @param memory A pointer to the memory to use for the filter
|
722
|
+
* @param length_bytes The length of the memory in bytes
|
723
|
+
* @param num_bits The size of the BloomFilter, in bits
|
724
|
+
* @param num_hashes The number of hash functions to apply to items
|
725
|
+
* @param seed A base hash seed (default: random)
|
726
|
+
* @param allocator The allocator to use for the filter (default: standard allocator)
|
727
|
+
* @return A new Bloom filter configured for the given input parameters in the provided memory
|
728
|
+
*/
|
729
|
+
static bloom_filter_alloc<Allocator> initialize_by_size(void* memory,
|
730
|
+
size_t length_bytes,
|
731
|
+
uint64_t num_bits,
|
732
|
+
uint16_t num_hashes,
|
733
|
+
uint64_t seed = generate_random_seed(),
|
734
|
+
const Allocator& allocator = Allocator());
|
735
|
+
|
736
|
+
/**
|
737
|
+
* @brief Generates a random 64-bit seed value
|
738
|
+
*
|
739
|
+
* @return uint64_t a random value over the range of unsigned 64-bit integers
|
740
|
+
*/
|
741
|
+
static uint64_t generate_random_seed();
|
742
|
+
|
743
|
+
private:
|
744
|
+
static void validate_size_inputs(uint64_t num_bits, uint16_t num_hashes);
|
745
|
+
static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob);
|
746
|
+
};
|
747
|
+
|
748
|
+
} // namespace datasketches
|
749
|
+
|
750
|
+
#include "bloom_filter_builder_impl.hpp"
|
751
|
+
#include "bloom_filter_impl.hpp"
|
752
|
+
|
753
|
+
#endif // _BLOOM_FILTER_HPP_
|