datasketches 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +18 -10
@@ -0,0 +1,753 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+
7
+ * to you under the Apache License, Version 2.0 (the
8
+ * "License"); you may not use this file except in compliance
9
+ * with the License. You may obtain a copy of the License at
10
+ *
11
+ * http://www.apache.org/licenses/LICENSE-2.0
12
+ *
13
+ * Unless required by applicable law or agreed to in writing,
14
+ * software distributed under the License is distributed on an
15
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16
+ * KIND, either express or implied. See the License for the
17
+ * specific language governing permissions and limitations
18
+ * under the License.
19
+ */
20
+
21
+ #ifndef _BLOOM_FILTER_HPP_
22
+ #define _BLOOM_FILTER_HPP_
23
+
24
+ #include <cstdint>
25
+ #include <memory>
26
+ #include <vector>
27
+
28
+ #include "common_defs.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ // forward declarations
33
+ template<typename A> class bloom_filter_alloc;
34
+
35
+ // aliases with default allocator
36
+ using bloom_filter = bloom_filter_alloc<std::allocator<uint8_t>>;
37
+
38
+ /**
39
+ * <p>A Bloom filter is a data structure that can be used for probabilistic
40
+ * set membership.</p>
41
+ *
42
+ * <p>When querying a Bloom filter, there are no false negatives. Specifically:
43
+ * When querying an item that has already been inserted to the filter, the filter will
44
+ * always indicate that the item is present. There is a chance of false positives, where
45
+ * querying an item that has never been presented to the filter will indicate that the
46
+ * item has already been seen. Consequently, any query should be interpreted as
47
+ * "might have seen."</p>
48
+ *
49
+ * <p>A standard Bloom filter is unlike typical sketches in that it is not sub-linear
50
+ * in size and does not resize itself. A Bloom filter will work up to a target number of
51
+ * distinct items, beyond which it will saturate and the false positive rate will start to
52
+ * increase. The size of a Bloom filter will be linear in the expected number of
53
+ * distinct items.</p>
54
+ *
55
+ * <p>See the bloom_filter_alloc::builder class for methods to create a filter, especially
56
+ * one sized correctly for a target number of distinct elements and a target
57
+ * false positive probability.</p>
58
+ *
59
+ * <p>This implementation uses xxHash64 and follows the approach in Kirsch and Mitzenmacher,
60
+ * "Less Hashing, Same Performance: Building a Better Bloom Filter," Wiley Interscience, 2008, pp. 187-218.</p>
61
+ */
62
+
63
+ template<typename Allocator = std::allocator<uint8_t>>
64
+ class bloom_filter_alloc {
65
+ public:
66
+
67
+ // no public constructor; use builder or deserialize/wrap methods
68
+ class builder;
69
+
70
+ /**
71
+ * This method deserializes a Bloom filter from a given array of bytes.
72
+ * @param bytes pointer to the array of bytes
73
+ * @param length_bytes the size of the array in bytes
74
+ * @param allocator instance of an Allocator
75
+ * @return an instance of a Bloom filter
76
+ */
77
+ static bloom_filter_alloc deserialize(const void* bytes, size_t length_bytes, const Allocator& allocator = Allocator());
78
+
79
+ /**
80
+ * This method deserializes a Bloom filter from a given stream.
81
+ * @param is input stream
82
+ * @param allocator instance of an Allocator
83
+ * @return an instance of a Bloom filter
84
+ */
85
+ static bloom_filter_alloc deserialize(std::istream& is, const Allocator& allocator = Allocator());
86
+
87
+ /**
88
+ * @brief Wraps the provided memory as a read-only Bloom filter. Reads the data in-place and does
89
+ * not take ownership of the underlying memory. Does not allow modifying the filter.
90
+ *
91
+ * @param data The memory to wrap
92
+ * @param length_bytes The length of the memory in bytes
93
+ * @param allocator instance of an Allocator
94
+ * @return a const (read-only) Bloom filter wrapping the provided memory
95
+ */
96
+ static const bloom_filter_alloc wrap(const void* data, size_t length_bytes, const Allocator& allocator = Allocator());
97
+
98
+ /**
99
+ * @brief Wraps the provided memory as a writable Bloom filter. Reads the data in-place and does
100
+ * not take ownership of the underlying memory. Allows modifying the filter.
101
+ *
102
+ * @param data the memory to wrap
103
+ * @param length_bytes the length of the memory in bytes
104
+ * @param allocator instance of an Allocator
105
+ * @return a Bloom filter wrapping the provided memory
106
+ */
107
+ static bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, const Allocator& allocator = Allocator());
108
+
109
+ /**
110
+ * Copy constructor
111
+ * @param other filter to be copied
112
+ */
113
+ bloom_filter_alloc(const bloom_filter_alloc& other);
114
+
115
+ /** Move constructor
116
+ * @param other filter to be moved
117
+ */
118
+ bloom_filter_alloc(bloom_filter_alloc&& other) noexcept;
119
+
120
+ /**
121
+ * Copy assignment
122
+ * @param other filter to be copied
123
+ * @return reference to this filter
124
+ */
125
+ bloom_filter_alloc& operator=(const bloom_filter_alloc& other);
126
+
127
+ /**
128
+ * Move assignment
129
+ * @param other filter to be moved
130
+ * @return reference to this filter
131
+ */
132
+ bloom_filter_alloc& operator=(bloom_filter_alloc&& other);
133
+
134
+ /**
135
+ * @brief Destroy the bloom filter object
136
+ */
137
+ ~bloom_filter_alloc();
138
+
139
+ // This is a convenience alias for users
140
+ // The type returned by the following serialize method
141
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
142
+
143
+ /**
144
+ * This method serializes the filter as a vector of bytes.
145
+ * An optional header can be reserved in front of the filter.
146
+ * It is a blank space of a given size.
147
+ * Some integrations such as PostgreSQL may need this header space.
148
+ * @param header_size_bytes space to reserve in front of the filter
149
+ * @return serialized filter as a vector of bytes
150
+ */
151
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
152
+
153
+ /**
154
+ * This method serializes the filter into a given stream in a binary form
155
+ * @param os output stream
156
+ */
157
+ void serialize(std::ostream& os) const;
158
+
159
+ /**
160
+ * Checks if the Bloom Filter has processed any items
161
+ * @return True if the BloomFilter is empty, otherwise False
162
+ */
163
+ bool is_empty() const;
164
+
165
+ /**
166
+ * Returns the number of bits in the Bloom Filter that are set to 1.
167
+ * @return The number of bits in use in this filter
168
+ */
169
+ uint64_t get_bits_used();
170
+
171
+ /**
172
+ * Returns the total number of bits in the Bloom Filter.
173
+ * @return The total size of the Bloom Filter
174
+ */
175
+ uint64_t get_capacity() const;
176
+
177
+ /**
178
+ * Returns the configured number of hash functions for this Bloom Filter
179
+ * @return The number of hash functions to apply to inputs
180
+ */
181
+ uint16_t get_num_hashes() const;
182
+
183
+ /**
184
+ * Returns the hash seed for this Bloom Filter.
185
+ * @return The hash seed for this filter
186
+ */
187
+ uint64_t get_seed() const;
188
+
189
+ /**
190
+ * Resets the Bloom Filter to its original state.
191
+ */
192
+ void reset();
193
+
194
+ // UPDATE METHODS
195
+
196
+ /**
197
+ * Updates the filter with the given std::string.
198
+ * The string is converted to a byte array using UTF8 encoding.
199
+ * If the string is empty, no update attempt is made and the method returns.
200
+ * @param item The given string.
201
+ */
202
+ void update(const std::string& item);
203
+
204
+ /**
205
+ * Updates the filter with the given unsigned 64-bit integer.
206
+ * @param item The given integer.
207
+ */
208
+ void update(uint64_t item);
209
+
210
+ /**
211
+ * Updates the filter with the given unsigned 32-bit integer.
212
+ * @param item The given integer.
213
+ */
214
+ void update(uint32_t item);
215
+
216
+ /**
217
+ * Updates the filter with the given unsigned 16-bit integer.
218
+ * @param item The given integer.
219
+ */
220
+ void update(uint16_t item);
221
+
222
+ /**
223
+ * Updates the filter with the given unsigned 8-bit integer.
224
+ * @param item The given integer.
225
+ */
226
+ void update(uint8_t item);
227
+
228
+ /**
229
+ * Updates the filter with the given signed 64-bit integer.
230
+ * @param item The given integer.
231
+ */
232
+ void update(int64_t item);
233
+
234
+ /**
235
+ * Updates the filter with the given signed 32-bit integer.
236
+ * @param item The given integer.
237
+ */
238
+ void update(int32_t item);
239
+
240
+ /**
241
+ * Updates the filter with the given signed 16-bit integer.
242
+ * @param item The given integer.
243
+ */
244
+ void update(int16_t item);
245
+
246
+ /**
247
+ * Updates the filter with the given signed 8-bit integer.
248
+ * @param item The given integer.
249
+ */
250
+ void update(int8_t item);
251
+
252
+ /**
253
+ * Updates the filter with the given 64-bit floating point value.
254
+ * @param item The given double.
255
+ */
256
+ void update(double item);
257
+
258
+ /**
259
+ * Updates the filter with the given 32-bit floating point value.
260
+ * @param item The given float.
261
+ */
262
+ void update(float item);
263
+
264
+ /**
265
+ * Updates the filter with the given data array.
266
+ * @param data The given array.
267
+ * @param length_bytes The array length in bytes.
268
+ */
269
+ void update(const void* data, size_t length_bytes);
270
+
271
+ // QUERY-AND-UPDATE METHODS
272
+
273
+ /**
274
+ * Updates the filter with the given std::string and returns the result from
275
+ * querying the filter prior to the update.
276
+ * The string is converted to a byte array using UTF8 encoding.
277
+ * If the string is empty, no update attempt is made and the method returns false.
278
+ * @param item The given string.
279
+ * @return The result from querying the filter prior to the update.
280
+ */
281
+ bool query_and_update(const std::string& item);
282
+
283
+ /**
284
+ * Updates the filter with the given unsigned 64-bit integer and returns the result from
285
+ * querying the filter prior to the update.
286
+ * @param item The given integer.
287
+ * @return The result from querying the filter prior to the update.
288
+ */
289
+ bool query_and_update(uint64_t item);
290
+
291
+ /**
292
+ * Updates the filter with the given unsigned 32-bit integer and returns the result from
293
+ * querying the filter prior to the update.
294
+ * @param item The given integer.
295
+ * @return The result from querying the filter prior to the update.
296
+ */
297
+ bool query_and_update(uint32_t item);
298
+
299
+ /**
300
+ * Updates the filter with the given unsigned 16-bit integer and returns the result from
301
+ * querying the filter prior to the update.
302
+ * @param item The given integer.
303
+ * @return The result from querying the filter prior to the update.
304
+ */
305
+ bool query_and_update(uint16_t item);
306
+
307
+ /**
308
+ * Updates the filter with the given unsigned 8-bit integer and returns the result from
309
+ * querying the filter prior to the update.
310
+ * @param item The given integer.
311
+ * @return The result from querying the filter prior to the update.
312
+ */
313
+ bool query_and_update(uint8_t item);
314
+
315
+ /**
316
+ * Updates the filter with the given signed 64-bit integer and returns the result from
317
+ * querying the filter prior to the update.
318
+ * @param item The given integer.
319
+ * @return The result from querying the filter prior to the update.
320
+ */
321
+ bool query_and_update(int64_t item);
322
+
323
+ /**
324
+ * Updates the filter with the given signed 32-bit integer and returns the result from
325
+ * querying the filter prior to the update.
326
+ * @param item The given integer.
327
+ * @return The result from querying the filter prior to the update.
328
+ */
329
+ bool query_and_update(int32_t item);
330
+
331
+ /**
332
+ * Updates the filter with the given signed 16-bit integer and returns the result from
333
+ * querying the filter prior to the update.
334
+ * @param item The given integer.
335
+ * @return The result from querying the filter prior to the update.
336
+ */
337
+ bool query_and_update(int16_t item);
338
+
339
+ /**
340
+ * Updates the filter with the given signed 8-bit integer and returns the result from
341
+ * querying the filter prior to the update.
342
+ * @param item The given integer.
343
+ * @return The result from querying the filter prior to the update.
344
+ */
345
+ bool query_and_update(int8_t item);
346
+
347
+ /**
348
+ * Updates the filter with the given 64-bit floating point value and returns the result from
349
+ * querying the filter prior to the update.
350
+ * @param item The given double.
351
+ * @return The result from querying the filter prior to the update.
352
+ */
353
+ bool query_and_update(double item);
354
+
355
+ /**
356
+ * Updates the filter with the given 32-bit floating point value and returns the result from
357
+ * querying the filter prior to the update.
358
+ * @param item The given float.
359
+ * @return The result from querying the filter prior to the update.
360
+ */
361
+ bool query_and_update(float item);
362
+
363
+ /**
364
+ * Updates the filter with the given data array and returns the result from
365
+ * querying the filter prior to the update.
366
+ * @param data The given array.
367
+ * @param length_bytes The array length in bytes.
368
+ * @return The result from querying the filter prior to the update.
369
+ */
370
+ bool query_and_update(const void* data, size_t length_bytes);
371
+
372
+ // QUERY METHODS
373
+
374
+ /**
375
+ * Queries the filter with the given std::string and returns whether the value
376
+ * might have been seen previously. The filter's expected False Positive Probability
377
+ * determines the chances of a true result being a false positive. False negatives are
378
+ * never possible.
379
+ * The string is converted to a byte array using UTF8 encoding.
380
+ * If the string is empty, the method always returns false.
381
+ * @param item The given string.
382
+ * @return The result from querying the filter with the given item.
383
+ */
384
+ bool query(const std::string& item) const;
385
+
386
+ /**
387
+ * Queries the filter with the given unsigned 64-bit integer and returns whether the value
388
+ * might have been seen previously. The filter's expected False Positive Probability
389
+ * determines the chances of a true result being a false positive. False negatives are
390
+ * never possible.
391
+ * @param item The given integer.
392
+ * @return The result from querying the filter with the given item.
393
+ */
394
+ bool query(uint64_t item) const;
395
+
396
+ /**
397
+ * Queries the filter with the given unsigned 32-bit integer and returns whether the value
398
+ * might have been seen previously. The filter's expected False Positive Probability
399
+ * determines the chances of a true result being a false positive. False negatives are
400
+ * never possible.
401
+ * @param item The given integer.
402
+ * @return The result from querying the filter with the given item.
403
+ */
404
+ bool query(uint32_t item) const;
405
+
406
+ /**
407
+ * Queries the filter with the given unsigned 16-bit integer and returns whether the value
408
+ * might have been seen previously. The filter's expected False Positive Probability
409
+ * determines the chances of a true result being a false positive. False negatives are
410
+ * never possible.
411
+ * @param item The given integer.
412
+ * @return The result from querying the filter with the given item.
413
+ */
414
+ bool query(uint16_t item) const;
415
+
416
+ /**
417
+ * Queries the filter with the given unsigned 8-bit integer and returns whether the value
418
+ * might have been seen previously. The filter's expected False Positive Probability
419
+ * determines the chances of a true result being a false positive. False negatives are
420
+ * never possible.
421
+ * @param item The given integer.
422
+ * @return The result from querying the filter with the given item.
423
+ */
424
+ bool query(uint8_t item) const;
425
+
426
+ /**
427
+ * Queries the filter with the given signed 64-bit integer and returns whether the value
428
+ * might have been seen previously. The filter's expected False Positive Probability
429
+ * determines the chances of a true result being a false positive. False negatives are
430
+ * never possible.
431
+ * @param item The given integer.
432
+ * @return The result from querying the filter with the given item.
433
+ */
434
+ bool query(int64_t item) const;
435
+
436
+ /**
437
+ * Queries the filter with the given signed 32-bit integer and returns whether the value
438
+ * might have been seen previously. The filter's expected False Positive Probability
439
+ * determines the chances of a true result being a false positive. False negatives are
440
+ * never possible.
441
+ * @param item The given integer.
442
+ * @return The result from querying the filter with the given item.
443
+ */
444
+ bool query(int32_t item) const;
445
+
446
+ /**
447
+ * Queries the filter with the given signed 16-bit integer and returns whether the value
448
+ * might have been seen previously. The filter's expected False Positive Probability
449
+ * determines the chances of a true result being a false positive. False negatives are
450
+ * never possible.
451
+ * @param item The given integer.
452
+ * @return The result from querying the filter with the given item.
453
+ */
454
+ bool query(int16_t item) const;
455
+
456
+ /**
457
+ * Queries the filter with the given signed 8-bit integer and returns whether the value
458
+ * might have been seen previously. The filter's expected False Positive Probability
459
+ * determines the chances of a true result being a false positive. False negatives are
460
+ * never possible.
461
+ * @param item The given integer.
462
+ * @return The result from querying the filter with the given item.
463
+ */
464
+ bool query(int8_t item) const;
465
+
466
+ /**
467
+ * Queries the filter with the given 64-bit floating point value and returns whether the value
468
+ * might have been seen previously. The filter's expected False Positive Probability
469
+ * determines the chances of a true result being a false positive. False negatives are
470
+ * never possible.
471
+ * @param item The given double.
472
+ * @return The result from querying the filter with the given item.
473
+ */
474
+ bool query(double item) const;
475
+
476
+ /**
477
+ * Queries the filter with the given 32-bit floating point value and returns whether the value
478
+ * might have been seen previously. The filter's expected False Positive Probability
479
+ * determines the chances of a true result being a false positive. False negatives are
480
+ * never possible.
481
+ * @param item The given float.
482
+ * @return The result from querying the filter with the given item.
483
+ */
484
+ bool query(float item) const;
485
+
486
+ /**
487
+ * Queries the filter with the given data array and returns whether
488
+ * the data might have been seen previously. The filter's expected
489
+ * False Positive Probability determines the chances of a true
490
+ * result being a false positive. False negatives are never
491
+ * possible.
492
+ * @param data The given array.
493
+ * @param length_bytes The array length in bytes.
494
+ * @return The result from querying the filter with the given item.
495
+ */
496
+ bool query(const void* data, size_t length_bytes) const;
497
+
498
+ // OTHER OPERATIONS
499
+
500
+ /**
501
+ * Unions two Bloom Filters by applying a logical OR. The result will recognize
502
+ * any values seen by either filter (as well as false positives).
503
+ * @param other A BloomFilter to union with this one
504
+ */
505
+ void union_with(const bloom_filter_alloc& other);
506
+
507
+ /**
508
+ * Intersects two Bloom Filters by applying a logical AND. The result will recognize
509
+ * only values seen by both filters (as well as false positives).
510
+ * @param other A Bloom Filter to intersect with this one
511
+ */
512
+ void intersect(const bloom_filter_alloc& other);
513
+
514
+ /**
515
+ * Inverts all the bits of the BloomFilter. Approximately inverts the notion of set-membership.
516
+ */
517
+ void invert();
518
+
519
+ /**
520
+ * Helps identify if two Bloom Filters may be unioned or intersected.
521
+ * @param other A Bloom Filter to check for compatibility with this one
522
+ * @return True if the filters are compatible, otherwise false
523
+ */
524
+ bool is_compatible(const bloom_filter_alloc& other) const;
525
+
526
+ /**
527
+ * @brief Checks if the Bloom Filter is read-only.
528
+ *
529
+ * @return True if the filter is read-only, otherwise false.
530
+ */
531
+ bool is_read_only() const;
532
+
533
+ /**
534
+ * @brief Returns whether the filter owns its underlying memory
535
+ * @return True if the filter owns its memory, otherwise false
536
+ */
537
+ bool is_memory_owned() const;
538
+
539
+ /**
540
+ * @brief Checks if the Bloom Filter was created by a call to wrap().
541
+ *
542
+ * @return True if the filter was created by wrapping memory, otherwise false.
543
+ */
544
+ bool is_wrapped() const;
545
+
546
+ /**
547
+ * @brief Returns a pointer to the memory this filter wraps, if it exists.
548
+ * @return A pointer to the wrapped memory, or nullptr if is_wrapped() is false.
549
+ */
550
+ const uint8_t* get_wrapped_memory() const;
551
+
552
+ /**
553
+ * @brief Gets the serialized size of the Bloom Filter in bytes
554
+ * @return The serialized size of the Bloom Filter in bytes
555
+ */
556
+ size_t get_serialized_size_bytes() const;
557
+
558
+ /**
559
+ * @brief Gets the serialized size of the Bloom Filter with the given number of bits, in bytes
560
+ * @param num_bits The number of bits in the Bloom Filter for the size calculation
561
+ * @return The serialized size of a Bloom Filter with a capacity of num_bits, in bytes
562
+ */
563
+ static size_t get_serialized_size_bytes(uint64_t num_bits);
564
+
565
+ /**
566
+ * @brief Returns a human-readable string representation of the Bloom Filter.
567
+ * @param print_filter If true, the filter bits will be printed as well.
568
+ * @return A human-readable string representation of the Bloom Filter.
569
+ */
570
+ string<Allocator> to_string(bool print_filter = false) const;
571
+
572
+ private:
573
+ using A = Allocator;
574
+ using AllocUint8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
575
+
576
+ static const uint64_t DIRTY_BITS_VALUE = static_cast<uint64_t>(-1LL);
577
+ static const uint64_t MAX_HEADER_SIZE_BYTES = 32; // 4 Java Longs
578
+ static const uint64_t BIT_ARRAY_LENGTH_OFFSET_BYTES = 16;
579
+ static const uint64_t NUM_BITS_SET_OFFSET_BYTES = 24;
580
+ static const uint64_t BIT_ARRAY_OFFSET_BYTES = 32;
581
+ static const uint64_t MAX_FILTER_SIZE_BITS = (INT32_MAX - MAX_HEADER_SIZE_BYTES) * sizeof(uint64_t);
582
+
583
+ static const uint8_t PREAMBLE_LONGS_EMPTY = 3;
584
+ static const uint8_t PREAMBLE_LONGS_STANDARD = 4;
585
+ static const uint8_t FAMILY_ID = 21;
586
+ static const uint8_t SER_VER = 1;
587
+ static const uint8_t EMPTY_FLAG_MASK = 4;
588
+
589
+ // used by builder methods
590
+ bloom_filter_alloc(uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator);
591
+ bloom_filter_alloc(uint8_t* memory, size_t length_bytes, uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator);
592
+
593
+ // used by deserialize and wrap
594
+ bloom_filter_alloc(uint64_t seed,
595
+ uint16_t num_hashes,
596
+ bool is_dirty,
597
+ bool is_owned,
598
+ bool is_read_only,
599
+ uint64_t capacity_bits,
600
+ uint64_t num_bits_set,
601
+ uint8_t* bit_array,
602
+ uint8_t* memory,
603
+ const A& allocator);
604
+
605
+ static bloom_filter_alloc internal_deserialize_or_wrap(void* bytes,
606
+ size_t length_bytes,
607
+ bool read_only,
608
+ bool wrap,
609
+ const A& allocator);
610
+
611
+ // internal query/update methods
612
+ void internal_update(uint64_t h0, uint64_t h1);
613
+ bool internal_query_and_update(uint64_t h0, uint64_t h1);
614
+ bool internal_query(uint64_t h0, uint64_t h1) const;
615
+
616
+ void update_num_bits_set(uint64_t num_bits_set);
617
+
618
+ Allocator allocator_;
619
+ uint64_t seed_;
620
+ uint16_t num_hashes_;
621
+ bool is_dirty_;
622
+ bool is_owned_; // if true, the filter owns its data. NOTE(review): presumably when false (wrapped), memory_ holds the entire filter, not just the bit array -- confirm against the impl
623
+ bool is_read_only_; // if true, filter is read-only
624
+ uint64_t capacity_bits_;
625
+ uint64_t num_bits_set_;
626
+ uint8_t* bit_array_; // data backing bit_array_, regardless of ownership
627
+ uint8_t* memory_; // if wrapped, pointer to the start of the filter, otherwise nullptr
628
+ };
629
+
630
+ /**
631
+ * <p>This class provides methods to help estimate the correct parameters when
632
+ * creating a Bloom filter, and methods to create the filter using those values.</p>
633
+ *
634
+ * <p>The underlying math is described in the
635
+ * <a href='https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions'>
636
+ * Wikipedia article on Bloom filters</a>.</p>
637
+ */
638
+ template<typename Allocator>
639
+ class bloom_filter_alloc<Allocator>::builder {
640
+ public:
641
+ /**
642
+ * Returns the optimal number of hash functions given a target number of distinct items
643
+ * and the Bloom filter size in bits. This function will provide a result even if the input
644
+ * values exceed the capacity of a single Bloom filter.
645
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter
646
+ * @param num_filter_bits The intended size of the Bloom Filter in bits
647
+ * @return The suggested number of hash functions to use with the filter
648
+ */
649
+ static uint16_t suggest_num_hashes(uint64_t max_distinct_items, uint64_t num_filter_bits);
650
+
651
+ /**
652
+ * Returns the optimal number of hash functions to achieve a target false positive probability.
653
+ * @param target_false_positive_prob A desired false positive probability per item
654
+ * @return The suggested number of hash functions to use with the filter.
655
+ */
656
+ static uint16_t suggest_num_hashes(double target_false_positive_prob);
657
+
658
+ /**
659
+ * Returns the optimal number of bits to use in a Bloom filter given a target number of distinct
660
+ * items and a target false positive probability.
661
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter
662
+ * @param target_false_positive_prob A desired false positive probability per item
663
+ * @return The suggested number of bits to use with the filter
664
+ */
665
+ static uint64_t suggest_num_filter_bits(uint64_t max_distinct_items, double target_false_positive_prob);
666
+
667
+ /**
668
+ * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs,
669
+ * using the given base seed for the hash function (random by default).
670
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter
671
+ * @param target_false_positive_prob A desired false positive probability per item
672
+ * @param seed A base hash seed (default: random)
673
+ * @param allocator The allocator to use for the filter (default: standard allocator)
674
+ * @return A new Bloom filter configured for the given input parameters
675
+ */
676
+ static bloom_filter_alloc<Allocator> create_by_accuracy(uint64_t max_distinct_items,
677
+ double target_false_positive_prob,
678
+ uint64_t seed = generate_random_seed(),
679
+ const Allocator& allocator = Allocator());
680
+
681
+ /**
682
+ * Creates a Bloom filter with given number of bits and number of hash functions,
683
+ * using the provided base seed for the hash function.
684
+ *
685
+ * @param num_bits The size of the BloomFilter, in bits
686
+ * @param num_hashes The number of hash functions to apply to items
687
+ * @param seed A base hash seed (default: random)
688
+ * @param allocator The allocator to use for the filter (default: standard allocator)
689
+ * @return A new Bloom filter configured for the given input parameters
690
+ */
691
+ static bloom_filter_alloc<Allocator> create_by_size(uint64_t num_bits,
692
+ uint16_t num_hashes,
693
+ uint64_t seed = generate_random_seed(),
694
+ const Allocator& allocator = Allocator());
695
+
696
+ /**
697
+ * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs,
698
+ * using a random base seed for the hash function and writing into the provided memory. The filter does
699
+ * not take ownership of the memory but does overwrite the full contents.
700
+ *
701
+ * @param memory A pointer to the memory to use for the filter
702
+ * @param length_bytes The length of the memory in bytes
703
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter
704
+ * @param target_false_positive_prob A desired false positive probability per item
705
+ * @param seed A base hash seed (default: random)
706
+ * @param allocator The allocator to use for the filter (default: standard allocator)
707
+ * @return A new Bloom filter configured for the given input parameters in the provided memory
708
+ */
709
+ static bloom_filter_alloc<Allocator> initialize_by_accuracy(void* memory,
710
+ size_t length_bytes,
711
+ uint64_t max_distinct_items,
712
+ double target_false_positive_prob,
713
+ uint64_t seed = generate_random_seed(),
714
+ const Allocator& allocator = Allocator());
715
+
716
+ /**
717
+ * Initializes a Bloom filter with given number of bits and number of hash functions,
718
+ * using the provided base seed for the hash function and writing into the provided memory. The filter does
719
+ * not take ownership of the memory but does overwrite the full contents.
720
+ *
721
+ * @param memory A pointer to the memory to use for the filter
722
+ * @param length_bytes The length of the memory in bytes
723
+ * @param num_bits The size of the BloomFilter, in bits
724
+ * @param num_hashes The number of hash functions to apply to items
725
+ * @param seed A base hash seed (default: random)
726
+ * @param allocator The allocator to use for the filter (default: standard allocator)
727
+ * @return A new BloomFilter configured for the given input parameters
728
+ */
729
+ static bloom_filter_alloc<Allocator> initialize_by_size(void* memory,
730
+ size_t length_bytes,
731
+ uint64_t num_bits,
732
+ uint16_t num_hashes,
733
+ uint64_t seed = generate_random_seed(),
734
+ const Allocator& allocator = Allocator());
735
+
736
+ /**
737
+ * @brief Generates a random 64-bit seed value
738
+ *
739
+ * @return uint64_t a random value over the range of unsigned 64-bit integers
740
+ */
741
+ static uint64_t generate_random_seed();
742
+
743
+ private:
744
+ static void validate_size_inputs(uint64_t num_bits, uint16_t num_hashes);
745
+ static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob);
746
+ };
747
+
748
+ } // namespace datasketches
749
+
750
+ #include "bloom_filter_builder_impl.hpp"
751
+ #include "bloom_filter_impl.hpp"
752
+
753
+ #endif // _BLOOM_FILTER_HPP_ b