datasketches 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +17 -9
@@ -0,0 +1,753 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+
7
+ * to you under the Apache License, Version 2.0 (the
8
+ * "License"); you may not use this file except in compliance
9
+ * with the License. You may obtain a copy of the License at
10
+ *
11
+ * http://www.apache.org/licenses/LICENSE-2.0
12
+ *
13
+ * Unless required by applicable law or agreed to in writing,
14
+ * software distributed under the License is distributed on an
15
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16
+ * KIND, either express or implied. See the License for the
17
+ * specific language governing permissions and limitations
18
+ * under the License.
19
+ */
20
+
21
+ #ifndef _BLOOM_FILTER_HPP_
22
+ #define _BLOOM_FILTER_HPP_
23
+
24
+ #include <cstdint>
25
+ #include <memory>
26
+ #include <vector>
27
+
28
+ #include "common_defs.hpp"
29
+
30
+ namespace datasketches {
31
+
32
+ // forward declarations
33
+ template<typename A> class bloom_filter_alloc;
34
+
35
+ // aliases with default allocator
36
+ using bloom_filter = bloom_filter_alloc<std::allocator<uint8_t>>;
37
+
38
+ /**
39
+ * <p>A Bloom filter is a data structure that can be used for probabilistic
40
+ * set membership.</p>
41
+ *
42
+ * <p>When querying a Bloom filter, there are no false negatives. Specifically:
43
+ * When querying an item that has already been inserted to the filter, the filter will
44
+ * always indicate that the item is present. There is a chance of false positives, where
45
+ * querying an item that has never been presented to the filter will indicate that the
46
+ * item has already been seen. Consequently, any query should be interpreted as
47
+ * "might have seen."</p>
48
+ *
49
+ * <p>A standard Bloom filter is unlike typical sketches in that it is not sub-linear
50
+ * in size and does not resize itself. A Bloom filter will work up to a target number of
51
+ * distinct items, beyond which it will saturate and the false positive rate will start to
52
+ * increase. The size of a Bloom filter will be linear in the expected number of
53
+ * distinct items.</p>
54
+ *
55
+ * <p>See the nested bloom_filter_alloc::builder class for methods to create a filter, especially
56
+ * one sized correctly for a target number of distinct elements and a target
57
+ * false positive probability.</p>
58
+ *
59
+ * <p>This implementation uses xxHash64 and follows the approach in Kirsch and Mitzenmacher,
60
+ * "Less Hashing, Same Performance: Building a Better Bloom Filter," Wiley Interscience, 2008, pp. 187-218.</p>
61
+ */
62
+
63
+ template<typename Allocator = std::allocator<uint8_t>>
64
+ class bloom_filter_alloc {
65
+ public:
66
+
67
+ // no public constructor; use builder or deserialize/wrap methods
68
+ class builder;
69
+
70
+ /**
71
+ * This method deserializes a Bloom filter from a given array of bytes.
72
+ * @param bytes pointer to the array of bytes
73
+ * @param length_bytes the size of the array in bytes
74
+ * @param allocator instance of an Allocator
75
+ * @return an instance of a Bloom filter
76
+ */
77
+ static bloom_filter_alloc deserialize(const void* bytes, size_t length_bytes, const Allocator& allocator = Allocator());
78
+
79
+ /**
80
+ * This method deserializes a Bloom filter from a given stream.
81
+ * @param is input stream
82
+ * @param allocator instance of an Allocator
83
+ * @return an instance of a Bloom filter
84
+ */
85
+ static bloom_filter_alloc deserialize(std::istream& is, const Allocator& allocator = Allocator());
86
+
87
+ /**
88
+ * @brief Wraps the provided memory as a read-only Bloom filter. Reads the data in-place and does
89
+ * not take ownership of the underlying memory. Does not allow modifying the filter.
90
+ *
91
+ * @param data The memory to wrap
92
+ * @param length_bytes The length of the memory in bytes
93
+ * @param allocator instance of an Allocator
94
+ * @return a const (read-only) Bloom filter wrapping the provided memory
95
+ */
96
+ static const bloom_filter_alloc wrap(const void* data, size_t length_bytes, const Allocator& allocator = Allocator());
97
+
98
+ /**
99
+ * @brief Wraps the provided memory as a writable Bloom filter. Reads the data in-place and does
100
+ * not take ownership of the underlying memory. Allows modifying the filter.
101
+ *
102
+ * @param data the memory to wrap
103
+ * @param length_bytes the length of the memory in bytes
104
+ * @param allocator instance of an Allocator
105
+ * @return a Bloom filter wrapping the provided memory
106
+ */
107
+ static bloom_filter_alloc writable_wrap(void* data, size_t length_bytes, const Allocator& allocator = Allocator());
108
+
109
+ /**
110
+ * Copy constructor
111
+ * @param other filter to be copied
112
+ */
113
+ bloom_filter_alloc(const bloom_filter_alloc& other);
114
+
115
+ /** Move constructor
116
+ * @param other filter to be moved
117
+ */
118
+ bloom_filter_alloc(bloom_filter_alloc&& other) noexcept;
119
+
120
+ /**
121
+ * Copy assignment
122
+ * @param other filter to be copied
123
+ * @return reference to this filter
124
+ */
125
+ bloom_filter_alloc& operator=(const bloom_filter_alloc& other);
126
+
127
+ /**
128
+ * Move assignment
129
+ * @param other filter to be moved
130
+ * @return reference to this filter
131
+ */
132
+ bloom_filter_alloc& operator=(bloom_filter_alloc&& other);
133
+
134
+ /**
135
+ * @brief Destroy the bloom filter object
136
+ */
137
+ ~bloom_filter_alloc();
138
+
139
+ // This is a convenience alias for users
140
+ // The type returned by the following serialize method
141
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
142
+
143
+ /**
144
+ * This method serializes the filter as a vector of bytes.
145
+ * An optional header can be reserved in front of the filter.
146
+ * It is a blank space of a given size.
147
+ * Some integrations such as PostgreSQL may need this header space.
148
+ * @param header_size_bytes space to reserve in front of the filter
149
+ * @return serialized filter as a vector of bytes
150
+ */
151
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
152
+
153
+ /**
154
+ * This method serializes the filter into a given stream in a binary form
155
+ * @param os output stream
156
+ */
157
+ void serialize(std::ostream& os) const;
158
+
159
+ /**
160
+ * Checks if the Bloom Filter has processed any items
161
+ * @return True if the BloomFilter is empty, otherwise False
162
+ */
163
+ bool is_empty() const;
164
+
165
+ /**
166
+ * Returns the number of bits in the Bloom Filter that are set to 1.
167
+ * @return The number of bits in use in this filter
168
+ */
169
+ uint64_t get_bits_used();
170
+
171
+ /**
172
+ * Returns the total number of bits in the Bloom Filter.
173
+ * @return The total size of the Bloom Filter
174
+ */
175
+ uint64_t get_capacity() const;
176
+
177
+ /**
178
+ * Returns the configured number of hash functions for this Bloom Filter
179
+ * @return The number of hash functions to apply to inputs
180
+ */
181
+ uint16_t get_num_hashes() const;
182
+
183
+ /**
184
+ * Returns the hash seed for this Bloom Filter.
185
+ * @return The hash seed for this filter
186
+ */
187
+ uint64_t get_seed() const;
188
+
189
+ /**
190
+ * Resets the Bloom Filter to its original state.
191
+ */
192
+ void reset();
193
+
194
+ // UPDATE METHODS
195
+
196
+ /**
197
+ * Updates the filter with the given std::string.
198
+ * The string is converted to a byte array using UTF8 encoding.
199
+ * If the string is null or empty no update attempt is made and the method returns.
200
+ * @param item The given string.
201
+ */
202
+ void update(const std::string& item);
203
+
204
+ /**
205
+ * Updates the filter with the given unsigned 64-bit integer.
206
+ * @param item The given integer.
207
+ */
208
+ void update(uint64_t item);
209
+
210
+ /**
211
+ * Updates the filter with the given unsigned 32-bit integer.
212
+ * @param item The given integer.
213
+ */
214
+ void update(uint32_t item);
215
+
216
+ /**
217
+ * Updates the filter with the given unsigned 16-bit integer.
218
+ * @param item The given integer.
219
+ */
220
+ void update(uint16_t item);
221
+
222
+ /**
223
+ * Updates the filter with the given unsigned 8-bit integer.
224
+ * @param item The given integer.
225
+ */
226
+ void update(uint8_t item);
227
+
228
+ /**
229
+ * Updates the filter with the given signed 64-bit integer.
230
+ * @param item The given integer.
231
+ */
232
+ void update(int64_t item);
233
+
234
+ /**
235
+ * Updates the filter with the given signed 32-bit integer.
236
+ * @param item The given integer.
237
+ */
238
+ void update(int32_t item);
239
+
240
+ /**
241
+ * Updates the filter with the given signed 16-bit integer.
242
+ * @param item The given integer.
243
+ */
244
+ void update(int16_t item);
245
+
246
+ /**
247
+ * Updates the filter with the given signed 8-bit integer.
248
+ * @param item The given integer.
249
+ */
250
+ void update(int8_t item);
251
+
252
+ /**
253
+ * Updates the filter with the given 64-bit floating point value.
254
+ * @param item The given double.
255
+ */
256
+ void update(double item);
257
+
258
+ /**
259
+ * Updates the filter with the given 32-bit floating point value.
260
+ * @param item The given float.
261
+ */
262
+ void update(float item);
263
+
264
+ /**
265
+ * Updates the filter with the given data array.
266
+ * @param data The given array.
267
+ * @param length_bytes The array length in bytes.
268
+ */
269
+ void update(const void* data, size_t length_bytes);
270
+
271
+ // QUERY-AND-UPDATE METHODS
272
+
273
+ /**
274
+ * Updates the filter with the given std::string and returns the result from
275
+ * querying the filter prior to the update.
276
+ * The string is converted to a byte array using UTF8 encoding.
277
+ * If the string is null or empty no update attempt is made and the method returns false.
278
+ * @param item The given string.
279
+ * @return The result from querying the filter prior to the update.
280
+ */
281
+ bool query_and_update(const std::string& item);
282
+
283
+ /**
284
+ * Updates the filter with the given unsigned 64-bit integer and returns the result from
285
+ * querying the filter prior to the update.
286
+ * @param item The given integer.
287
+ * @return The result from querying the filter prior to the update.
288
+ */
289
+ bool query_and_update(uint64_t item);
290
+
291
+ /**
292
+ * Updates the filter with the given unsigned 32-bit integer and returns the result from
293
+ * querying the filter prior to the update.
294
+ * @param item The given integer.
295
+ * @return The result from querying the filter prior to the update.
296
+ */
297
+ bool query_and_update(uint32_t item);
298
+
299
+ /**
300
+ * Updates the filter with the given unsigned 16-bit integer and returns the result from
301
+ * querying the filter prior to the update.
302
+ * @param item The given integer.
303
+ * @return The result from querying the filter prior to the update.
304
+ */
305
+ bool query_and_update(uint16_t item);
306
+
307
+ /**
308
+ * Updates the filter with the given unsigned 8-bit integer and returns the result from
309
+ * querying the filter prior to the update.
310
+ * @param item The given integer.
311
+ * @return The result from querying the filter prior to the update.
312
+ */
313
+ bool query_and_update(uint8_t item);
314
+
315
+ /**
316
+ * Updates the filter with the given signed 64-bit integer and returns the result from
317
+ * querying the filter prior to the update.
318
+ * @param item The given integer.
319
+ * @return The result from querying the filter prior to the update.
320
+ */
321
+ bool query_and_update(int64_t item);
322
+
323
+ /**
324
+ * Updates the filter with the given signed 32-bit integer and returns the result from
325
+ * querying the filter prior to the update.
326
+ * @param item The given integer.
327
+ * @return The result from querying the filter prior to the update.
328
+ */
329
+ bool query_and_update(int32_t item);
330
+
331
+ /**
332
+ * Updates the filter with the given signed 16-bit integer and returns the result from
333
+ * querying the filter prior to the update.
334
+ * @param item The given integer.
335
+ * @return The result from querying the filter prior to the update.
336
+ */
337
+ bool query_and_update(int16_t item);
338
+
339
+ /**
340
+ * Updates the filter with the given signed 8-bit integer and returns the result from
341
+ * querying the filter prior to the update.
342
+ * @param item The given integer.
343
+ * @return The result from querying the filter prior to the update.
344
+ */
345
+ bool query_and_update(int8_t item);
346
+
347
+ /**
348
+ * Updates the filter with the given 64-bit floating point value and returns the result from
349
+ * querying the filter prior to the update.
350
+ * @param item The given double.
351
+ * @return The result from querying the filter prior to the update.
352
+ */
353
+ bool query_and_update(double item);
354
+
355
+ /**
356
+ * Updates the filter with the given 32-bit floating point value and returns the result from
357
+ * querying the filter prior to the update.
358
+ * @param item The given float.
359
+ * @return The result from querying the filter prior to the update.
360
+ */
361
+ bool query_and_update(float item);
362
+
363
+ /**
364
+ * Updates the filter with the given data array and returns the result from
365
+ * querying the filter prior to the update.
366
+ * @param data The given array.
367
+ * @param length_bytes The array length in bytes.
368
+ * @return The result from querying the filter prior to the update.
369
+ */
370
+ bool query_and_update(const void* data, size_t length_bytes);
371
+
372
+ // QUERY METHODS
373
+
374
+ /**
375
+ * Queries the filter with the given std::string and returns whether the value
376
+ * might have been seen previously. The filter's expected False Positive Probability
377
+ * determines the chances of a true result being a false positive. False negatives are
378
+ * never possible.
379
+ * The string is converted to a byte array using UTF8 encoding.
380
+ * If the string is null or empty the method always returns false.
381
+ * @param item The given string.
382
+ * @return The result from querying the filter with the given item.
383
+ */
384
+ bool query(const std::string& item) const;
385
+
386
+ /**
387
+ * Queries the filter with the given unsigned 64-bit integer and returns whether the value
388
+ * might have been seen previously. The filter's expected False Positive Probability
389
+ * determines the chances of a true result being a false positive. False negatives are
390
+ * never possible.
391
+ * @param item The given integer.
392
+ * @return The result from querying the filter with the given item.
393
+ */
394
+ bool query(uint64_t item) const;
395
+
396
+ /**
397
+ * Queries the filter with the given unsigned 32-bit integer and returns whether the value
398
+ * might have been seen previously. The filter's expected False Positive Probability
399
+ * determines the chances of a true result being a false positive. False negatives are
400
+ * never possible.
401
+ * @param item The given integer.
402
+ * @return The result from querying the filter with the given item.
403
+ */
404
+ bool query(uint32_t item) const;
405
+
406
+ /**
407
+ * Queries the filter with the given unsigned 16-bit integer and returns whether the value
408
+ * might have been seen previously. The filter's expected False Positive Probability
409
+ * determines the chances of a true result being a false positive. False negatives are
410
+ * never possible.
411
+ * @param item The given integer.
412
+ * @return The result from querying the filter with the given item.
413
+ */
414
+ bool query(uint16_t item) const;
415
+
416
+ /**
417
+ * Queries the filter with the given unsigned 8-bit integer and returns whether the value
418
+ * might have been seen previously. The filter's expected False Positive Probability
419
+ * determines the chances of a true result being a false positive. False negatives are
420
+ * never possible.
421
+ * @param item The given integer.
422
+ * @return The result from querying the filter with the given item.
423
+ */
424
+ bool query(uint8_t item) const;
425
+
426
+ /**
427
+ * Queries the filter with the given signed 64-bit integer and returns whether the value
428
+ * might have been seen previously. The filter's expected False Positive Probability
429
+ * determines the chances of a true result being a false positive. False negatives are
430
+ * never possible.
431
+ * @param item The given integer.
432
+ * @return The result from querying the filter with the given item.
433
+ */
434
+ bool query(int64_t item) const;
435
+
436
+ /**
437
+ * Queries the filter with the given signed 32-bit integer and returns whether the value
438
+ * might have been seen previously. The filter's expected False Positive Probability
439
+ * determines the chances of a true result being a false positive. False negatives are
440
+ * never possible.
441
+ * @param item The given integer.
442
+ * @return The result from querying the filter with the given item.
443
+ */
444
+ bool query(int32_t item) const;
445
+
446
+ /**
447
+ * Queries the filter with the given signed 16-bit integer and returns whether the value
448
+ * might have been seen previously. The filter's expected False Positive Probability
449
+ * determines the chances of a true result being a false positive. False negatives are
450
+ * never possible.
451
+ * @param item The given integer.
452
+ * @return The result from querying the filter with the given item.
453
+ */
454
+ bool query(int16_t item) const;
455
+
456
+ /**
457
+ * Queries the filter with the given signed 8-bit integer and returns whether the value
458
+ * might have been seen previously. The filter's expected False Positive Probability
459
+ * determines the chances of a true result being a false positive. False negatives are
460
+ * never possible.
461
+ * @param item The given integer.
462
+ * @return The result from querying the filter with the given item.
463
+ */
464
+ bool query(int8_t item) const;
465
+
466
+ /**
467
+ * Queries the filter with the given 64-bit floating point value and returns whether the value
468
+ * might have been seen previously. The filter's expected False Positive Probability
469
+ * determines the chances of a true result being a false positive. False negatives are
470
+ * never possible.
471
+ * @param item The given double.
472
+ * @return The result from querying the filter with the given item.
473
+ */
474
+ bool query(double item) const;
475
+
476
+ /**
477
+ * Queries the filter with the given 32-bit floating point value and returns whether the value
478
+ * might have been seen previously. The filter's expected False Positive Probability
479
+ * determines the chances of a true result being a false positive. False negatives are
480
+ * never possible.
481
+ * @param item The given float.
482
+ * @return The result from querying the filter with the given item.
483
+ */
484
+ bool query(float item) const;
485
+
486
+ /**
487
+ * Queries the filter with the given data array and returns whether the
488
+ * data might have been seen previously. The filter's expected False
489
+ * Positive Probability determines the chances of a true result being a
490
+ * false positive. False negatives are
491
+ * never possible.
492
+ * @param data The given array.
493
+ * @param length_bytes The array length in bytes.
494
+ * @return The result from querying the filter with the given item.
495
+ */
496
+ bool query(const void* data, size_t length_bytes) const;
497
+
498
+ // OTHER OPERATIONS
499
+
500
+ /**
501
+ * Unions two Bloom Filters by applying a logical OR. The result will recognize
502
+ * any values seen by either filter (as well as false positives).
503
+ * @param other A BloomFilter to union with this one
504
+ */
505
+ void union_with(const bloom_filter_alloc& other);
506
+
507
+ /**
508
+ * Intersects two Bloom Filters by applying a logical AND. The result will recognize
509
+ * only values seen by both filters (as well as false positives).
510
+ * @param other A Bloom Filter to intersect with this one
511
+ */
512
+ void intersect(const bloom_filter_alloc& other);
513
+
514
+ /**
515
+ * Inverts all the bits of the BloomFilter. Approximately inverts the notion of set-membership.
516
+ */
517
+ void invert();
518
+
519
+ /**
520
+ * Helps identify if two Bloom Filters may be unioned or intersected.
521
+ * @param other A Bloom Filter to check for compatibility with this one
522
+ * @return True if the filters are compatible, otherwise false
523
+ */
524
+ bool is_compatible(const bloom_filter_alloc& other) const;
525
+
526
+ /**
527
+ * @brief Checks if the Bloom Filter is read-only.
528
+ *
529
+ * @return True if the filter is read-only, otherwise false.
530
+ */
531
+ bool is_read_only() const;
532
+
533
+ /**
534
+ * @brief Returns whether the filter owns its underlying memory
535
+ * @return True if the filter owns its memory, otherwise false
536
+ */
537
+ bool is_memory_owned() const;
538
+
539
+ /**
540
+ * @brief Checks if the Bloom Filter was created by a call to wrap().
541
+ *
542
+ * @return True if the filter was created by wrapping memory, otherwise false.
543
+ */
544
+ bool is_wrapped() const;
545
+
546
+ /**
547
+ * @brief Returns a pointer to the memory this filter wraps, if it exists.
548
+ * @return A pointer to the wrapped memory, or nullptr if is_wrapped() is false.
549
+ */
550
+ const uint8_t* get_wrapped_memory() const;
551
+
552
+ /**
553
+ * @brief Gets the serialized size of the Bloom Filter in bytes
554
+ * @return The serialized size of the Bloom Filter in bytes
555
+ */
556
+ size_t get_serialized_size_bytes() const;
557
+
558
+ /**
559
+ * @brief Gets the serialized size of the Bloom Filter with the given number of bits, in bytes
560
+ * @param num_bits The number of bits in the Bloom Filter for the size calculation
561
+ * @return The serialized size of a Bloom Filter with a capacity of num_bits, in bytes
562
+ */
563
+ static size_t get_serialized_size_bytes(uint64_t num_bits);
564
+
565
+ /**
566
+ * @brief Returns a human-readable string representation of the Bloom Filter.
567
+ * @param print_filter If true, the filter bits will be printed as well.
568
+ * @return A human-readable string representation of the Bloom Filter.
569
+ */
570
+ string<Allocator> to_string(bool print_filter = false) const;
571
+
572
+ private:
573
+ using A = Allocator;
574
+ using AllocUint8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
575
+
576
+ static const uint64_t DIRTY_BITS_VALUE = static_cast<uint64_t>(-1LL);
577
+ static const uint64_t MAX_HEADER_SIZE_BYTES = 32; // 4 Java Longs
578
+ static const uint64_t BIT_ARRAY_LENGTH_OFFSET_BYTES = 16;
579
+ static const uint64_t NUM_BITS_SET_OFFSET_BYTES = 24;
580
+ static const uint64_t BIT_ARRAY_OFFSET_BYTES = 32;
581
+ static const uint64_t MAX_FILTER_SIZE_BITS = (INT32_MAX - MAX_HEADER_SIZE_BYTES) * sizeof(uint64_t);
582
+
583
+ static const uint8_t PREAMBLE_LONGS_EMPTY = 3;
584
+ static const uint8_t PREAMBLE_LONGS_STANDARD = 4;
585
+ static const uint8_t FAMILY_ID = 21;
586
+ static const uint8_t SER_VER = 1;
587
+ static const uint8_t EMPTY_FLAG_MASK = 4;
588
+
589
+ // used by builder methods
590
+ bloom_filter_alloc(uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator);
591
+ bloom_filter_alloc(uint8_t* memory, size_t length_bytes, uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator);
592
+
593
+ // used by deserialize and wrap
594
+ bloom_filter_alloc(uint64_t seed,
595
+ uint16_t num_hashes,
596
+ bool is_dirty,
597
+ bool is_owned,
598
+ bool is_read_only,
599
+ uint64_t capacity_bits,
600
+ uint64_t num_bits_set,
601
+ uint8_t* bit_array,
602
+ uint8_t* memory,
603
+ const A& allocator);
604
+
605
+ static bloom_filter_alloc internal_deserialize_or_wrap(void* bytes,
606
+ size_t length_bytes,
607
+ bool read_only,
608
+ bool wrap,
609
+ const A& allocator);
610
+
611
+ // internal query/update methods
612
+ void internal_update(uint64_t h0, uint64_t h1);
613
+ bool internal_query_and_update(uint64_t h0, uint64_t h1);
614
+ bool internal_query(uint64_t h0, uint64_t h1) const;
615
+
616
+ void update_num_bits_set(uint64_t num_bits_set);
617
+
618
+ Allocator allocator_;
619
+ uint64_t seed_;
620
+ uint16_t num_hashes_;
621
+ bool is_dirty_;
622
+ bool is_owned_; // if false, data is not owned by filter AND memory_ holds the entire filter, not just the bit array
623
+ bool is_read_only_; // if true, filter is read-only
624
+ uint64_t capacity_bits_;
625
+ uint64_t num_bits_set_;
626
+ uint8_t* bit_array_; // data backing bit_array_, regardless of ownership
627
+ uint8_t* memory_; // if wrapped, pointer to the start of the filter, otherwise nullptr
628
+ };
629
+
630
+ /**
631
+ * <p>This class provides methods to help estimate the correct parameters when
632
+ * creating a Bloom filter, and methods to create the filter using those values.</p>
633
+ *
634
+ * <p>The underlying math is described in the
635
+ * <a href='https://en.wikipedia.org/wiki/Bloom_filter#Optimal_number_of_hash_functions'>
636
+ * Wikipedia article on Bloom filters</a>.</p>
637
+ */
638
+ template<typename Allocator>
639
+ class bloom_filter_alloc<Allocator>::builder {
640
+ public:
641
+ /**
642
+ * Returns the optimal number of hash functions given a target number of distinct items
643
+ * and the Bloom filter size in bits. This function will provide a result even if the input
644
+ * values exceed the capacity of a single Bloom filter.
645
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter
646
+ * @param num_filter_bits The intended size of the Bloom Filter in bits
647
+ * @return The suggested number of hash functions to use with the filter
648
+ */
649
+ static uint16_t suggest_num_hashes(uint64_t max_distinct_items, uint64_t num_filter_bits);
650
+
651
+ /**
652
+ * Returns the optimal number of hash functions to achieve a target false positive probability.
653
+ * @param target_false_positive_prob A desired false positive probability per item
654
+ * @return The suggested number of hash functions to use with the filter.
655
+ */
656
+ static uint16_t suggest_num_hashes(double target_false_positive_prob);
657
+
658
+ /**
659
+ * Returns the optimal number of bits to use in a Bloom filter given a target number of distinct
660
+ * items and a target false positive probability.
661
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter
662
+ * @param target_false_positive_prob A desired false positive probability per item
663
+ * @return The suggested number of bits to use with the filter
664
+ */
665
+ static uint64_t suggest_num_filter_bits(uint64_t max_distinct_items, double target_false_positive_prob);
666
+
667
+ /**
668
+ * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs,
669
+ * using the provided base seed (random by default) for the hash function.
670
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter
671
+ * @param target_false_positive_prob A desired false positive probability per item
672
+ * @param seed A base hash seed (default: random)
673
+ * @param allocator The allocator to use for the filter (default: standard allocator)
674
+ * @return A new Bloom filter configured for the given input parameters
675
+ */
676
+ static bloom_filter_alloc<Allocator> create_by_accuracy(uint64_t max_distinct_items,
677
+ double target_false_positive_prob,
678
+ uint64_t seed = generate_random_seed(),
679
+ const Allocator& allocator = Allocator());
680
+
681
+ /**
682
+ * Creates a Bloom filter with given number of bits and number of hash functions,
683
+ * using the provided base seed for the hash function.
684
+ *
685
+ * @param num_bits The size of the BloomFilter, in bits
686
+ * @param num_hashes The number of hash functions to apply to items
687
+ * @param seed A base hash seed (default: random)
688
+ * @param allocator The allocator to use for the filter (default: standard allocator)
689
+ * @return A new Bloom filter configured for the given input parameters
690
+ */
691
+ static bloom_filter_alloc<Allocator> create_by_size(uint64_t num_bits,
692
+ uint16_t num_hashes,
693
+ uint64_t seed = generate_random_seed(),
694
+ const Allocator& allocator = Allocator());
695
+
696
+ /**
697
+ * Creates a new Bloom filter with an optimal number of bits and hash functions for the given inputs,
698
+ * using the provided base seed (random by default) for the hash function and writing into the provided memory. The filter does
699
+ * not take ownership of the memory but does overwrite the full contents.
700
+ *
701
+ * @param memory A pointer to the memory to use for the filter
702
+ * @param length_bytes The length of the memory in bytes
703
+ * @param max_distinct_items The maximum expected number of distinct items to add to the filter
704
+ * @param target_false_positive_prob A desired false positive probability per item
705
+ * @param seed A base hash seed (default: random)
706
+ * @param allocator The allocator to use for the filter (default: standard allocator)
707
+ * @return A new Bloom filter configured for the given input parameters in the provided memory
708
+ */
709
+ static bloom_filter_alloc<Allocator> initialize_by_accuracy(void* memory,
710
+ size_t length_bytes,
711
+ uint64_t max_distinct_items,
712
+ double target_false_positive_prob,
713
+ uint64_t seed = generate_random_seed(),
714
+ const Allocator& allocator = Allocator());
715
+
716
+ /**
717
+ * Initializes a Bloom filter with given number of bits and number of hash functions,
718
+ * using the provided base seed for the hash function and writing into the provided memory. The filter does
719
+ * not take ownership of the memory but does overwrite the full contents.
720
+ *
721
+ * @param memory A pointer to the memory to use for the filter
722
+ * @param length_bytes The length of the memory in bytes
723
+ * @param num_bits The size of the BloomFilter, in bits
724
+ * @param num_hashes The number of hash functions to apply to items
725
+ * @param seed A base hash seed (default: random)
726
+ * @param allocator The allocator to use for the filter (default: standard allocator)
727
+ * @return A new BloomFilter configured for the given input parameters
728
+ */
729
+ static bloom_filter_alloc<Allocator> initialize_by_size(void* memory,
730
+ size_t length_bytes,
731
+ uint64_t num_bits,
732
+ uint16_t num_hashes,
733
+ uint64_t seed = generate_random_seed(),
734
+ const Allocator& allocator = Allocator());
735
+
736
+ /**
737
+ * @brief Generates a random 64-bit seed value
738
+ *
739
+ * @return uint64_t a random value over the range of unsigned 64-bit integers
740
+ */
741
+ static uint64_t generate_random_seed();
742
+
743
+ private:
744
+ static void validate_size_inputs(uint64_t num_bits, uint16_t num_hashes);
745
+ static void validate_accuracy_inputs(uint64_t max_distinct_items, double target_false_positive_prob);
746
+ };
747
+
748
+ } // namespace datasketches
749
+
750
+ #include "bloom_filter_builder_impl.hpp"
751
+ #include "bloom_filter_impl.hpp"
752
+
753
+ #endif // _BLOOM_FILTER_HPP_ b