datasketches 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +18 -10
@@ -0,0 +1,908 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _BLOOM_FILTER_IMPL_HPP_
21
+ #define _BLOOM_FILTER_IMPL_HPP_
22
+
23
+ #include <algorithm>
24
+ #include <memory>
25
+ #include <sstream>
26
+ #include <vector>
27
+
28
+ #include "common_defs.hpp"
29
+ #include "bit_array_ops.hpp"
30
+ #include "memory_operations.hpp"
31
+ #include "xxhash64.h"
32
+
33
+ // memory scenarios:
34
+ // * on-heap: owned, bit_array_ set, memory_ null
35
+ // * direct: not owned, bit_array_ set, memory_ set
36
+ // * read-only an option for direct
37
+
38
+ namespace datasketches {
39
+
40
+ template<typename A>
41
+ bloom_filter_alloc<A>::bloom_filter_alloc(uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator) :
42
+ allocator_(allocator),
43
+ seed_(seed),
44
+ num_hashes_(num_hashes),
45
+ is_dirty_(false),
46
+ is_owned_(true),
47
+ is_read_only_(false),
48
+ capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks
49
+ num_bits_set_(0)
50
+ {
51
+ if (num_hashes == 0) {
52
+ throw std::invalid_argument("Must have at least 1 hash function");
53
+ }
54
+ if (num_bits == 0) {
55
+ throw std::invalid_argument("Number of bits must be greater than zero");
56
+ } else if (num_bits > MAX_FILTER_SIZE_BITS) {
57
+ throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits");
58
+ }
59
+
60
+ const uint64_t num_bytes = capacity_bits_ >> 3;
61
+ bit_array_ = AllocUint8(allocator_).allocate(num_bytes);
62
+ std::fill_n(bit_array_, num_bytes, 0);
63
+ if (bit_array_ == nullptr) {
64
+ throw std::bad_alloc();
65
+ }
66
+ memory_ = nullptr;
67
+ }
68
+
69
+ template<typename A>
70
+ bloom_filter_alloc<A>::bloom_filter_alloc(uint8_t* memory,
71
+ size_t length_bytes,
72
+ uint64_t num_bits,
73
+ uint16_t num_hashes,
74
+ uint64_t seed,
75
+ const A& allocator) :
76
+ allocator_(allocator),
77
+ seed_(seed),
78
+ num_hashes_(num_hashes),
79
+ is_dirty_(false),
80
+ is_owned_(false),
81
+ is_read_only_(false),
82
+ capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks
83
+ num_bits_set_(0)
84
+ {
85
+ if (num_hashes == 0) {
86
+ throw std::invalid_argument("Must have at least 1 hash function");
87
+ }
88
+ if (num_bits == 0) {
89
+ throw std::invalid_argument("Number of bits must be greater than zero");
90
+ } else if (num_bits > MAX_FILTER_SIZE_BITS) {
91
+ throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits");
92
+ }
93
+
94
+ const size_t num_bytes = get_serialized_size_bytes(capacity_bits_);
95
+ if (length_bytes < num_bytes) {
96
+ throw std::invalid_argument("Input memory block is too small");
97
+ }
98
+
99
+ // fill in header info
100
+ uint8_t* ptr = memory;
101
+ const uint8_t preamble_longs = PREAMBLE_LONGS_STANDARD; // no resizing so assume non-empty
102
+ ptr += copy_to_mem(preamble_longs, ptr);
103
+ const uint8_t serial_version = SER_VER;
104
+ ptr += copy_to_mem(serial_version, ptr);
105
+ const uint8_t family = FAMILY_ID;
106
+ ptr += copy_to_mem(family, ptr);
107
+ const uint8_t flags_byte = 0; // again, assuming non-empty
108
+ ptr += copy_to_mem(flags_byte, ptr);
109
+
110
+ ptr += copy_to_mem(num_hashes_, ptr);
111
+ ptr += copy_to_mem(static_cast<uint16_t>(0), ptr); // 2 bytes unused
112
+ ptr += copy_to_mem(seed_, ptr);
113
+ ptr += copy_to_mem(static_cast<int32_t>(capacity_bits_ >> 6), ptr); // sized in java longs
114
+ ptr += copy_to_mem(static_cast<uint32_t>(0), ptr); // 4 bytes unused
115
+
116
+ // rest of memory is num bits and bit array, so start with zeroes
117
+ std::fill_n(ptr, sizeof(uint64_t) * ((capacity_bits_ >> 6) + 1), 0);
118
+ bit_array_ = memory + BIT_ARRAY_OFFSET_BYTES;
119
+ memory_ = memory;
120
+ }
121
+
122
+ template<typename A>
123
+ bloom_filter_alloc<A>::bloom_filter_alloc(uint64_t seed,
124
+ uint16_t num_hashes,
125
+ bool is_dirty,
126
+ bool is_owned,
127
+ bool is_read_only,
128
+ uint64_t capacity_bits,
129
+ uint64_t num_bits_set,
130
+ uint8_t* bit_array,
131
+ uint8_t* memory,
132
+ const A& allocator) :
133
+ allocator_(allocator),
134
+ seed_(seed),
135
+ num_hashes_(num_hashes),
136
+ is_dirty_(is_dirty),
137
+ is_owned_(is_owned),
138
+ is_read_only_(is_read_only),
139
+ capacity_bits_((capacity_bits + 63) & ~0x3F),
140
+ num_bits_set_(num_bits_set),
141
+ bit_array_(bit_array),
142
+ memory_(memory)
143
+ {
144
+ // private constructor
145
+ // no consistency checks since we should have done those prior to calling this
146
+ if (is_read_only_ && memory_ != nullptr && num_bits_set == DIRTY_BITS_VALUE) {
147
+ num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3);
148
+ }
149
+ }
150
+
151
+ template<typename A>
152
+ bloom_filter_alloc<A>::bloom_filter_alloc(const bloom_filter_alloc& other) :
153
+ allocator_(other.allocator_),
154
+ seed_(other.seed_),
155
+ num_hashes_(other.num_hashes_),
156
+ is_dirty_(other.is_dirty_),
157
+ is_owned_(other.is_owned_),
158
+ is_read_only_(other.is_read_only_),
159
+ capacity_bits_(other.capacity_bits_),
160
+ num_bits_set_(other.num_bits_set_)
161
+ {
162
+ if (is_owned_) {
163
+ const size_t num_bytes = capacity_bits_ >> 3;
164
+ bit_array_ = AllocUint8(allocator_).allocate(num_bytes);
165
+ if (bit_array_ == nullptr) {
166
+ throw std::bad_alloc();
167
+ }
168
+ std::copy_n(other.bit_array_, num_bytes, bit_array_);
169
+ memory_ = nullptr;
170
+ } else {
171
+ bit_array_ = other.bit_array_;
172
+ memory_ = other.memory_;
173
+ }
174
+ }
175
+
176
+ template<typename A>
177
+ bloom_filter_alloc<A>::bloom_filter_alloc(bloom_filter_alloc&& other) noexcept :
178
+ allocator_(std::move(other.allocator_)),
179
+ seed_(other.seed_),
180
+ num_hashes_(other.num_hashes_),
181
+ is_dirty_(other.is_dirty_),
182
+ is_owned_(other.is_owned_),
183
+ is_read_only_(other.is_read_only_),
184
+ capacity_bits_(other.capacity_bits_),
185
+ num_bits_set_(other.num_bits_set_),
186
+ bit_array_(std::move(other.bit_array_)),
187
+ memory_(std::move(other.memory_))
188
+ {
189
+ // ensure destructor on other will behave nicely
190
+ other.is_owned_ = false;
191
+ other.bit_array_ = nullptr;
192
+ other.memory_ = nullptr;
193
+ }
194
+
195
+ template<typename A>
196
+ bloom_filter_alloc<A>& bloom_filter_alloc<A>::operator=(const bloom_filter_alloc& other) {
197
+ bloom_filter_alloc<A> copy(other);
198
+ std::swap(allocator_, copy.allocator_);
199
+ std::swap(seed_, copy.seed_);
200
+ std::swap(num_hashes_, copy.num_hashes_);
201
+ std::swap(is_dirty_, copy.is_dirty_);
202
+ std::swap(is_owned_, copy.is_owned_);
203
+ std::swap(is_read_only_, copy.is_read_only_);
204
+ std::swap(capacity_bits_, copy.capacity_bits_);
205
+ std::swap(num_bits_set_, copy.num_bits_set_);
206
+ std::swap(bit_array_, copy.bit_array_);
207
+ std::swap(memory_, copy.memory_);
208
+ return *this;
209
+ }
210
+
211
+ template<typename A>
212
+ bloom_filter_alloc<A>& bloom_filter_alloc<A>::operator=(bloom_filter_alloc&& other) {
213
+ if (this == &other) { return *this; }
214
+ std::swap(allocator_, other.allocator_);
215
+ std::swap(seed_, other.seed_);
216
+ std::swap(num_hashes_, other.num_hashes_);
217
+ std::swap(is_dirty_, other.is_dirty_);
218
+ std::swap(is_owned_, other.is_owned_);
219
+ std::swap(is_read_only_, other.is_read_only_);
220
+ std::swap(capacity_bits_, other.capacity_bits_);
221
+ std::swap(num_bits_set_, other.num_bits_set_);
222
+ std::swap(bit_array_, other.bit_array_);
223
+ std::swap(memory_, other.memory_);
224
+ return *this;
225
+ }
226
+
227
+ template<typename A>
228
+ bloom_filter_alloc<A>::~bloom_filter_alloc() {
229
+ if (is_owned_) {
230
+ if (memory_ != nullptr) {
231
+ // deallocate total memory_ block, including preamble
232
+ AllocUint8(allocator_).deallocate(memory_, (capacity_bits_ >> 3) + BIT_ARRAY_OFFSET_BYTES);
233
+ } else if (bit_array_ != nullptr) {
234
+ // only need to deallocate bit_array_
235
+ AllocUint8(allocator_).deallocate(bit_array_, capacity_bits_ >> 3);
236
+ }
237
+ memory_ = nullptr;
238
+ bit_array_ = nullptr;
239
+ }
240
+ }
241
+
242
+ template<typename A>
243
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::deserialize(const void* bytes, size_t length_bytes, const A& allocator) {
244
+ // not wrapping so we can cast away const as we're not modifying the memory
245
+ return internal_deserialize_or_wrap(const_cast<void*>(bytes), length_bytes, false, false, allocator);
246
+ }
247
+
248
+ /*
249
+ * A Bloom Filter's serialized image always uses 3 longs of preamble when empty,
250
+ * otherwise 4 longs:
251
+ *
252
+ * <pre>
253
+ * Long || Start Byte Adr:
254
+ * Adr:
255
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
256
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |----Num Hashes---|-----Unused------|
257
+ *
258
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
259
+ * 1 ||---------------------------------Hash Seed-------------------------------------|
260
+ *
261
+ * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
262
+ * 2 ||-------BitArray Length (in longs)----------|-----------Unused------------------|
263
+ *
264
+ * || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
265
+ * 3 ||---------------------------------NumBitsSet------------------------------------|
266
+ * </pre>
267
+ *
268
+ * The raw BitArray bits, if non-empty start at byte 32.
269
+ */
270
+
271
+ template<typename A>
272
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::deserialize(std::istream& is, const A& allocator) {
273
+ const uint8_t prelongs = read<uint8_t>(is);
274
+ const uint8_t ser_ver = read<uint8_t>(is);
275
+ const uint8_t family = read<uint8_t>(is);
276
+ const uint8_t flags = read<uint8_t>(is);
277
+
278
+ if (prelongs < 1 || prelongs > 4) {
279
+ throw std::invalid_argument("Possible corruption: Incorrect number of preamble bytes specified in header");
280
+ }
281
+ if (ser_ver != SER_VER) {
282
+ throw std::invalid_argument("Possible corruption: Unrecognized serialization version: " + std::to_string(ser_ver));
283
+ }
284
+ if (family != FAMILY_ID) {
285
+ throw std::invalid_argument("Possible corruption: Incorrect Family ID for bloom filter. Found: " + std::to_string(family));
286
+ }
287
+
288
+ const bool is_empty = (flags & EMPTY_FLAG_MASK) != 0;
289
+
290
+ const uint16_t num_hashes = read<uint16_t>(is);
291
+ read<uint16_t>(is); // unused
292
+ const uint64_t seed = read<uint64_t>(is);
293
+ const uint32_t num_longs = read<uint32_t>(is); // sized in java longs
294
+ read<uint32_t>(is); // unused
295
+
296
+ // if empty, stop reading
297
+ if (is_empty) {
298
+ return bloom_filter_alloc<A>(num_longs << 6, num_hashes, seed, allocator);
299
+ }
300
+
301
+ const uint64_t num_bits_set = read<uint64_t>(is);
302
+ const bool is_dirty = (num_bits_set == DIRTY_BITS_VALUE);
303
+
304
+ // allocate memory
305
+ const uint64_t num_bytes = num_longs << 3;
306
+ AllocUint8 alloc(allocator);
307
+ uint8_t* bit_array = alloc.allocate(num_bytes);
308
+ if (bit_array == nullptr) {
309
+ throw std::bad_alloc();
310
+ }
311
+ read(is, bit_array, num_bytes);
312
+
313
+ // pass to constructor
314
+ return bloom_filter_alloc<A>(seed, num_hashes, is_dirty, true, false, num_longs << 6, num_bits_set, bit_array, nullptr, allocator);
315
+ }
316
+
317
+ template<typename A>
318
+ const bloom_filter_alloc<A> bloom_filter_alloc<A>::wrap(const void* bytes, size_t length_bytes, const A& allocator) {
319
+ // read-only flag means we won't modify the memory, but cast away the const
320
+ return internal_deserialize_or_wrap(const_cast<void*>(bytes), length_bytes, true, true, allocator);
321
+ }
322
+
323
+ template<typename A>
324
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::writable_wrap(void* bytes, size_t length_bytes, const A& allocator) {
325
+ return internal_deserialize_or_wrap(bytes, length_bytes, false, true, allocator);
326
+ }
327
+
328
+ template<typename A>
329
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::internal_deserialize_or_wrap(void* bytes,
330
+ size_t length_bytes,
331
+ bool read_only,
332
+ bool wrap,
333
+ const A& allocator)
334
+ {
335
+ ensure_minimum_memory(length_bytes, 8);
336
+ if (bytes == nullptr) {
337
+ throw std::invalid_argument("Input data is null or empty");
338
+ }
339
+ const uint8_t* ptr = static_cast<const uint8_t*>(bytes);
340
+ const uint8_t* end_ptr = ptr + length_bytes;
341
+ const uint8_t prelongs = *ptr++;
342
+ const uint8_t ser_ver = *ptr++;
343
+ const uint8_t family = *ptr++;
344
+ const uint8_t flags = *ptr++;
345
+
346
+ if (prelongs < PREAMBLE_LONGS_EMPTY || prelongs > PREAMBLE_LONGS_STANDARD) {
347
+ throw std::invalid_argument("Possible corruption: Incorrect number of preamble bytes specified in header");
348
+ }
349
+ if (ser_ver != SER_VER) {
350
+ throw std::invalid_argument("Possible corruption: Unrecognized serialization version: " + std::to_string(ser_ver));
351
+ }
352
+ if (family != FAMILY_ID) {
353
+ throw std::invalid_argument("Possible corruption: Incorrect Family ID for bloom filter. Found: " + std::to_string(family));
354
+ }
355
+
356
+ const bool is_empty = (flags & EMPTY_FLAG_MASK) != 0;
357
+
358
+ ensure_minimum_memory(length_bytes, prelongs * sizeof(uint64_t));
359
+
360
+ uint16_t num_hashes;
361
+ ptr += copy_from_mem(ptr, num_hashes);
362
+ ptr += sizeof(uint16_t); // 16 bits unused after num_hashes
363
+ uint64_t seed;
364
+ ptr += copy_from_mem(ptr, seed);
365
+
366
+ uint32_t num_longs;
367
+ ptr += copy_from_mem(ptr, num_longs); // sized in java longs
368
+ ptr += sizeof(uint32_t); // unused 32 bits follow
369
+
370
+ // if empty, stop reading
371
+ if (wrap && is_empty && !read_only) {
372
+ throw std::invalid_argument("Cannot wrap an empty filter for writing");
373
+ } else if (is_empty) {
374
+ return bloom_filter_alloc<A>(num_longs << 6, num_hashes, seed, allocator);
375
+ }
376
+
377
+ uint64_t num_bits_set;
378
+ ptr += copy_from_mem(ptr, num_bits_set);
379
+ const bool is_dirty = (num_bits_set == DIRTY_BITS_VALUE);
380
+
381
+ uint8_t* bit_array;
382
+ uint8_t* memory;
383
+ if (wrap) {
384
+ memory = static_cast<uint8_t*>(bytes);
385
+ bit_array = memory + BIT_ARRAY_OFFSET_BYTES;
386
+ } else {
387
+ // allocate memory
388
+ memory = nullptr;
389
+ const uint64_t num_bytes = num_longs << 3;
390
+ ensure_minimum_memory(end_ptr - ptr, num_bytes);
391
+ AllocUint8 alloc(allocator);
392
+ bit_array = alloc.allocate(num_bytes);
393
+ if (bit_array == nullptr) {
394
+ throw std::bad_alloc();
395
+ }
396
+ copy_from_mem(ptr, bit_array, num_bytes);
397
+ }
398
+
399
+ // pass to constructor -- !wrap == is_owned_
400
+ return bloom_filter_alloc<A>(seed, num_hashes, is_dirty, !wrap, read_only, num_longs << 6, num_bits_set, bit_array, memory, allocator);
401
+ }
402
+
403
+ template<typename A>
404
+ void bloom_filter_alloc<A>::serialize(std::ostream& os) const {
405
+ // Should we serialize memory_ directly if it exists?
406
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD;
407
+ write(os, preamble_longs);
408
+ const uint8_t serial_version = SER_VER;
409
+ write(os, serial_version);
410
+ const uint8_t family = FAMILY_ID;
411
+ write(os, family);
412
+ const uint8_t flags_byte = is_empty() ? EMPTY_FLAG_MASK : 0;
413
+ write(os, flags_byte);
414
+
415
+ write(os, num_hashes_);
416
+ write(os, static_cast<uint16_t>(0)); // 2 bytes unused
417
+ write(os, seed_);
418
+ write(os, static_cast<int32_t>(capacity_bits_ >> 6)); // sized in java longs
419
+ write(os, static_cast<uint32_t>(0)); // 4 bytes unused
420
+
421
+ if (!is_empty()) {
422
+ write(os, is_dirty_ ? DIRTY_BITS_VALUE : num_bits_set_);
423
+ write(os, bit_array_, capacity_bits_ >> 3);
424
+ }
425
+
426
+ os.flush();
427
+ }
428
+
429
+ template<typename A>
430
+ auto bloom_filter_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
431
+ // Should we serialize memory_ directly if it exists?
432
+ const size_t size = header_size_bytes + get_serialized_size_bytes();
433
+ vector_bytes bytes(size, 0, allocator_);
434
+ uint8_t* ptr = bytes.data() + header_size_bytes;
435
+
436
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD;
437
+ ptr += copy_to_mem(preamble_longs, ptr);
438
+ const uint8_t serial_version = SER_VER;
439
+ ptr += copy_to_mem(serial_version, ptr);
440
+ const uint8_t family = FAMILY_ID;
441
+ ptr += copy_to_mem(family, ptr);
442
+ const uint8_t flags_byte = is_empty() ? EMPTY_FLAG_MASK : 0;
443
+ ptr += copy_to_mem(flags_byte, ptr);
444
+
445
+ ptr += copy_to_mem(num_hashes_, ptr);
446
+ ptr += copy_to_mem(static_cast<uint16_t>(0), ptr); // 2 bytes unused
447
+ ptr += copy_to_mem(seed_, ptr);
448
+ ptr += copy_to_mem(static_cast<int32_t>(capacity_bits_ >> 6), ptr); // sized in java longs
449
+ ptr += copy_to_mem(static_cast<uint32_t>(0), ptr); // 4 bytes unused
450
+
451
+ if (!is_empty()) {
452
+ ptr += copy_to_mem(is_dirty_ ? DIRTY_BITS_VALUE : num_bits_set_, ptr);
453
+ ptr += copy_to_mem(bit_array_, ptr, capacity_bits_ >> 3);
454
+ }
455
+
456
+ return bytes;
457
+ }
458
+
459
+ template<typename A>
460
+ size_t bloom_filter_alloc<A>::get_serialized_size_bytes() const {
461
+ return sizeof(uint64_t) * (is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD + (capacity_bits_ >> 6));
462
+ }
463
+
464
+ template<typename A>
465
+ size_t bloom_filter_alloc<A>::get_serialized_size_bytes(uint64_t num_bits) {
466
+ if (num_bits == 0)
467
+ throw std::invalid_argument("Number of bits must be greater than zero");
468
+
469
+ size_t num_bytes = (num_bits + 63) >> 6;
470
+ return sizeof(uint64_t) * (PREAMBLE_LONGS_STANDARD + num_bytes);
471
+ }
472
+
473
+ template<typename A>
474
+ bool bloom_filter_alloc<A>::is_empty() const {
475
+ return !is_dirty_ && num_bits_set_ == 0;
476
+ }
477
+
478
+ template<typename A>
479
+ uint64_t bloom_filter_alloc<A>::get_bits_used() {
480
+ if (is_dirty_) {
481
+ num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3);
482
+ is_dirty_ = false;
483
+ }
484
+ return num_bits_set_;
485
+ }
486
+
487
+ template<typename A>
488
+ uint64_t bloom_filter_alloc<A>::get_capacity() const {
489
+ return capacity_bits_;
490
+ }
491
+
492
+ template<typename A>
493
+ uint16_t bloom_filter_alloc<A>::get_num_hashes() const {
494
+ return num_hashes_;
495
+ }
496
+
497
+ template<typename A>
498
+ uint64_t bloom_filter_alloc<A>::get_seed() const {
499
+ return seed_;
500
+ }
501
+
502
+ template<typename A>
503
+ bool bloom_filter_alloc<A>::is_read_only() const {
504
+ return is_read_only_;
505
+ }
506
+
507
+ template<typename A>
508
+ bool bloom_filter_alloc<A>::is_wrapped() const {
509
+ return memory_ != nullptr;
510
+ }
511
+
512
+ template<typename A>
513
+ bool bloom_filter_alloc<A>::is_memory_owned() const {
514
+ return is_owned_;
515
+ }
516
+
517
+ template<typename A>
518
+ const uint8_t* bloom_filter_alloc<A>::get_wrapped_memory() const {
519
+ return memory_;
520
+ }
521
+
522
+ template<typename A>
523
+ void bloom_filter_alloc<A>::reset() {
524
+ if (is_read_only_) {
525
+ throw std::logic_error("Cannot reset a read-only filter");
526
+ }
527
+ update_num_bits_set(0);
528
+ std::fill_n(bit_array_, capacity_bits_ >> 3, 0);
529
+ }
530
+
531
+ template<typename A>
532
+ void bloom_filter_alloc<A>::update_num_bits_set(uint64_t num_bits_set) {
533
+ num_bits_set_ = num_bits_set;
534
+ is_dirty_ = false;
535
+ if (memory_ != nullptr && !is_read_only_) {
536
+ copy_to_mem(num_bits_set_, memory_ + NUM_BITS_SET_OFFSET_BYTES);
537
+ }
538
+ }
539
+
540
+ // UPDATE METHODS
541
+
542
+ template<typename A>
543
+ void bloom_filter_alloc<A>::update(const std::string& item) {
544
+ if (item.empty()) return;
545
+ const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_);
546
+ const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0);
547
+ internal_update(h0, h1);
548
+ }
549
+
550
+ template<typename A>
551
+ void bloom_filter_alloc<A>::update(uint64_t item) {
552
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
553
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
554
+ internal_update(h0, h1);
555
+ }
556
+
557
+ template<typename A>
558
+ void bloom_filter_alloc<A>::update(uint32_t item) {
559
+ update(static_cast<uint64_t>(item));
560
+ }
561
+
562
+ template<typename A>
563
+ void bloom_filter_alloc<A>::update(uint16_t item) {
564
+ update(static_cast<uint64_t>(item));
565
+ }
566
+
567
+ template<typename A>
568
+ void bloom_filter_alloc<A>::update(uint8_t item) {
569
+ update(static_cast<uint64_t>(item));
570
+ }
571
+
572
+ template<typename A>
573
+ void bloom_filter_alloc<A>::update(int64_t item) {
574
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
575
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
576
+ internal_update(h0, h1);
577
+ }
578
+
579
+ template<typename A>
580
+ void bloom_filter_alloc<A>::update(int32_t item) {
581
+ update(static_cast<int64_t>(item));
582
+ }
583
+
584
+ template<typename A>
585
+ void bloom_filter_alloc<A>::update(int16_t item) {
586
+ update(static_cast<int64_t>(item));
587
+ }
588
+
589
+ template<typename A>
590
+ void bloom_filter_alloc<A>::update(int8_t item) {
591
+ update(static_cast<int64_t>(item));
592
+ }
593
+
594
+ template<typename A>
595
+ void bloom_filter_alloc<A>::update(double item) {
596
+ union {
597
+ int64_t long_value;
598
+ double double_value;
599
+ } ldu;
600
+ ldu.double_value = static_cast<double>(item);
601
+ if (item == 0.0) {
602
+ ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
603
+ } else if (std::isnan(ldu.double_value)) {
604
+ ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
605
+ }
606
+ const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_);
607
+ const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0);
608
+ internal_update(h0, h1);
609
+ }
610
+
611
+ template<typename A>
612
+ void bloom_filter_alloc<A>::update(float item) {
613
+ update(static_cast<double>(item));
614
+ }
615
+
616
+ template<typename A>
617
+ void bloom_filter_alloc<A>::update(const void* item, size_t size) {
618
+ if (item == nullptr || size == 0) return;
619
+ const uint64_t h0 = XXHash64::hash(item, size, seed_);
620
+ const uint64_t h1 = XXHash64::hash(item, size, h0);
621
+ internal_update(h0, h1);
622
+ }
623
+
624
+ template<typename A>
625
+ void bloom_filter_alloc<A>::internal_update(uint64_t h0, uint64_t h1) {
626
+ if (is_read_only_) {
627
+ throw std::logic_error("Cannot update a read-only filter");
628
+ }
629
+ const uint64_t num_bits = get_capacity();
630
+ for (uint16_t i = 1; i <= num_hashes_; i++) {
631
+ const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits;
632
+ bit_array_ops::set_bit(bit_array_, hash_index);
633
+ }
634
+ is_dirty_ = true;
635
+ }
636
+
637
+ // QUERY-AND-UPDATE METHODS
638
+
639
+ template<typename A>
640
+ bool bloom_filter_alloc<A>::query_and_update(const std::string& item) {
641
+ if (item.empty()) return false;
642
+ const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_);
643
+ const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0);
644
+ return internal_query_and_update(h0, h1);
645
+ }
646
+
647
+ template<typename A>
648
+ bool bloom_filter_alloc<A>::query_and_update(uint64_t item) {
649
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
650
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
651
+ return internal_query_and_update(h0, h1);
652
+ }
653
+
654
+ template<typename A>
655
+ bool bloom_filter_alloc<A>::query_and_update(uint32_t item) {
656
+ return query_and_update(static_cast<uint64_t>(item));
657
+ }
658
+
659
+ template<typename A>
660
+ bool bloom_filter_alloc<A>::query_and_update(uint16_t item) {
661
+ return query_and_update(static_cast<uint64_t>(item));
662
+ }
663
+
664
+ template<typename A>
665
+ bool bloom_filter_alloc<A>::query_and_update(uint8_t item) {
666
+ return query_and_update(static_cast<uint64_t>(item));
667
+ }
668
+
669
+ template<typename A>
670
+ bool bloom_filter_alloc<A>::query_and_update(int64_t item) {
671
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
672
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
673
+ return internal_query_and_update(h0, h1);
674
+ }
675
+
676
+ template<typename A>
677
+ bool bloom_filter_alloc<A>::query_and_update(int32_t item) {
678
+ return query_and_update(static_cast<int64_t>(item));
679
+ }
680
+
681
+ template<typename A>
682
+ bool bloom_filter_alloc<A>::query_and_update(int16_t item) {
683
+ return query_and_update(static_cast<int64_t>(item));
684
+ }
685
+
686
+ template<typename A>
687
+ bool bloom_filter_alloc<A>::query_and_update(int8_t item) {
688
+ return query_and_update(static_cast<int64_t>(item));
689
+ }
690
+
691
+ template<typename A>
692
+ bool bloom_filter_alloc<A>::query_and_update(double item) {
693
+ union {
694
+ int64_t long_value;
695
+ double double_value;
696
+ } ldu;
697
+ ldu.double_value = item;
698
+ if (item == 0.0) {
699
+ ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
700
+ } else if (std::isnan(ldu.double_value)) {
701
+ ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
702
+ }
703
+ const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_);
704
+ const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0);
705
+ return internal_query_and_update(h0, h1);
706
+ }
707
+
708
+ template<typename A>
709
+ bool bloom_filter_alloc<A>::query_and_update(float item) {
710
+ return query_and_update(static_cast<double>(item));
711
+ }
712
+
713
+ template<typename A>
714
+ bool bloom_filter_alloc<A>::query_and_update(const void* item, size_t size) {
715
+ if (item == nullptr || size == 0) return false;
716
+ const uint64_t h0 = XXHash64::hash(item, size, seed_);
717
+ const uint64_t h1 = XXHash64::hash(item, size, h0);
718
+ return internal_query_and_update(h0, h1);
719
+ }
720
+
721
+ template<typename A>
722
+ bool bloom_filter_alloc<A>::internal_query_and_update(uint64_t h0, uint64_t h1) {
723
+ if (is_read_only_) {
724
+ throw std::logic_error("Cannot update a read-only filter");
725
+ }
726
+ const uint64_t num_bits = get_capacity();
727
+ bool value_exists = true;
728
+ for (uint16_t i = 1; i <= num_hashes_; i++) {
729
+ const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits;
730
+ bool value = bit_array_ops::get_and_set_bit(bit_array_, hash_index);
731
+ update_num_bits_set(num_bits_set_ + (value ? 0 : 1));
732
+ value_exists &= value;
733
+ }
734
+ return value_exists;
735
+ }
736
+
737
+ // QUERY METHODS
738
+
739
+ template<typename A>
740
+ bool bloom_filter_alloc<A>::query(const std::string& item) const {
741
+ if (item.empty()) return false;
742
+ const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_);
743
+ const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0);
744
+ return internal_query(h0, h1);
745
+ }
746
+
747
+ template<typename A>
748
+ bool bloom_filter_alloc<A>::query(uint64_t item) const {
749
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
750
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
751
+ return internal_query(h0, h1);
752
+ }
753
+
754
+ template<typename A>
755
+ bool bloom_filter_alloc<A>::query(uint32_t item) const {
756
+ return query(static_cast<uint64_t>(item));
757
+ }
758
+
759
+ template<typename A>
760
+ bool bloom_filter_alloc<A>::query(uint16_t item) const {
761
+ return query(static_cast<uint64_t>(item));
762
+ }
763
+
764
+ template<typename A>
765
+ bool bloom_filter_alloc<A>::query(uint8_t item) const {
766
+ return query(static_cast<uint64_t>(item));
767
+ }
768
+
769
+ template<typename A>
770
+ bool bloom_filter_alloc<A>::query(int64_t item) const {
771
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
772
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
773
+ return internal_query(h0, h1);
774
+ }
775
+
776
+ template<typename A>
777
+ bool bloom_filter_alloc<A>::query(int32_t item) const {
778
+ return query(static_cast<int64_t>(item));
779
+ }
780
+
781
+ template<typename A>
782
+ bool bloom_filter_alloc<A>::query(int16_t item) const {
783
+ return query(static_cast<int64_t>(item));
784
+ }
785
+
786
+ template<typename A>
787
+ bool bloom_filter_alloc<A>::query(int8_t item) const {
788
+ return query(static_cast<int64_t>(item));
789
+ }
790
+
791
+ template<typename A>
792
+ bool bloom_filter_alloc<A>::query(double item) const {
793
+ union {
794
+ int64_t long_value;
795
+ double double_value;
796
+ } ldu;
797
+ ldu.double_value = static_cast<double>(item);
798
+ if (item == 0.0) {
799
+ ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
800
+ } else if (std::isnan(ldu.double_value)) {
801
+ ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
802
+ }
803
+ const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_);
804
+ const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0);
805
+ return internal_query(h0, h1);
806
+ }
807
+
808
+ template<typename A>
809
+ bool bloom_filter_alloc<A>::query(float item) const {
810
+ return query(static_cast<double>(item));
811
+ }
812
+
813
+ template<typename A>
814
+ bool bloom_filter_alloc<A>::query(const void* item, size_t size) const {
815
+ if (item == nullptr || size == 0) return false;
816
+ const uint64_t h0 = XXHash64::hash(item, size, seed_);
817
+ const uint64_t h1 = XXHash64::hash(item, size, h0);
818
+ return internal_query(h0, h1);
819
+ }
820
+
821
+ template<typename A>
822
+ bool bloom_filter_alloc<A>::internal_query(uint64_t h0, uint64_t h1) const {
823
+ if (is_empty()) return false;
824
+ const uint64_t num_bits = get_capacity();
825
+ for (uint16_t i = 1; i <= num_hashes_; i++) {
826
+ const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits;
827
+ if (!bit_array_ops::get_bit(bit_array_, hash_index))
828
+ return false;
829
+ }
830
+ return true;
831
+ }
832
+
833
+ // OTHER METHODS
834
+
835
+ template<typename A>
836
+ bool bloom_filter_alloc<A>::is_compatible(const bloom_filter_alloc& other) const {
837
+ return seed_ == other.seed_
838
+ && num_hashes_ == other.num_hashes_
839
+ && get_capacity() == other.get_capacity()
840
+ ;
841
+ }
842
+
843
+ template<typename A>
844
+ void bloom_filter_alloc<A>::union_with(const bloom_filter_alloc& other) {
845
+ if (!is_compatible(other)) {
846
+ throw std::invalid_argument("Incompatible bloom filters");
847
+ }
848
+ uint64_t bits_set = bit_array_ops::union_with(bit_array_, other.bit_array_, capacity_bits_ >> 3);
849
+ update_num_bits_set(bits_set);
850
+ }
851
+
852
+ template<typename A>
853
+ void bloom_filter_alloc<A>::intersect(const bloom_filter_alloc& other) {
854
+ if (!is_compatible(other)) {
855
+ throw std::invalid_argument("Incompatible bloom filters");
856
+ }
857
+ uint64_t bits_set = bit_array_ops::intersect(bit_array_, other.bit_array_, capacity_bits_ >> 3);
858
+ update_num_bits_set(bits_set);
859
+ }
860
+
861
+ template<typename A>
862
+ void bloom_filter_alloc<A>::invert() {
863
+ uint64_t bits_set = bit_array_ops::invert(bit_array_, capacity_bits_ >> 3);
864
+ update_num_bits_set(bits_set);
865
+ }
866
+
867
+ template<typename A>
868
+ string<A> bloom_filter_alloc<A>::to_string(bool print_filter) const {
869
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
870
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
871
+ std::ostringstream oss;
872
+ uint64_t num_bits_set = num_bits_set_;
873
+ if (is_dirty_) {
874
+ num_bits_set = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3);
875
+ }
876
+
877
+ oss << "### Bloom Filter Summary:" << std::endl;
878
+ oss << " num_bits : " << get_capacity() << std::endl;
879
+ oss << " num_hashes : " << num_hashes_ << std::endl;
880
+ oss << " seed : " << seed_ << std::endl;
881
+ oss << " is_dirty : " << (is_dirty_ ? "true" : "false") << std::endl;
882
+ oss << " bits_used : " << num_bits_set << std::endl;
883
+ oss << " fill % : " << (num_bits_set * 100.0) / get_capacity() << std::endl;
884
+ oss << "### End filter summary" << std::endl;
885
+
886
+ if (print_filter) {
887
+ uint64_t num_blocks = capacity_bits_ >> 6; // groups of 64 bits
888
+ for (uint64_t i = 0; i < num_blocks; ++i) {
889
+ oss << i << ": ";
890
+ for (uint64_t j = 0; j < 8; ++j) { // bytes w/in a block
891
+ for (uint64_t b = 0; b < 8; ++b) { // bits w/in a byte
892
+ oss << ((bit_array_[i * 8 + j] & (1 << b)) ? "1" : "0");
893
+ }
894
+ oss << " ";
895
+ }
896
+ oss << std::endl;
897
+ }
898
+ oss << std::endl;
899
+ }
900
+
901
+ oss << std::endl;
902
+ return string<A>(oss.str(), allocator_);
903
+ }
904
+
905
+
906
+ } // namespace datasketches
907
+
908
+ #endif // _BLOOM_FILTER_IMPL_HPP_