datasketches 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +17 -9
@@ -0,0 +1,908 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _BLOOM_FILTER_IMPL_HPP_
21
+ #define _BLOOM_FILTER_IMPL_HPP_
22
+
23
#include <algorithm>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <vector>

#include "common_defs.hpp"
#include "bit_array_ops.hpp"
#include "memory_operations.hpp"
#include "xxhash64.h"
32
+
33
+ // memory scenarios:
34
+ // * on-heap: owned, bit_array_ set, memory_ null
35
+ // * direct: not owned, bit_array_ set, memory_ set
36
+ // * read-only is an option for direct (wrapped) memory
37
+
38
+ namespace datasketches {
39
+
40
+ template<typename A>
41
+ bloom_filter_alloc<A>::bloom_filter_alloc(uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator) :
42
+ allocator_(allocator),
43
+ seed_(seed),
44
+ num_hashes_(num_hashes),
45
+ is_dirty_(false),
46
+ is_owned_(true),
47
+ is_read_only_(false),
48
+ capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks
49
+ num_bits_set_(0)
50
+ {
51
+ if (num_hashes == 0) {
52
+ throw std::invalid_argument("Must have at least 1 hash function");
53
+ }
54
+ if (num_bits == 0) {
55
+ throw std::invalid_argument("Number of bits must be greater than zero");
56
+ } else if (num_bits > MAX_FILTER_SIZE_BITS) {
57
+ throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits");
58
+ }
59
+
60
+ const uint64_t num_bytes = capacity_bits_ >> 3;
61
+ bit_array_ = AllocUint8(allocator_).allocate(num_bytes);
62
+ std::fill_n(bit_array_, num_bytes, 0);
63
+ if (bit_array_ == nullptr) {
64
+ throw std::bad_alloc();
65
+ }
66
+ memory_ = nullptr;
67
+ }
68
+
69
+ template<typename A>
70
+ bloom_filter_alloc<A>::bloom_filter_alloc(uint8_t* memory,
71
+ size_t length_bytes,
72
+ uint64_t num_bits,
73
+ uint16_t num_hashes,
74
+ uint64_t seed,
75
+ const A& allocator) :
76
+ allocator_(allocator),
77
+ seed_(seed),
78
+ num_hashes_(num_hashes),
79
+ is_dirty_(false),
80
+ is_owned_(false),
81
+ is_read_only_(false),
82
+ capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks
83
+ num_bits_set_(0)
84
+ {
85
+ if (num_hashes == 0) {
86
+ throw std::invalid_argument("Must have at least 1 hash function");
87
+ }
88
+ if (num_bits == 0) {
89
+ throw std::invalid_argument("Number of bits must be greater than zero");
90
+ } else if (num_bits > MAX_FILTER_SIZE_BITS) {
91
+ throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits");
92
+ }
93
+
94
+ const size_t num_bytes = get_serialized_size_bytes(capacity_bits_);
95
+ if (length_bytes < num_bytes) {
96
+ throw std::invalid_argument("Input memory block is too small");
97
+ }
98
+
99
+ // fill in header info
100
+ uint8_t* ptr = memory;
101
+ const uint8_t preamble_longs = PREAMBLE_LONGS_STANDARD; // no resizing so assume non-empty
102
+ ptr += copy_to_mem(preamble_longs, ptr);
103
+ const uint8_t serial_version = SER_VER;
104
+ ptr += copy_to_mem(serial_version, ptr);
105
+ const uint8_t family = FAMILY_ID;
106
+ ptr += copy_to_mem(family, ptr);
107
+ const uint8_t flags_byte = 0; // again, assuming non-empty
108
+ ptr += copy_to_mem(flags_byte, ptr);
109
+
110
+ ptr += copy_to_mem(num_hashes_, ptr);
111
+ ptr += copy_to_mem(static_cast<uint16_t>(0), ptr); // 2 bytes unused
112
+ ptr += copy_to_mem(seed_, ptr);
113
+ ptr += copy_to_mem(static_cast<int32_t>(capacity_bits_ >> 6), ptr); // sized in java longs
114
+ ptr += copy_to_mem(static_cast<uint32_t>(0), ptr); // 4 bytes unused
115
+
116
+ // rest of memory is num bits and bit array, so start with zeroes
117
+ std::fill_n(ptr, sizeof(uint64_t) * ((capacity_bits_ >> 6) + 1), 0);
118
+ bit_array_ = memory + BIT_ARRAY_OFFSET_BYTES;
119
+ memory_ = memory;
120
+ }
121
+
122
+ template<typename A>
123
+ bloom_filter_alloc<A>::bloom_filter_alloc(uint64_t seed,
124
+ uint16_t num_hashes,
125
+ bool is_dirty,
126
+ bool is_owned,
127
+ bool is_read_only,
128
+ uint64_t capacity_bits,
129
+ uint64_t num_bits_set,
130
+ uint8_t* bit_array,
131
+ uint8_t* memory,
132
+ const A& allocator) :
133
+ allocator_(allocator),
134
+ seed_(seed),
135
+ num_hashes_(num_hashes),
136
+ is_dirty_(is_dirty),
137
+ is_owned_(is_owned),
138
+ is_read_only_(is_read_only),
139
+ capacity_bits_((capacity_bits + 63) & ~0x3F),
140
+ num_bits_set_(num_bits_set),
141
+ bit_array_(bit_array),
142
+ memory_(memory)
143
+ {
144
+ // private constructor
145
+ // no consistency checks since we should have done those prior to calling this
146
+ if (is_read_only_ && memory_ != nullptr && num_bits_set == DIRTY_BITS_VALUE) {
147
+ num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3);
148
+ }
149
+ }
150
+
151
+ template<typename A>
152
+ bloom_filter_alloc<A>::bloom_filter_alloc(const bloom_filter_alloc& other) :
153
+ allocator_(other.allocator_),
154
+ seed_(other.seed_),
155
+ num_hashes_(other.num_hashes_),
156
+ is_dirty_(other.is_dirty_),
157
+ is_owned_(other.is_owned_),
158
+ is_read_only_(other.is_read_only_),
159
+ capacity_bits_(other.capacity_bits_),
160
+ num_bits_set_(other.num_bits_set_)
161
+ {
162
+ if (is_owned_) {
163
+ const size_t num_bytes = capacity_bits_ >> 3;
164
+ bit_array_ = AllocUint8(allocator_).allocate(num_bytes);
165
+ if (bit_array_ == nullptr) {
166
+ throw std::bad_alloc();
167
+ }
168
+ std::copy_n(other.bit_array_, num_bytes, bit_array_);
169
+ memory_ = nullptr;
170
+ } else {
171
+ bit_array_ = other.bit_array_;
172
+ memory_ = other.memory_;
173
+ }
174
+ }
175
+
176
+ template<typename A>
177
+ bloom_filter_alloc<A>::bloom_filter_alloc(bloom_filter_alloc&& other) noexcept :
178
+ allocator_(std::move(other.allocator_)),
179
+ seed_(other.seed_),
180
+ num_hashes_(other.num_hashes_),
181
+ is_dirty_(other.is_dirty_),
182
+ is_owned_(other.is_owned_),
183
+ is_read_only_(other.is_read_only_),
184
+ capacity_bits_(other.capacity_bits_),
185
+ num_bits_set_(other.num_bits_set_),
186
+ bit_array_(std::move(other.bit_array_)),
187
+ memory_(std::move(other.memory_))
188
+ {
189
+ // ensure destructor on other will behave nicely
190
+ other.is_owned_ = false;
191
+ other.bit_array_ = nullptr;
192
+ other.memory_ = nullptr;
193
+ }
194
+
195
+ template<typename A>
196
+ bloom_filter_alloc<A>& bloom_filter_alloc<A>::operator=(const bloom_filter_alloc& other) {
197
+ bloom_filter_alloc<A> copy(other);
198
+ std::swap(allocator_, copy.allocator_);
199
+ std::swap(seed_, copy.seed_);
200
+ std::swap(num_hashes_, copy.num_hashes_);
201
+ std::swap(is_dirty_, copy.is_dirty_);
202
+ std::swap(is_owned_, copy.is_owned_);
203
+ std::swap(is_read_only_, copy.is_read_only_);
204
+ std::swap(capacity_bits_, copy.capacity_bits_);
205
+ std::swap(num_bits_set_, copy.num_bits_set_);
206
+ std::swap(bit_array_, copy.bit_array_);
207
+ std::swap(memory_, copy.memory_);
208
+ return *this;
209
+ }
210
+
211
+ template<typename A>
212
+ bloom_filter_alloc<A>& bloom_filter_alloc<A>::operator=(bloom_filter_alloc&& other) {
213
+ if (this == &other) { return *this; }
214
+ std::swap(allocator_, other.allocator_);
215
+ std::swap(seed_, other.seed_);
216
+ std::swap(num_hashes_, other.num_hashes_);
217
+ std::swap(is_dirty_, other.is_dirty_);
218
+ std::swap(is_owned_, other.is_owned_);
219
+ std::swap(is_read_only_, other.is_read_only_);
220
+ std::swap(capacity_bits_, other.capacity_bits_);
221
+ std::swap(num_bits_set_, other.num_bits_set_);
222
+ std::swap(bit_array_, other.bit_array_);
223
+ std::swap(memory_, other.memory_);
224
+ return *this;
225
+ }
226
+
227
+ template<typename A>
228
+ bloom_filter_alloc<A>::~bloom_filter_alloc() {
229
+ if (is_owned_) {
230
+ if (memory_ != nullptr) {
231
+ // deallocate total memory_ block, including preamble
232
+ AllocUint8(allocator_).deallocate(memory_, (capacity_bits_ >> 3) + BIT_ARRAY_OFFSET_BYTES);
233
+ } else if (bit_array_ != nullptr) {
234
+ // only need to deallocate bit_array_
235
+ AllocUint8(allocator_).deallocate(bit_array_, capacity_bits_ >> 3);
236
+ }
237
+ memory_ = nullptr;
238
+ bit_array_ = nullptr;
239
+ }
240
+ }
241
+
242
+ template<typename A>
243
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::deserialize(const void* bytes, size_t length_bytes, const A& allocator) {
244
+ // not wrapping so we can cast away const as we're not modifying the memory
245
+ return internal_deserialize_or_wrap(const_cast<void*>(bytes), length_bytes, false, false, allocator);
246
+ }
247
+
248
+ /*
249
+ * A Bloom Filter's serialized image always uses 3 longs of preamble when empty,
250
+ * otherwise 4 longs:
251
+ *
252
+ * <pre>
253
+ * Long || Start Byte Adr:
254
+ * Adr:
255
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
256
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |----Num Hashes---|-----Unused------|
257
+ *
258
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
259
+ * 1 ||---------------------------------Hash Seed-------------------------------------|
260
+ *
261
+ * || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
262
+ * 2 ||-------BitArray Length (in longs)----------|-----------Unused------------------|
263
+ *
264
+ * || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
265
+ * 3 ||---------------------------------NumBitsSet------------------------------------|
266
+ * </pre>
267
+ *
268
+ * The raw BitArray bits, if non-empty start at byte 32.
269
+ */
270
+
271
+ template<typename A>
272
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::deserialize(std::istream& is, const A& allocator) {
273
+ const uint8_t prelongs = read<uint8_t>(is);
274
+ const uint8_t ser_ver = read<uint8_t>(is);
275
+ const uint8_t family = read<uint8_t>(is);
276
+ const uint8_t flags = read<uint8_t>(is);
277
+
278
+ if (prelongs < 1 || prelongs > 4) {
279
+ throw std::invalid_argument("Possible corruption: Incorrect number of preamble bytes specified in header");
280
+ }
281
+ if (ser_ver != SER_VER) {
282
+ throw std::invalid_argument("Possible corruption: Unrecognized serialization version: " + std::to_string(ser_ver));
283
+ }
284
+ if (family != FAMILY_ID) {
285
+ throw std::invalid_argument("Possible corruption: Incorrect Family ID for bloom filter. Found: " + std::to_string(family));
286
+ }
287
+
288
+ const bool is_empty = (flags & EMPTY_FLAG_MASK) != 0;
289
+
290
+ const uint16_t num_hashes = read<uint16_t>(is);
291
+ read<uint16_t>(is); // unused
292
+ const uint64_t seed = read<uint64_t>(is);
293
+ const uint32_t num_longs = read<uint32_t>(is); // sized in java longs
294
+ read<uint32_t>(is); // unused
295
+
296
+ // if empty, stop reading
297
+ if (is_empty) {
298
+ return bloom_filter_alloc<A>(num_longs << 6, num_hashes, seed, allocator);
299
+ }
300
+
301
+ const uint64_t num_bits_set = read<uint64_t>(is);
302
+ const bool is_dirty = (num_bits_set == DIRTY_BITS_VALUE);
303
+
304
+ // allocate memory
305
+ const uint64_t num_bytes = num_longs << 3;
306
+ AllocUint8 alloc(allocator);
307
+ uint8_t* bit_array = alloc.allocate(num_bytes);
308
+ if (bit_array == nullptr) {
309
+ throw std::bad_alloc();
310
+ }
311
+ read(is, bit_array, num_bytes);
312
+
313
+ // pass to constructor
314
+ return bloom_filter_alloc<A>(seed, num_hashes, is_dirty, true, false, num_longs << 6, num_bits_set, bit_array, nullptr, allocator);
315
+ }
316
+
317
+ template<typename A>
318
+ const bloom_filter_alloc<A> bloom_filter_alloc<A>::wrap(const void* bytes, size_t length_bytes, const A& allocator) {
319
+ // read-only flag means we won't modify the memory, but cast away the const
320
+ return internal_deserialize_or_wrap(const_cast<void*>(bytes), length_bytes, true, true, allocator);
321
+ }
322
+
323
+ template<typename A>
324
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::writable_wrap(void* bytes, size_t length_bytes, const A& allocator) {
325
+ return internal_deserialize_or_wrap(bytes, length_bytes, false, true, allocator);
326
+ }
327
+
328
+ template<typename A>
329
+ bloom_filter_alloc<A> bloom_filter_alloc<A>::internal_deserialize_or_wrap(void* bytes,
330
+ size_t length_bytes,
331
+ bool read_only,
332
+ bool wrap,
333
+ const A& allocator)
334
+ {
335
+ ensure_minimum_memory(length_bytes, 8);
336
+ if (bytes == nullptr) {
337
+ throw std::invalid_argument("Input data is null or empty");
338
+ }
339
+ const uint8_t* ptr = static_cast<const uint8_t*>(bytes);
340
+ const uint8_t* end_ptr = ptr + length_bytes;
341
+ const uint8_t prelongs = *ptr++;
342
+ const uint8_t ser_ver = *ptr++;
343
+ const uint8_t family = *ptr++;
344
+ const uint8_t flags = *ptr++;
345
+
346
+ if (prelongs < PREAMBLE_LONGS_EMPTY || prelongs > PREAMBLE_LONGS_STANDARD) {
347
+ throw std::invalid_argument("Possible corruption: Incorrect number of preamble bytes specified in header");
348
+ }
349
+ if (ser_ver != SER_VER) {
350
+ throw std::invalid_argument("Possible corruption: Unrecognized serialization version: " + std::to_string(ser_ver));
351
+ }
352
+ if (family != FAMILY_ID) {
353
+ throw std::invalid_argument("Possible corruption: Incorrect Family ID for bloom filter. Found: " + std::to_string(family));
354
+ }
355
+
356
+ const bool is_empty = (flags & EMPTY_FLAG_MASK) != 0;
357
+
358
+ ensure_minimum_memory(length_bytes, prelongs * sizeof(uint64_t));
359
+
360
+ uint16_t num_hashes;
361
+ ptr += copy_from_mem(ptr, num_hashes);
362
+ ptr += sizeof(uint16_t); // 16 bits unused after num_hashes
363
+ uint64_t seed;
364
+ ptr += copy_from_mem(ptr, seed);
365
+
366
+ uint32_t num_longs;
367
+ ptr += copy_from_mem(ptr, num_longs); // sized in java longs
368
+ ptr += sizeof(uint32_t); // unused 32 bits follow
369
+
370
+ // if empty, stop reading
371
+ if (wrap && is_empty && !read_only) {
372
+ throw std::invalid_argument("Cannot wrap an empty filter for writing");
373
+ } else if (is_empty) {
374
+ return bloom_filter_alloc<A>(num_longs << 6, num_hashes, seed, allocator);
375
+ }
376
+
377
+ uint64_t num_bits_set;
378
+ ptr += copy_from_mem(ptr, num_bits_set);
379
+ const bool is_dirty = (num_bits_set == DIRTY_BITS_VALUE);
380
+
381
+ uint8_t* bit_array;
382
+ uint8_t* memory;
383
+ if (wrap) {
384
+ memory = static_cast<uint8_t*>(bytes);
385
+ bit_array = memory + BIT_ARRAY_OFFSET_BYTES;
386
+ } else {
387
+ // allocate memory
388
+ memory = nullptr;
389
+ const uint64_t num_bytes = num_longs << 3;
390
+ ensure_minimum_memory(end_ptr - ptr, num_bytes);
391
+ AllocUint8 alloc(allocator);
392
+ bit_array = alloc.allocate(num_bytes);
393
+ if (bit_array == nullptr) {
394
+ throw std::bad_alloc();
395
+ }
396
+ copy_from_mem(ptr, bit_array, num_bytes);
397
+ }
398
+
399
+ // pass to constructor -- !wrap == is_owned_
400
+ return bloom_filter_alloc<A>(seed, num_hashes, is_dirty, !wrap, read_only, num_longs << 6, num_bits_set, bit_array, memory, allocator);
401
+ }
402
+
403
+ template<typename A>
404
+ void bloom_filter_alloc<A>::serialize(std::ostream& os) const {
405
+ // Should we serialize memory_ directly if it exists?
406
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD;
407
+ write(os, preamble_longs);
408
+ const uint8_t serial_version = SER_VER;
409
+ write(os, serial_version);
410
+ const uint8_t family = FAMILY_ID;
411
+ write(os, family);
412
+ const uint8_t flags_byte = is_empty() ? EMPTY_FLAG_MASK : 0;
413
+ write(os, flags_byte);
414
+
415
+ write(os, num_hashes_);
416
+ write(os, static_cast<uint16_t>(0)); // 2 bytes unused
417
+ write(os, seed_);
418
+ write(os, static_cast<int32_t>(capacity_bits_ >> 6)); // sized in java longs
419
+ write(os, static_cast<uint32_t>(0)); // 4 bytes unused
420
+
421
+ if (!is_empty()) {
422
+ write(os, is_dirty_ ? DIRTY_BITS_VALUE : num_bits_set_);
423
+ write(os, bit_array_, capacity_bits_ >> 3);
424
+ }
425
+
426
+ os.flush();
427
+ }
428
+
429
+ template<typename A>
430
+ auto bloom_filter_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
431
+ // Should we serialize memory_ directly if it exists?
432
+ const size_t size = header_size_bytes + get_serialized_size_bytes();
433
+ vector_bytes bytes(size, 0, allocator_);
434
+ uint8_t* ptr = bytes.data() + header_size_bytes;
435
+
436
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD;
437
+ ptr += copy_to_mem(preamble_longs, ptr);
438
+ const uint8_t serial_version = SER_VER;
439
+ ptr += copy_to_mem(serial_version, ptr);
440
+ const uint8_t family = FAMILY_ID;
441
+ ptr += copy_to_mem(family, ptr);
442
+ const uint8_t flags_byte = is_empty() ? EMPTY_FLAG_MASK : 0;
443
+ ptr += copy_to_mem(flags_byte, ptr);
444
+
445
+ ptr += copy_to_mem(num_hashes_, ptr);
446
+ ptr += copy_to_mem(static_cast<uint16_t>(0), ptr); // 2 bytes unused
447
+ ptr += copy_to_mem(seed_, ptr);
448
+ ptr += copy_to_mem(static_cast<int32_t>(capacity_bits_ >> 6), ptr); // sized in java longs
449
+ ptr += copy_to_mem(static_cast<uint32_t>(0), ptr); // 4 bytes unused
450
+
451
+ if (!is_empty()) {
452
+ ptr += copy_to_mem(is_dirty_ ? DIRTY_BITS_VALUE : num_bits_set_, ptr);
453
+ ptr += copy_to_mem(bit_array_, ptr, capacity_bits_ >> 3);
454
+ }
455
+
456
+ return bytes;
457
+ }
458
+
459
+ template<typename A>
460
+ size_t bloom_filter_alloc<A>::get_serialized_size_bytes() const {
461
+ return sizeof(uint64_t) * (is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD + (capacity_bits_ >> 6));
462
+ }
463
+
464
+ template<typename A>
465
+ size_t bloom_filter_alloc<A>::get_serialized_size_bytes(uint64_t num_bits) {
466
+ if (num_bits == 0)
467
+ throw std::invalid_argument("Number of bits must be greater than zero");
468
+
469
+ size_t num_bytes = (num_bits + 63) >> 6;
470
+ return sizeof(uint64_t) * (PREAMBLE_LONGS_STANDARD + num_bytes);
471
+ }
472
+
473
+ template<typename A>
474
+ bool bloom_filter_alloc<A>::is_empty() const {
475
+ return !is_dirty_ && num_bits_set_ == 0;
476
+ }
477
+
478
+ template<typename A>
479
+ uint64_t bloom_filter_alloc<A>::get_bits_used() {
480
+ if (is_dirty_) {
481
+ num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3);
482
+ is_dirty_ = false;
483
+ }
484
+ return num_bits_set_;
485
+ }
486
+
487
+ template<typename A>
488
+ uint64_t bloom_filter_alloc<A>::get_capacity() const {
489
+ return capacity_bits_;
490
+ }
491
+
492
+ template<typename A>
493
+ uint16_t bloom_filter_alloc<A>::get_num_hashes() const {
494
+ return num_hashes_;
495
+ }
496
+
497
+ template<typename A>
498
+ uint64_t bloom_filter_alloc<A>::get_seed() const {
499
+ return seed_;
500
+ }
501
+
502
+ template<typename A>
503
+ bool bloom_filter_alloc<A>::is_read_only() const {
504
+ return is_read_only_;
505
+ }
506
+
507
+ template<typename A>
508
+ bool bloom_filter_alloc<A>::is_wrapped() const {
509
+ return memory_ != nullptr;
510
+ }
511
+
512
+ template<typename A>
513
+ bool bloom_filter_alloc<A>::is_memory_owned() const {
514
+ return is_owned_;
515
+ }
516
+
517
+ template<typename A>
518
+ const uint8_t* bloom_filter_alloc<A>::get_wrapped_memory() const {
519
+ return memory_;
520
+ }
521
+
522
+ template<typename A>
523
+ void bloom_filter_alloc<A>::reset() {
524
+ if (is_read_only_) {
525
+ throw std::logic_error("Cannot reset a read-only filter");
526
+ }
527
+ update_num_bits_set(0);
528
+ std::fill_n(bit_array_, capacity_bits_ >> 3, 0);
529
+ }
530
+
531
+ template<typename A>
532
+ void bloom_filter_alloc<A>::update_num_bits_set(uint64_t num_bits_set) {
533
+ num_bits_set_ = num_bits_set;
534
+ is_dirty_ = false;
535
+ if (memory_ != nullptr && !is_read_only_) {
536
+ copy_to_mem(num_bits_set_, memory_ + NUM_BITS_SET_OFFSET_BYTES);
537
+ }
538
+ }
539
+
540
+ // UPDATE METHODS
541
+
542
+ template<typename A>
543
+ void bloom_filter_alloc<A>::update(const std::string& item) {
544
+ if (item.empty()) return;
545
+ const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_);
546
+ const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0);
547
+ internal_update(h0, h1);
548
+ }
549
+
550
+ template<typename A>
551
+ void bloom_filter_alloc<A>::update(uint64_t item) {
552
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
553
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
554
+ internal_update(h0, h1);
555
+ }
556
+
557
+ template<typename A>
558
+ void bloom_filter_alloc<A>::update(uint32_t item) {
559
+ update(static_cast<uint64_t>(item));
560
+ }
561
+
562
+ template<typename A>
563
+ void bloom_filter_alloc<A>::update(uint16_t item) {
564
+ update(static_cast<uint64_t>(item));
565
+ }
566
+
567
+ template<typename A>
568
+ void bloom_filter_alloc<A>::update(uint8_t item) {
569
+ update(static_cast<uint64_t>(item));
570
+ }
571
+
572
+ template<typename A>
573
+ void bloom_filter_alloc<A>::update(int64_t item) {
574
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
575
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
576
+ internal_update(h0, h1);
577
+ }
578
+
579
+ template<typename A>
580
+ void bloom_filter_alloc<A>::update(int32_t item) {
581
+ update(static_cast<int64_t>(item));
582
+ }
583
+
584
+ template<typename A>
585
+ void bloom_filter_alloc<A>::update(int16_t item) {
586
+ update(static_cast<int64_t>(item));
587
+ }
588
+
589
+ template<typename A>
590
+ void bloom_filter_alloc<A>::update(int8_t item) {
591
+ update(static_cast<int64_t>(item));
592
+ }
593
+
594
+ template<typename A>
595
+ void bloom_filter_alloc<A>::update(double item) {
596
+ union {
597
+ int64_t long_value;
598
+ double double_value;
599
+ } ldu;
600
+ ldu.double_value = static_cast<double>(item);
601
+ if (item == 0.0) {
602
+ ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
603
+ } else if (std::isnan(ldu.double_value)) {
604
+ ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
605
+ }
606
+ const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_);
607
+ const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0);
608
+ internal_update(h0, h1);
609
+ }
610
+
611
+ template<typename A>
612
+ void bloom_filter_alloc<A>::update(float item) {
613
+ update(static_cast<double>(item));
614
+ }
615
+
616
+ template<typename A>
617
+ void bloom_filter_alloc<A>::update(const void* item, size_t size) {
618
+ if (item == nullptr || size == 0) return;
619
+ const uint64_t h0 = XXHash64::hash(item, size, seed_);
620
+ const uint64_t h1 = XXHash64::hash(item, size, h0);
621
+ internal_update(h0, h1);
622
+ }
623
+
624
+ template<typename A>
625
+ void bloom_filter_alloc<A>::internal_update(uint64_t h0, uint64_t h1) {
626
+ if (is_read_only_) {
627
+ throw std::logic_error("Cannot update a read-only filter");
628
+ }
629
+ const uint64_t num_bits = get_capacity();
630
+ for (uint16_t i = 1; i <= num_hashes_; i++) {
631
+ const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits;
632
+ bit_array_ops::set_bit(bit_array_, hash_index);
633
+ }
634
+ is_dirty_ = true;
635
+ }
636
+
637
+ // QUERY-AND-UPDATE METHODS
638
+
639
+ template<typename A>
640
+ bool bloom_filter_alloc<A>::query_and_update(const std::string& item) {
641
+ if (item.empty()) return false;
642
+ const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_);
643
+ const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0);
644
+ return internal_query_and_update(h0, h1);
645
+ }
646
+
647
+ template<typename A>
648
+ bool bloom_filter_alloc<A>::query_and_update(uint64_t item) {
649
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
650
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
651
+ return internal_query_and_update(h0, h1);
652
+ }
653
+
654
+ template<typename A>
655
+ bool bloom_filter_alloc<A>::query_and_update(uint32_t item) {
656
+ return query_and_update(static_cast<uint64_t>(item));
657
+ }
658
+
659
+ template<typename A>
660
+ bool bloom_filter_alloc<A>::query_and_update(uint16_t item) {
661
+ return query_and_update(static_cast<uint64_t>(item));
662
+ }
663
+
664
+ template<typename A>
665
+ bool bloom_filter_alloc<A>::query_and_update(uint8_t item) {
666
+ return query_and_update(static_cast<uint64_t>(item));
667
+ }
668
+
669
+ template<typename A>
670
+ bool bloom_filter_alloc<A>::query_and_update(int64_t item) {
671
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
672
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
673
+ return internal_query_and_update(h0, h1);
674
+ }
675
+
676
+ template<typename A>
677
+ bool bloom_filter_alloc<A>::query_and_update(int32_t item) {
678
+ return query_and_update(static_cast<int64_t>(item));
679
+ }
680
+
681
+ template<typename A>
682
+ bool bloom_filter_alloc<A>::query_and_update(int16_t item) {
683
+ return query_and_update(static_cast<int64_t>(item));
684
+ }
685
+
686
+ template<typename A>
687
+ bool bloom_filter_alloc<A>::query_and_update(int8_t item) {
688
+ return query_and_update(static_cast<int64_t>(item));
689
+ }
690
+
691
+ template<typename A>
692
+ bool bloom_filter_alloc<A>::query_and_update(double item) {
693
+ union {
694
+ int64_t long_value;
695
+ double double_value;
696
+ } ldu;
697
+ ldu.double_value = item;
698
+ if (item == 0.0) {
699
+ ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
700
+ } else if (std::isnan(ldu.double_value)) {
701
+ ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
702
+ }
703
+ const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_);
704
+ const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0);
705
+ return internal_query_and_update(h0, h1);
706
+ }
707
+
708
+ template<typename A>
709
+ bool bloom_filter_alloc<A>::query_and_update(float item) {
710
+ return query_and_update(static_cast<double>(item));
711
+ }
712
+
713
+ template<typename A>
714
+ bool bloom_filter_alloc<A>::query_and_update(const void* item, size_t size) {
715
+ if (item == nullptr || size == 0) return false;
716
+ const uint64_t h0 = XXHash64::hash(item, size, seed_);
717
+ const uint64_t h1 = XXHash64::hash(item, size, h0);
718
+ return internal_query_and_update(h0, h1);
719
+ }
720
+
721
+ template<typename A>
722
+ bool bloom_filter_alloc<A>::internal_query_and_update(uint64_t h0, uint64_t h1) {
723
+ if (is_read_only_) {
724
+ throw std::logic_error("Cannot update a read-only filter");
725
+ }
726
+ const uint64_t num_bits = get_capacity();
727
+ bool value_exists = true;
728
+ for (uint16_t i = 1; i <= num_hashes_; i++) {
729
+ const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits;
730
+ bool value = bit_array_ops::get_and_set_bit(bit_array_, hash_index);
731
+ update_num_bits_set(num_bits_set_ + (value ? 0 : 1));
732
+ value_exists &= value;
733
+ }
734
+ return value_exists;
735
+ }
736
+
737
+ // QUERY METHODS
738
+
739
+ template<typename A>
740
+ bool bloom_filter_alloc<A>::query(const std::string& item) const {
741
+ if (item.empty()) return false;
742
+ const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_);
743
+ const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0);
744
+ return internal_query(h0, h1);
745
+ }
746
+
747
+ template<typename A>
748
+ bool bloom_filter_alloc<A>::query(uint64_t item) const {
749
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
750
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
751
+ return internal_query(h0, h1);
752
+ }
753
+
754
+ template<typename A>
755
+ bool bloom_filter_alloc<A>::query(uint32_t item) const {
756
+ return query(static_cast<uint64_t>(item));
757
+ }
758
+
759
+ template<typename A>
760
+ bool bloom_filter_alloc<A>::query(uint16_t item) const {
761
+ return query(static_cast<uint64_t>(item));
762
+ }
763
+
764
+ template<typename A>
765
+ bool bloom_filter_alloc<A>::query(uint8_t item) const {
766
+ return query(static_cast<uint64_t>(item));
767
+ }
768
+
769
+ template<typename A>
770
+ bool bloom_filter_alloc<A>::query(int64_t item) const {
771
+ const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
772
+ const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
773
+ return internal_query(h0, h1);
774
+ }
775
+
776
+ template<typename A>
777
+ bool bloom_filter_alloc<A>::query(int32_t item) const {
778
+ return query(static_cast<int64_t>(item));
779
+ }
780
+
781
+ template<typename A>
782
+ bool bloom_filter_alloc<A>::query(int16_t item) const {
783
+ return query(static_cast<int64_t>(item));
784
+ }
785
+
786
+ template<typename A>
787
+ bool bloom_filter_alloc<A>::query(int8_t item) const {
788
+ return query(static_cast<int64_t>(item));
789
+ }
790
+
791
+ template<typename A>
792
+ bool bloom_filter_alloc<A>::query(double item) const {
793
+ union {
794
+ int64_t long_value;
795
+ double double_value;
796
+ } ldu;
797
+ ldu.double_value = static_cast<double>(item);
798
+ if (item == 0.0) {
799
+ ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
800
+ } else if (std::isnan(ldu.double_value)) {
801
+ ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
802
+ }
803
+ const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_);
804
+ const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0);
805
+ return internal_query(h0, h1);
806
+ }
807
+
808
+ template<typename A>
809
+ bool bloom_filter_alloc<A>::query(float item) const {
810
+ return query(static_cast<double>(item));
811
+ }
812
+
813
+ template<typename A>
814
+ bool bloom_filter_alloc<A>::query(const void* item, size_t size) const {
815
+ if (item == nullptr || size == 0) return false;
816
+ const uint64_t h0 = XXHash64::hash(item, size, seed_);
817
+ const uint64_t h1 = XXHash64::hash(item, size, h0);
818
+ return internal_query(h0, h1);
819
+ }
820
+
821
+ template<typename A>
822
+ bool bloom_filter_alloc<A>::internal_query(uint64_t h0, uint64_t h1) const {
823
+ if (is_empty()) return false;
824
+ const uint64_t num_bits = get_capacity();
825
+ for (uint16_t i = 1; i <= num_hashes_; i++) {
826
+ const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits;
827
+ if (!bit_array_ops::get_bit(bit_array_, hash_index))
828
+ return false;
829
+ }
830
+ return true;
831
+ }
832
+
833
+ // OTHER METHODS
834
+
835
+ template<typename A>
836
+ bool bloom_filter_alloc<A>::is_compatible(const bloom_filter_alloc& other) const {
837
+ return seed_ == other.seed_
838
+ && num_hashes_ == other.num_hashes_
839
+ && get_capacity() == other.get_capacity()
840
+ ;
841
+ }
842
+
843
+ template<typename A>
844
+ void bloom_filter_alloc<A>::union_with(const bloom_filter_alloc& other) {
845
+ if (!is_compatible(other)) {
846
+ throw std::invalid_argument("Incompatible bloom filters");
847
+ }
848
+ uint64_t bits_set = bit_array_ops::union_with(bit_array_, other.bit_array_, capacity_bits_ >> 3);
849
+ update_num_bits_set(bits_set);
850
+ }
851
+
852
+ template<typename A>
853
+ void bloom_filter_alloc<A>::intersect(const bloom_filter_alloc& other) {
854
+ if (!is_compatible(other)) {
855
+ throw std::invalid_argument("Incompatible bloom filters");
856
+ }
857
+ uint64_t bits_set = bit_array_ops::intersect(bit_array_, other.bit_array_, capacity_bits_ >> 3);
858
+ update_num_bits_set(bits_set);
859
+ }
860
+
861
+ template<typename A>
862
+ void bloom_filter_alloc<A>::invert() {
863
+ uint64_t bits_set = bit_array_ops::invert(bit_array_, capacity_bits_ >> 3);
864
+ update_num_bits_set(bits_set);
865
+ }
866
+
867
+ template<typename A>
868
+ string<A> bloom_filter_alloc<A>::to_string(bool print_filter) const {
869
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
870
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
871
+ std::ostringstream oss;
872
+ uint64_t num_bits_set = num_bits_set_;
873
+ if (is_dirty_) {
874
+ num_bits_set = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3);
875
+ }
876
+
877
+ oss << "### Bloom Filter Summary:" << std::endl;
878
+ oss << " num_bits : " << get_capacity() << std::endl;
879
+ oss << " num_hashes : " << num_hashes_ << std::endl;
880
+ oss << " seed : " << seed_ << std::endl;
881
+ oss << " is_dirty : " << (is_dirty_ ? "true" : "false") << std::endl;
882
+ oss << " bits_used : " << num_bits_set << std::endl;
883
+ oss << " fill % : " << (num_bits_set * 100.0) / get_capacity() << std::endl;
884
+ oss << "### End filter summary" << std::endl;
885
+
886
+ if (print_filter) {
887
+ uint64_t num_blocks = capacity_bits_ >> 6; // groups of 64 bits
888
+ for (uint64_t i = 0; i < num_blocks; ++i) {
889
+ oss << i << ": ";
890
+ for (uint64_t j = 0; j < 8; ++j) { // bytes w/in a block
891
+ for (uint64_t b = 0; b < 8; ++b) { // bits w/in a byte
892
+ oss << ((bit_array_[i * 8 + j] & (1 << b)) ? "1" : "0");
893
+ }
894
+ oss << " ";
895
+ }
896
+ oss << std::endl;
897
+ }
898
+ oss << std::endl;
899
+ }
900
+
901
+ oss << std::endl;
902
+ return string<A>(oss.str(), allocator_);
903
+ }
904
+
905
+
906
+ } // namespace datasketches
907
+
908
+ #endif // _BLOOM_FILTER_IMPL_HPP_