datasketches 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +18 -10
@@ -0,0 +1,908 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _BLOOM_FILTER_IMPL_HPP_
|
21
|
+
#define _BLOOM_FILTER_IMPL_HPP_
|
22
|
+
|
23
|
+
#include <algorithm>
|
24
|
+
#include <memory>
|
25
|
+
#include <sstream>
|
26
|
+
#include <vector>
|
27
|
+
|
28
|
+
#include "common_defs.hpp"
|
29
|
+
#include "bit_array_ops.hpp"
|
30
|
+
#include "memory_operations.hpp"
|
31
|
+
#include "xxhash64.h"
|
32
|
+
|
33
|
+
// memory scenarios:
|
34
|
+
// * on-heap: owned, bit_array_ set, memory_ null
|
35
|
+
// * direct: not owned, bit_array_ set, memory_ set
|
36
|
+
// * read-only is an option for direct
|
37
|
+
|
38
|
+
namespace datasketches {
|
39
|
+
|
40
|
+
template<typename A>
|
41
|
+
bloom_filter_alloc<A>::bloom_filter_alloc(uint64_t num_bits, uint16_t num_hashes, uint64_t seed, const A& allocator) :
|
42
|
+
allocator_(allocator),
|
43
|
+
seed_(seed),
|
44
|
+
num_hashes_(num_hashes),
|
45
|
+
is_dirty_(false),
|
46
|
+
is_owned_(true),
|
47
|
+
is_read_only_(false),
|
48
|
+
capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks
|
49
|
+
num_bits_set_(0)
|
50
|
+
{
|
51
|
+
if (num_hashes == 0) {
|
52
|
+
throw std::invalid_argument("Must have at least 1 hash function");
|
53
|
+
}
|
54
|
+
if (num_bits == 0) {
|
55
|
+
throw std::invalid_argument("Number of bits must be greater than zero");
|
56
|
+
} else if (num_bits > MAX_FILTER_SIZE_BITS) {
|
57
|
+
throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits");
|
58
|
+
}
|
59
|
+
|
60
|
+
const uint64_t num_bytes = capacity_bits_ >> 3;
|
61
|
+
bit_array_ = AllocUint8(allocator_).allocate(num_bytes);
|
62
|
+
std::fill_n(bit_array_, num_bytes, 0);
|
63
|
+
if (bit_array_ == nullptr) {
|
64
|
+
throw std::bad_alloc();
|
65
|
+
}
|
66
|
+
memory_ = nullptr;
|
67
|
+
}
|
68
|
+
|
69
|
+
template<typename A>
|
70
|
+
bloom_filter_alloc<A>::bloom_filter_alloc(uint8_t* memory,
|
71
|
+
size_t length_bytes,
|
72
|
+
uint64_t num_bits,
|
73
|
+
uint16_t num_hashes,
|
74
|
+
uint64_t seed,
|
75
|
+
const A& allocator) :
|
76
|
+
allocator_(allocator),
|
77
|
+
seed_(seed),
|
78
|
+
num_hashes_(num_hashes),
|
79
|
+
is_dirty_(false),
|
80
|
+
is_owned_(false),
|
81
|
+
is_read_only_(false),
|
82
|
+
capacity_bits_((num_bits + 63) & ~0x3F), // can round to nearest multiple of 64 prior to bounds checks
|
83
|
+
num_bits_set_(0)
|
84
|
+
{
|
85
|
+
if (num_hashes == 0) {
|
86
|
+
throw std::invalid_argument("Must have at least 1 hash function");
|
87
|
+
}
|
88
|
+
if (num_bits == 0) {
|
89
|
+
throw std::invalid_argument("Number of bits must be greater than zero");
|
90
|
+
} else if (num_bits > MAX_FILTER_SIZE_BITS) {
|
91
|
+
throw std::invalid_argument("Filter may not exceed " + std::to_string(MAX_FILTER_SIZE_BITS) + " bits");
|
92
|
+
}
|
93
|
+
|
94
|
+
const size_t num_bytes = get_serialized_size_bytes(capacity_bits_);
|
95
|
+
if (length_bytes < num_bytes) {
|
96
|
+
throw std::invalid_argument("Input memory block is too small");
|
97
|
+
}
|
98
|
+
|
99
|
+
// fill in header info
|
100
|
+
uint8_t* ptr = memory;
|
101
|
+
const uint8_t preamble_longs = PREAMBLE_LONGS_STANDARD; // no resizing so assume non-empty
|
102
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
103
|
+
const uint8_t serial_version = SER_VER;
|
104
|
+
ptr += copy_to_mem(serial_version, ptr);
|
105
|
+
const uint8_t family = FAMILY_ID;
|
106
|
+
ptr += copy_to_mem(family, ptr);
|
107
|
+
const uint8_t flags_byte = 0; // again, assuming non-empty
|
108
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
109
|
+
|
110
|
+
ptr += copy_to_mem(num_hashes_, ptr);
|
111
|
+
ptr += copy_to_mem(static_cast<uint16_t>(0), ptr); // 2 bytes unused
|
112
|
+
ptr += copy_to_mem(seed_, ptr);
|
113
|
+
ptr += copy_to_mem(static_cast<int32_t>(capacity_bits_ >> 6), ptr); // sized in java longs
|
114
|
+
ptr += copy_to_mem(static_cast<uint32_t>(0), ptr); // 4 bytes unused
|
115
|
+
|
116
|
+
// rest of memory is num bits and bit array, so start with zeroes
|
117
|
+
std::fill_n(ptr, sizeof(uint64_t) * ((capacity_bits_ >> 6) + 1), 0);
|
118
|
+
bit_array_ = memory + BIT_ARRAY_OFFSET_BYTES;
|
119
|
+
memory_ = memory;
|
120
|
+
}
|
121
|
+
|
122
|
+
template<typename A>
|
123
|
+
bloom_filter_alloc<A>::bloom_filter_alloc(uint64_t seed,
|
124
|
+
uint16_t num_hashes,
|
125
|
+
bool is_dirty,
|
126
|
+
bool is_owned,
|
127
|
+
bool is_read_only,
|
128
|
+
uint64_t capacity_bits,
|
129
|
+
uint64_t num_bits_set,
|
130
|
+
uint8_t* bit_array,
|
131
|
+
uint8_t* memory,
|
132
|
+
const A& allocator) :
|
133
|
+
allocator_(allocator),
|
134
|
+
seed_(seed),
|
135
|
+
num_hashes_(num_hashes),
|
136
|
+
is_dirty_(is_dirty),
|
137
|
+
is_owned_(is_owned),
|
138
|
+
is_read_only_(is_read_only),
|
139
|
+
capacity_bits_((capacity_bits + 63) & ~0x3F),
|
140
|
+
num_bits_set_(num_bits_set),
|
141
|
+
bit_array_(bit_array),
|
142
|
+
memory_(memory)
|
143
|
+
{
|
144
|
+
// private constructor
|
145
|
+
// no consistency checks since we should have done those prior to calling this
|
146
|
+
if (is_read_only_ && memory_ != nullptr && num_bits_set == DIRTY_BITS_VALUE) {
|
147
|
+
num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3);
|
148
|
+
}
|
149
|
+
}
|
150
|
+
|
151
|
+
template<typename A>
|
152
|
+
bloom_filter_alloc<A>::bloom_filter_alloc(const bloom_filter_alloc& other) :
|
153
|
+
allocator_(other.allocator_),
|
154
|
+
seed_(other.seed_),
|
155
|
+
num_hashes_(other.num_hashes_),
|
156
|
+
is_dirty_(other.is_dirty_),
|
157
|
+
is_owned_(other.is_owned_),
|
158
|
+
is_read_only_(other.is_read_only_),
|
159
|
+
capacity_bits_(other.capacity_bits_),
|
160
|
+
num_bits_set_(other.num_bits_set_)
|
161
|
+
{
|
162
|
+
if (is_owned_) {
|
163
|
+
const size_t num_bytes = capacity_bits_ >> 3;
|
164
|
+
bit_array_ = AllocUint8(allocator_).allocate(num_bytes);
|
165
|
+
if (bit_array_ == nullptr) {
|
166
|
+
throw std::bad_alloc();
|
167
|
+
}
|
168
|
+
std::copy_n(other.bit_array_, num_bytes, bit_array_);
|
169
|
+
memory_ = nullptr;
|
170
|
+
} else {
|
171
|
+
bit_array_ = other.bit_array_;
|
172
|
+
memory_ = other.memory_;
|
173
|
+
}
|
174
|
+
}
|
175
|
+
|
176
|
+
template<typename A>
|
177
|
+
bloom_filter_alloc<A>::bloom_filter_alloc(bloom_filter_alloc&& other) noexcept :
|
178
|
+
allocator_(std::move(other.allocator_)),
|
179
|
+
seed_(other.seed_),
|
180
|
+
num_hashes_(other.num_hashes_),
|
181
|
+
is_dirty_(other.is_dirty_),
|
182
|
+
is_owned_(other.is_owned_),
|
183
|
+
is_read_only_(other.is_read_only_),
|
184
|
+
capacity_bits_(other.capacity_bits_),
|
185
|
+
num_bits_set_(other.num_bits_set_),
|
186
|
+
bit_array_(std::move(other.bit_array_)),
|
187
|
+
memory_(std::move(other.memory_))
|
188
|
+
{
|
189
|
+
// ensure destructor on other will behave nicely
|
190
|
+
other.is_owned_ = false;
|
191
|
+
other.bit_array_ = nullptr;
|
192
|
+
other.memory_ = nullptr;
|
193
|
+
}
|
194
|
+
|
195
|
+
template<typename A>
|
196
|
+
bloom_filter_alloc<A>& bloom_filter_alloc<A>::operator=(const bloom_filter_alloc& other) {
|
197
|
+
bloom_filter_alloc<A> copy(other);
|
198
|
+
std::swap(allocator_, copy.allocator_);
|
199
|
+
std::swap(seed_, copy.seed_);
|
200
|
+
std::swap(num_hashes_, copy.num_hashes_);
|
201
|
+
std::swap(is_dirty_, copy.is_dirty_);
|
202
|
+
std::swap(is_owned_, copy.is_owned_);
|
203
|
+
std::swap(is_read_only_, copy.is_read_only_);
|
204
|
+
std::swap(capacity_bits_, copy.capacity_bits_);
|
205
|
+
std::swap(num_bits_set_, copy.num_bits_set_);
|
206
|
+
std::swap(bit_array_, copy.bit_array_);
|
207
|
+
std::swap(memory_, copy.memory_);
|
208
|
+
return *this;
|
209
|
+
}
|
210
|
+
|
211
|
+
template<typename A>
|
212
|
+
bloom_filter_alloc<A>& bloom_filter_alloc<A>::operator=(bloom_filter_alloc&& other) {
|
213
|
+
if (this == &other) { return *this; }
|
214
|
+
std::swap(allocator_, other.allocator_);
|
215
|
+
std::swap(seed_, other.seed_);
|
216
|
+
std::swap(num_hashes_, other.num_hashes_);
|
217
|
+
std::swap(is_dirty_, other.is_dirty_);
|
218
|
+
std::swap(is_owned_, other.is_owned_);
|
219
|
+
std::swap(is_read_only_, other.is_read_only_);
|
220
|
+
std::swap(capacity_bits_, other.capacity_bits_);
|
221
|
+
std::swap(num_bits_set_, other.num_bits_set_);
|
222
|
+
std::swap(bit_array_, other.bit_array_);
|
223
|
+
std::swap(memory_, other.memory_);
|
224
|
+
return *this;
|
225
|
+
}
|
226
|
+
|
227
|
+
template<typename A>
|
228
|
+
bloom_filter_alloc<A>::~bloom_filter_alloc() {
|
229
|
+
if (is_owned_) {
|
230
|
+
if (memory_ != nullptr) {
|
231
|
+
// deallocate total memory_ block, including preamble
|
232
|
+
AllocUint8(allocator_).deallocate(memory_, (capacity_bits_ >> 3) + BIT_ARRAY_OFFSET_BYTES);
|
233
|
+
} else if (bit_array_ != nullptr) {
|
234
|
+
// only need to deallocate bit_array_
|
235
|
+
AllocUint8(allocator_).deallocate(bit_array_, capacity_bits_ >> 3);
|
236
|
+
}
|
237
|
+
memory_ = nullptr;
|
238
|
+
bit_array_ = nullptr;
|
239
|
+
}
|
240
|
+
}
|
241
|
+
|
242
|
+
template<typename A>
|
243
|
+
bloom_filter_alloc<A> bloom_filter_alloc<A>::deserialize(const void* bytes, size_t length_bytes, const A& allocator) {
|
244
|
+
// not wrapping so we can cast away const as we're not modifying the memory
|
245
|
+
return internal_deserialize_or_wrap(const_cast<void*>(bytes), length_bytes, false, false, allocator);
|
246
|
+
}
|
247
|
+
|
248
|
+
/*
|
249
|
+
* A Bloom Filter's serialized image always uses 3 longs of preamble when empty,
|
250
|
+
* otherwise 4 longs:
|
251
|
+
*
|
252
|
+
* <pre>
|
253
|
+
* Long || Start Byte Adr:
|
254
|
+
* Adr:
|
255
|
+
* || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
256
|
+
* 0 || Preamble_Longs | SerVer | FamID | Flags |----Num Hashes---|-----Unused------|
|
257
|
+
*
|
258
|
+
* || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
|
259
|
+
* 1 ||---------------------------------Hash Seed-------------------------------------|
|
260
|
+
*
|
261
|
+
* || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|
262
|
+
* 2 ||-------BitArray Length (in longs)----------|-----------Unused------------------|
|
263
|
+
*
|
264
|
+
* || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
|
265
|
+
* 3 ||---------------------------------NumBitsSet------------------------------------|
|
266
|
+
* </pre>
|
267
|
+
*
|
268
|
+
* The raw BitArray bits, if non-empty start at byte 32.
|
269
|
+
*/
|
270
|
+
|
271
|
+
template<typename A>
|
272
|
+
bloom_filter_alloc<A> bloom_filter_alloc<A>::deserialize(std::istream& is, const A& allocator) {
|
273
|
+
const uint8_t prelongs = read<uint8_t>(is);
|
274
|
+
const uint8_t ser_ver = read<uint8_t>(is);
|
275
|
+
const uint8_t family = read<uint8_t>(is);
|
276
|
+
const uint8_t flags = read<uint8_t>(is);
|
277
|
+
|
278
|
+
if (prelongs < 1 || prelongs > 4) {
|
279
|
+
throw std::invalid_argument("Possible corruption: Incorrect number of preamble bytes specified in header");
|
280
|
+
}
|
281
|
+
if (ser_ver != SER_VER) {
|
282
|
+
throw std::invalid_argument("Possible corruption: Unrecognized serialization version: " + std::to_string(ser_ver));
|
283
|
+
}
|
284
|
+
if (family != FAMILY_ID) {
|
285
|
+
throw std::invalid_argument("Possible corruption: Incorrect Family ID for bloom filter. Found: " + std::to_string(family));
|
286
|
+
}
|
287
|
+
|
288
|
+
const bool is_empty = (flags & EMPTY_FLAG_MASK) != 0;
|
289
|
+
|
290
|
+
const uint16_t num_hashes = read<uint16_t>(is);
|
291
|
+
read<uint16_t>(is); // unused
|
292
|
+
const uint64_t seed = read<uint64_t>(is);
|
293
|
+
const uint32_t num_longs = read<uint32_t>(is); // sized in java longs
|
294
|
+
read<uint32_t>(is); // unused
|
295
|
+
|
296
|
+
// if empty, stop reading
|
297
|
+
if (is_empty) {
|
298
|
+
return bloom_filter_alloc<A>(num_longs << 6, num_hashes, seed, allocator);
|
299
|
+
}
|
300
|
+
|
301
|
+
const uint64_t num_bits_set = read<uint64_t>(is);
|
302
|
+
const bool is_dirty = (num_bits_set == DIRTY_BITS_VALUE);
|
303
|
+
|
304
|
+
// allocate memory
|
305
|
+
const uint64_t num_bytes = num_longs << 3;
|
306
|
+
AllocUint8 alloc(allocator);
|
307
|
+
uint8_t* bit_array = alloc.allocate(num_bytes);
|
308
|
+
if (bit_array == nullptr) {
|
309
|
+
throw std::bad_alloc();
|
310
|
+
}
|
311
|
+
read(is, bit_array, num_bytes);
|
312
|
+
|
313
|
+
// pass to constructor
|
314
|
+
return bloom_filter_alloc<A>(seed, num_hashes, is_dirty, true, false, num_longs << 6, num_bits_set, bit_array, nullptr, allocator);
|
315
|
+
}
|
316
|
+
|
317
|
+
template<typename A>
|
318
|
+
const bloom_filter_alloc<A> bloom_filter_alloc<A>::wrap(const void* bytes, size_t length_bytes, const A& allocator) {
|
319
|
+
// read-only flag means we won't modify the memory, but cast away the const
|
320
|
+
return internal_deserialize_or_wrap(const_cast<void*>(bytes), length_bytes, true, true, allocator);
|
321
|
+
}
|
322
|
+
|
323
|
+
template<typename A>
|
324
|
+
bloom_filter_alloc<A> bloom_filter_alloc<A>::writable_wrap(void* bytes, size_t length_bytes, const A& allocator) {
|
325
|
+
return internal_deserialize_or_wrap(bytes, length_bytes, false, true, allocator);
|
326
|
+
}
|
327
|
+
|
328
|
+
template<typename A>
|
329
|
+
bloom_filter_alloc<A> bloom_filter_alloc<A>::internal_deserialize_or_wrap(void* bytes,
|
330
|
+
size_t length_bytes,
|
331
|
+
bool read_only,
|
332
|
+
bool wrap,
|
333
|
+
const A& allocator)
|
334
|
+
{
|
335
|
+
ensure_minimum_memory(length_bytes, 8);
|
336
|
+
if (bytes == nullptr) {
|
337
|
+
throw std::invalid_argument("Input data is null or empty");
|
338
|
+
}
|
339
|
+
const uint8_t* ptr = static_cast<const uint8_t*>(bytes);
|
340
|
+
const uint8_t* end_ptr = ptr + length_bytes;
|
341
|
+
const uint8_t prelongs = *ptr++;
|
342
|
+
const uint8_t ser_ver = *ptr++;
|
343
|
+
const uint8_t family = *ptr++;
|
344
|
+
const uint8_t flags = *ptr++;
|
345
|
+
|
346
|
+
if (prelongs < PREAMBLE_LONGS_EMPTY || prelongs > PREAMBLE_LONGS_STANDARD) {
|
347
|
+
throw std::invalid_argument("Possible corruption: Incorrect number of preamble bytes specified in header");
|
348
|
+
}
|
349
|
+
if (ser_ver != SER_VER) {
|
350
|
+
throw std::invalid_argument("Possible corruption: Unrecognized serialization version: " + std::to_string(ser_ver));
|
351
|
+
}
|
352
|
+
if (family != FAMILY_ID) {
|
353
|
+
throw std::invalid_argument("Possible corruption: Incorrect Family ID for bloom filter. Found: " + std::to_string(family));
|
354
|
+
}
|
355
|
+
|
356
|
+
const bool is_empty = (flags & EMPTY_FLAG_MASK) != 0;
|
357
|
+
|
358
|
+
ensure_minimum_memory(length_bytes, prelongs * sizeof(uint64_t));
|
359
|
+
|
360
|
+
uint16_t num_hashes;
|
361
|
+
ptr += copy_from_mem(ptr, num_hashes);
|
362
|
+
ptr += sizeof(uint16_t); // 16 bits unused after num_hashes
|
363
|
+
uint64_t seed;
|
364
|
+
ptr += copy_from_mem(ptr, seed);
|
365
|
+
|
366
|
+
uint32_t num_longs;
|
367
|
+
ptr += copy_from_mem(ptr, num_longs); // sized in java longs
|
368
|
+
ptr += sizeof(uint32_t); // unused 32 bits follow
|
369
|
+
|
370
|
+
// if empty, stop reading
|
371
|
+
if (wrap && is_empty && !read_only) {
|
372
|
+
throw std::invalid_argument("Cannot wrap an empty filter for writing");
|
373
|
+
} else if (is_empty) {
|
374
|
+
return bloom_filter_alloc<A>(num_longs << 6, num_hashes, seed, allocator);
|
375
|
+
}
|
376
|
+
|
377
|
+
uint64_t num_bits_set;
|
378
|
+
ptr += copy_from_mem(ptr, num_bits_set);
|
379
|
+
const bool is_dirty = (num_bits_set == DIRTY_BITS_VALUE);
|
380
|
+
|
381
|
+
uint8_t* bit_array;
|
382
|
+
uint8_t* memory;
|
383
|
+
if (wrap) {
|
384
|
+
memory = static_cast<uint8_t*>(bytes);
|
385
|
+
bit_array = memory + BIT_ARRAY_OFFSET_BYTES;
|
386
|
+
} else {
|
387
|
+
// allocate memory
|
388
|
+
memory = nullptr;
|
389
|
+
const uint64_t num_bytes = num_longs << 3;
|
390
|
+
ensure_minimum_memory(end_ptr - ptr, num_bytes);
|
391
|
+
AllocUint8 alloc(allocator);
|
392
|
+
bit_array = alloc.allocate(num_bytes);
|
393
|
+
if (bit_array == nullptr) {
|
394
|
+
throw std::bad_alloc();
|
395
|
+
}
|
396
|
+
copy_from_mem(ptr, bit_array, num_bytes);
|
397
|
+
}
|
398
|
+
|
399
|
+
// pass to constructor -- !wrap == is_owned_
|
400
|
+
return bloom_filter_alloc<A>(seed, num_hashes, is_dirty, !wrap, read_only, num_longs << 6, num_bits_set, bit_array, memory, allocator);
|
401
|
+
}
|
402
|
+
|
403
|
+
template<typename A>
|
404
|
+
void bloom_filter_alloc<A>::serialize(std::ostream& os) const {
|
405
|
+
// Should we serialize memory_ directly if it exists?
|
406
|
+
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD;
|
407
|
+
write(os, preamble_longs);
|
408
|
+
const uint8_t serial_version = SER_VER;
|
409
|
+
write(os, serial_version);
|
410
|
+
const uint8_t family = FAMILY_ID;
|
411
|
+
write(os, family);
|
412
|
+
const uint8_t flags_byte = is_empty() ? EMPTY_FLAG_MASK : 0;
|
413
|
+
write(os, flags_byte);
|
414
|
+
|
415
|
+
write(os, num_hashes_);
|
416
|
+
write(os, static_cast<uint16_t>(0)); // 2 bytes unused
|
417
|
+
write(os, seed_);
|
418
|
+
write(os, static_cast<int32_t>(capacity_bits_ >> 6)); // sized in java longs
|
419
|
+
write(os, static_cast<uint32_t>(0)); // 4 bytes unused
|
420
|
+
|
421
|
+
if (!is_empty()) {
|
422
|
+
write(os, is_dirty_ ? DIRTY_BITS_VALUE : num_bits_set_);
|
423
|
+
write(os, bit_array_, capacity_bits_ >> 3);
|
424
|
+
}
|
425
|
+
|
426
|
+
os.flush();
|
427
|
+
}
|
428
|
+
|
429
|
+
template<typename A>
|
430
|
+
auto bloom_filter_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
431
|
+
// Should we serialize memory_ directly if it exists?
|
432
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
433
|
+
vector_bytes bytes(size, 0, allocator_);
|
434
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
435
|
+
|
436
|
+
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD;
|
437
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
438
|
+
const uint8_t serial_version = SER_VER;
|
439
|
+
ptr += copy_to_mem(serial_version, ptr);
|
440
|
+
const uint8_t family = FAMILY_ID;
|
441
|
+
ptr += copy_to_mem(family, ptr);
|
442
|
+
const uint8_t flags_byte = is_empty() ? EMPTY_FLAG_MASK : 0;
|
443
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
444
|
+
|
445
|
+
ptr += copy_to_mem(num_hashes_, ptr);
|
446
|
+
ptr += copy_to_mem(static_cast<uint16_t>(0), ptr); // 2 bytes unused
|
447
|
+
ptr += copy_to_mem(seed_, ptr);
|
448
|
+
ptr += copy_to_mem(static_cast<int32_t>(capacity_bits_ >> 6), ptr); // sized in java longs
|
449
|
+
ptr += copy_to_mem(static_cast<uint32_t>(0), ptr); // 4 bytes unused
|
450
|
+
|
451
|
+
if (!is_empty()) {
|
452
|
+
ptr += copy_to_mem(is_dirty_ ? DIRTY_BITS_VALUE : num_bits_set_, ptr);
|
453
|
+
ptr += copy_to_mem(bit_array_, ptr, capacity_bits_ >> 3);
|
454
|
+
}
|
455
|
+
|
456
|
+
return bytes;
|
457
|
+
}
|
458
|
+
|
459
|
+
template<typename A>
|
460
|
+
size_t bloom_filter_alloc<A>::get_serialized_size_bytes() const {
|
461
|
+
return sizeof(uint64_t) * (is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_STANDARD + (capacity_bits_ >> 6));
|
462
|
+
}
|
463
|
+
|
464
|
+
template<typename A>
|
465
|
+
size_t bloom_filter_alloc<A>::get_serialized_size_bytes(uint64_t num_bits) {
|
466
|
+
if (num_bits == 0)
|
467
|
+
throw std::invalid_argument("Number of bits must be greater than zero");
|
468
|
+
|
469
|
+
size_t num_bytes = (num_bits + 63) >> 6;
|
470
|
+
return sizeof(uint64_t) * (PREAMBLE_LONGS_STANDARD + num_bytes);
|
471
|
+
}
|
472
|
+
|
473
|
+
template<typename A>
|
474
|
+
bool bloom_filter_alloc<A>::is_empty() const {
|
475
|
+
return !is_dirty_ && num_bits_set_ == 0;
|
476
|
+
}
|
477
|
+
|
478
|
+
template<typename A>
|
479
|
+
uint64_t bloom_filter_alloc<A>::get_bits_used() {
|
480
|
+
if (is_dirty_) {
|
481
|
+
num_bits_set_ = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3);
|
482
|
+
is_dirty_ = false;
|
483
|
+
}
|
484
|
+
return num_bits_set_;
|
485
|
+
}
|
486
|
+
|
487
|
+
template<typename A>
|
488
|
+
uint64_t bloom_filter_alloc<A>::get_capacity() const {
|
489
|
+
return capacity_bits_;
|
490
|
+
}
|
491
|
+
|
492
|
+
template<typename A>
|
493
|
+
uint16_t bloom_filter_alloc<A>::get_num_hashes() const {
|
494
|
+
return num_hashes_;
|
495
|
+
}
|
496
|
+
|
497
|
+
template<typename A>
|
498
|
+
uint64_t bloom_filter_alloc<A>::get_seed() const {
|
499
|
+
return seed_;
|
500
|
+
}
|
501
|
+
|
502
|
+
template<typename A>
|
503
|
+
bool bloom_filter_alloc<A>::is_read_only() const {
|
504
|
+
return is_read_only_;
|
505
|
+
}
|
506
|
+
|
507
|
+
template<typename A>
|
508
|
+
bool bloom_filter_alloc<A>::is_wrapped() const {
|
509
|
+
return memory_ != nullptr;
|
510
|
+
}
|
511
|
+
|
512
|
+
template<typename A>
|
513
|
+
bool bloom_filter_alloc<A>::is_memory_owned() const {
|
514
|
+
return is_owned_;
|
515
|
+
}
|
516
|
+
|
517
|
+
template<typename A>
|
518
|
+
const uint8_t* bloom_filter_alloc<A>::get_wrapped_memory() const {
|
519
|
+
return memory_;
|
520
|
+
}
|
521
|
+
|
522
|
+
template<typename A>
|
523
|
+
void bloom_filter_alloc<A>::reset() {
|
524
|
+
if (is_read_only_) {
|
525
|
+
throw std::logic_error("Cannot reset a read-only filter");
|
526
|
+
}
|
527
|
+
update_num_bits_set(0);
|
528
|
+
std::fill_n(bit_array_, capacity_bits_ >> 3, 0);
|
529
|
+
}
|
530
|
+
|
531
|
+
template<typename A>
|
532
|
+
void bloom_filter_alloc<A>::update_num_bits_set(uint64_t num_bits_set) {
|
533
|
+
num_bits_set_ = num_bits_set;
|
534
|
+
is_dirty_ = false;
|
535
|
+
if (memory_ != nullptr && !is_read_only_) {
|
536
|
+
copy_to_mem(num_bits_set_, memory_ + NUM_BITS_SET_OFFSET_BYTES);
|
537
|
+
}
|
538
|
+
}
|
539
|
+
|
540
|
+
// UPDATE METHODS
|
541
|
+
|
542
|
+
template<typename A>
|
543
|
+
void bloom_filter_alloc<A>::update(const std::string& item) {
|
544
|
+
if (item.empty()) return;
|
545
|
+
const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_);
|
546
|
+
const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0);
|
547
|
+
internal_update(h0, h1);
|
548
|
+
}
|
549
|
+
|
550
|
+
template<typename A>
|
551
|
+
void bloom_filter_alloc<A>::update(uint64_t item) {
|
552
|
+
const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
|
553
|
+
const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
|
554
|
+
internal_update(h0, h1);
|
555
|
+
}
|
556
|
+
|
557
|
+
template<typename A>
|
558
|
+
void bloom_filter_alloc<A>::update(uint32_t item) {
|
559
|
+
update(static_cast<uint64_t>(item));
|
560
|
+
}
|
561
|
+
|
562
|
+
template<typename A>
|
563
|
+
void bloom_filter_alloc<A>::update(uint16_t item) {
|
564
|
+
update(static_cast<uint64_t>(item));
|
565
|
+
}
|
566
|
+
|
567
|
+
template<typename A>
|
568
|
+
void bloom_filter_alloc<A>::update(uint8_t item) {
|
569
|
+
update(static_cast<uint64_t>(item));
|
570
|
+
}
|
571
|
+
|
572
|
+
template<typename A>
|
573
|
+
void bloom_filter_alloc<A>::update(int64_t item) {
|
574
|
+
const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
|
575
|
+
const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
|
576
|
+
internal_update(h0, h1);
|
577
|
+
}
|
578
|
+
|
579
|
+
template<typename A>
|
580
|
+
void bloom_filter_alloc<A>::update(int32_t item) {
|
581
|
+
update(static_cast<int64_t>(item));
|
582
|
+
}
|
583
|
+
|
584
|
+
template<typename A>
|
585
|
+
void bloom_filter_alloc<A>::update(int16_t item) {
|
586
|
+
update(static_cast<int64_t>(item));
|
587
|
+
}
|
588
|
+
|
589
|
+
template<typename A>
|
590
|
+
void bloom_filter_alloc<A>::update(int8_t item) {
|
591
|
+
update(static_cast<int64_t>(item));
|
592
|
+
}
|
593
|
+
|
594
|
+
template<typename A>
|
595
|
+
void bloom_filter_alloc<A>::update(double item) {
|
596
|
+
union {
|
597
|
+
int64_t long_value;
|
598
|
+
double double_value;
|
599
|
+
} ldu;
|
600
|
+
ldu.double_value = static_cast<double>(item);
|
601
|
+
if (item == 0.0) {
|
602
|
+
ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
|
603
|
+
} else if (std::isnan(ldu.double_value)) {
|
604
|
+
ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
|
605
|
+
}
|
606
|
+
const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_);
|
607
|
+
const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0);
|
608
|
+
internal_update(h0, h1);
|
609
|
+
}
|
610
|
+
|
611
|
+
template<typename A>
|
612
|
+
void bloom_filter_alloc<A>::update(float item) {
|
613
|
+
update(static_cast<double>(item));
|
614
|
+
}
|
615
|
+
|
616
|
+
template<typename A>
|
617
|
+
void bloom_filter_alloc<A>::update(const void* item, size_t size) {
|
618
|
+
if (item == nullptr || size == 0) return;
|
619
|
+
const uint64_t h0 = XXHash64::hash(item, size, seed_);
|
620
|
+
const uint64_t h1 = XXHash64::hash(item, size, h0);
|
621
|
+
internal_update(h0, h1);
|
622
|
+
}
|
623
|
+
|
624
|
+
template<typename A>
|
625
|
+
void bloom_filter_alloc<A>::internal_update(uint64_t h0, uint64_t h1) {
|
626
|
+
if (is_read_only_) {
|
627
|
+
throw std::logic_error("Cannot update a read-only filter");
|
628
|
+
}
|
629
|
+
const uint64_t num_bits = get_capacity();
|
630
|
+
for (uint16_t i = 1; i <= num_hashes_; i++) {
|
631
|
+
const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits;
|
632
|
+
bit_array_ops::set_bit(bit_array_, hash_index);
|
633
|
+
}
|
634
|
+
is_dirty_ = true;
|
635
|
+
}
|
636
|
+
|
637
|
+
// QUERY-AND-UPDATE METHODS
|
638
|
+
|
639
|
+
template<typename A>
|
640
|
+
bool bloom_filter_alloc<A>::query_and_update(const std::string& item) {
|
641
|
+
if (item.empty()) return false;
|
642
|
+
const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_);
|
643
|
+
const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0);
|
644
|
+
return internal_query_and_update(h0, h1);
|
645
|
+
}
|
646
|
+
|
647
|
+
template<typename A>
|
648
|
+
bool bloom_filter_alloc<A>::query_and_update(uint64_t item) {
|
649
|
+
const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
|
650
|
+
const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
|
651
|
+
return internal_query_and_update(h0, h1);
|
652
|
+
}
|
653
|
+
|
654
|
+
template<typename A>
|
655
|
+
bool bloom_filter_alloc<A>::query_and_update(uint32_t item) {
|
656
|
+
return query_and_update(static_cast<uint64_t>(item));
|
657
|
+
}
|
658
|
+
|
659
|
+
template<typename A>
|
660
|
+
bool bloom_filter_alloc<A>::query_and_update(uint16_t item) {
|
661
|
+
return query_and_update(static_cast<uint64_t>(item));
|
662
|
+
}
|
663
|
+
|
664
|
+
template<typename A>
|
665
|
+
bool bloom_filter_alloc<A>::query_and_update(uint8_t item) {
|
666
|
+
return query_and_update(static_cast<uint64_t>(item));
|
667
|
+
}
|
668
|
+
|
669
|
+
template<typename A>
|
670
|
+
bool bloom_filter_alloc<A>::query_and_update(int64_t item) {
|
671
|
+
const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
|
672
|
+
const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
|
673
|
+
return internal_query_and_update(h0, h1);
|
674
|
+
}
|
675
|
+
|
676
|
+
template<typename A>
|
677
|
+
bool bloom_filter_alloc<A>::query_and_update(int32_t item) {
|
678
|
+
return query_and_update(static_cast<int64_t>(item));
|
679
|
+
}
|
680
|
+
|
681
|
+
template<typename A>
|
682
|
+
bool bloom_filter_alloc<A>::query_and_update(int16_t item) {
|
683
|
+
return query_and_update(static_cast<int64_t>(item));
|
684
|
+
}
|
685
|
+
|
686
|
+
template<typename A>
|
687
|
+
bool bloom_filter_alloc<A>::query_and_update(int8_t item) {
|
688
|
+
return query_and_update(static_cast<int64_t>(item));
|
689
|
+
}
|
690
|
+
|
691
|
+
template<typename A>
|
692
|
+
bool bloom_filter_alloc<A>::query_and_update(double item) {
|
693
|
+
union {
|
694
|
+
int64_t long_value;
|
695
|
+
double double_value;
|
696
|
+
} ldu;
|
697
|
+
ldu.double_value = item;
|
698
|
+
if (item == 0.0) {
|
699
|
+
ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
|
700
|
+
} else if (std::isnan(ldu.double_value)) {
|
701
|
+
ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
|
702
|
+
}
|
703
|
+
const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_);
|
704
|
+
const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0);
|
705
|
+
return internal_query_and_update(h0, h1);
|
706
|
+
}
|
707
|
+
|
708
|
+
template<typename A>
|
709
|
+
bool bloom_filter_alloc<A>::query_and_update(float item) {
|
710
|
+
return query_and_update(static_cast<double>(item));
|
711
|
+
}
|
712
|
+
|
713
|
+
template<typename A>
|
714
|
+
bool bloom_filter_alloc<A>::query_and_update(const void* item, size_t size) {
|
715
|
+
if (item == nullptr || size == 0) return false;
|
716
|
+
const uint64_t h0 = XXHash64::hash(item, size, seed_);
|
717
|
+
const uint64_t h1 = XXHash64::hash(item, size, h0);
|
718
|
+
return internal_query_and_update(h0, h1);
|
719
|
+
}
|
720
|
+
|
721
|
+
template<typename A>
|
722
|
+
bool bloom_filter_alloc<A>::internal_query_and_update(uint64_t h0, uint64_t h1) {
|
723
|
+
if (is_read_only_) {
|
724
|
+
throw std::logic_error("Cannot update a read-only filter");
|
725
|
+
}
|
726
|
+
const uint64_t num_bits = get_capacity();
|
727
|
+
bool value_exists = true;
|
728
|
+
for (uint16_t i = 1; i <= num_hashes_; i++) {
|
729
|
+
const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits;
|
730
|
+
bool value = bit_array_ops::get_and_set_bit(bit_array_, hash_index);
|
731
|
+
update_num_bits_set(num_bits_set_ + (value ? 0 : 1));
|
732
|
+
value_exists &= value;
|
733
|
+
}
|
734
|
+
return value_exists;
|
735
|
+
}
|
736
|
+
|
737
|
+
// QUERY METHODS
|
738
|
+
|
739
|
+
template<typename A>
|
740
|
+
bool bloom_filter_alloc<A>::query(const std::string& item) const {
|
741
|
+
if (item.empty()) return false;
|
742
|
+
const uint64_t h0 = XXHash64::hash(item.data(), item.size(), seed_);
|
743
|
+
const uint64_t h1 = XXHash64::hash(item.data(), item.size(), h0);
|
744
|
+
return internal_query(h0, h1);
|
745
|
+
}
|
746
|
+
|
747
|
+
template<typename A>
|
748
|
+
bool bloom_filter_alloc<A>::query(uint64_t item) const {
|
749
|
+
const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
|
750
|
+
const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
|
751
|
+
return internal_query(h0, h1);
|
752
|
+
}
|
753
|
+
|
754
|
+
template<typename A>
|
755
|
+
bool bloom_filter_alloc<A>::query(uint32_t item) const {
|
756
|
+
return query(static_cast<uint64_t>(item));
|
757
|
+
}
|
758
|
+
|
759
|
+
template<typename A>
|
760
|
+
bool bloom_filter_alloc<A>::query(uint16_t item) const {
|
761
|
+
return query(static_cast<uint64_t>(item));
|
762
|
+
}
|
763
|
+
|
764
|
+
template<typename A>
|
765
|
+
bool bloom_filter_alloc<A>::query(uint8_t item) const {
|
766
|
+
return query(static_cast<uint64_t>(item));
|
767
|
+
}
|
768
|
+
|
769
|
+
template<typename A>
|
770
|
+
bool bloom_filter_alloc<A>::query(int64_t item) const {
|
771
|
+
const uint64_t h0 = XXHash64::hash(&item, sizeof(item), seed_);
|
772
|
+
const uint64_t h1 = XXHash64::hash(&item, sizeof(item), h0);
|
773
|
+
return internal_query(h0, h1);
|
774
|
+
}
|
775
|
+
|
776
|
+
template<typename A>
|
777
|
+
bool bloom_filter_alloc<A>::query(int32_t item) const {
|
778
|
+
return query(static_cast<int64_t>(item));
|
779
|
+
}
|
780
|
+
|
781
|
+
template<typename A>
|
782
|
+
bool bloom_filter_alloc<A>::query(int16_t item) const {
|
783
|
+
return query(static_cast<int64_t>(item));
|
784
|
+
}
|
785
|
+
|
786
|
+
template<typename A>
|
787
|
+
bool bloom_filter_alloc<A>::query(int8_t item) const {
|
788
|
+
return query(static_cast<int64_t>(item));
|
789
|
+
}
|
790
|
+
|
791
|
+
template<typename A>
|
792
|
+
bool bloom_filter_alloc<A>::query(double item) const {
|
793
|
+
union {
|
794
|
+
int64_t long_value;
|
795
|
+
double double_value;
|
796
|
+
} ldu;
|
797
|
+
ldu.double_value = static_cast<double>(item);
|
798
|
+
if (item == 0.0) {
|
799
|
+
ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
|
800
|
+
} else if (std::isnan(ldu.double_value)) {
|
801
|
+
ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
|
802
|
+
}
|
803
|
+
const uint64_t h0 = XXHash64::hash(&ldu, sizeof(ldu), seed_);
|
804
|
+
const uint64_t h1 = XXHash64::hash(&ldu, sizeof(ldu), h0);
|
805
|
+
return internal_query(h0, h1);
|
806
|
+
}
|
807
|
+
|
808
|
+
template<typename A>
|
809
|
+
bool bloom_filter_alloc<A>::query(float item) const {
|
810
|
+
return query(static_cast<double>(item));
|
811
|
+
}
|
812
|
+
|
813
|
+
template<typename A>
|
814
|
+
bool bloom_filter_alloc<A>::query(const void* item, size_t size) const {
|
815
|
+
if (item == nullptr || size == 0) return false;
|
816
|
+
const uint64_t h0 = XXHash64::hash(item, size, seed_);
|
817
|
+
const uint64_t h1 = XXHash64::hash(item, size, h0);
|
818
|
+
return internal_query(h0, h1);
|
819
|
+
}
|
820
|
+
|
821
|
+
template<typename A>
|
822
|
+
bool bloom_filter_alloc<A>::internal_query(uint64_t h0, uint64_t h1) const {
|
823
|
+
if (is_empty()) return false;
|
824
|
+
const uint64_t num_bits = get_capacity();
|
825
|
+
for (uint16_t i = 1; i <= num_hashes_; i++) {
|
826
|
+
const uint64_t hash_index = ((h0 + i * h1) >> 1) % num_bits;
|
827
|
+
if (!bit_array_ops::get_bit(bit_array_, hash_index))
|
828
|
+
return false;
|
829
|
+
}
|
830
|
+
return true;
|
831
|
+
}
|
832
|
+
|
833
|
+
// OTHER METHODS
|
834
|
+
|
835
|
+
template<typename A>
|
836
|
+
bool bloom_filter_alloc<A>::is_compatible(const bloom_filter_alloc& other) const {
|
837
|
+
return seed_ == other.seed_
|
838
|
+
&& num_hashes_ == other.num_hashes_
|
839
|
+
&& get_capacity() == other.get_capacity()
|
840
|
+
;
|
841
|
+
}
|
842
|
+
|
843
|
+
template<typename A>
|
844
|
+
void bloom_filter_alloc<A>::union_with(const bloom_filter_alloc& other) {
|
845
|
+
if (!is_compatible(other)) {
|
846
|
+
throw std::invalid_argument("Incompatible bloom filters");
|
847
|
+
}
|
848
|
+
uint64_t bits_set = bit_array_ops::union_with(bit_array_, other.bit_array_, capacity_bits_ >> 3);
|
849
|
+
update_num_bits_set(bits_set);
|
850
|
+
}
|
851
|
+
|
852
|
+
template<typename A>
|
853
|
+
void bloom_filter_alloc<A>::intersect(const bloom_filter_alloc& other) {
|
854
|
+
if (!is_compatible(other)) {
|
855
|
+
throw std::invalid_argument("Incompatible bloom filters");
|
856
|
+
}
|
857
|
+
uint64_t bits_set = bit_array_ops::intersect(bit_array_, other.bit_array_, capacity_bits_ >> 3);
|
858
|
+
update_num_bits_set(bits_set);
|
859
|
+
}
|
860
|
+
|
861
|
+
template<typename A>
|
862
|
+
void bloom_filter_alloc<A>::invert() {
|
863
|
+
uint64_t bits_set = bit_array_ops::invert(bit_array_, capacity_bits_ >> 3);
|
864
|
+
update_num_bits_set(bits_set);
|
865
|
+
}
|
866
|
+
|
867
|
+
template<typename A>
|
868
|
+
string<A> bloom_filter_alloc<A>::to_string(bool print_filter) const {
|
869
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
870
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
871
|
+
std::ostringstream oss;
|
872
|
+
uint64_t num_bits_set = num_bits_set_;
|
873
|
+
if (is_dirty_) {
|
874
|
+
num_bits_set = bit_array_ops::count_num_bits_set(bit_array_, capacity_bits_ >> 3);
|
875
|
+
}
|
876
|
+
|
877
|
+
oss << "### Bloom Filter Summary:" << std::endl;
|
878
|
+
oss << " num_bits : " << get_capacity() << std::endl;
|
879
|
+
oss << " num_hashes : " << num_hashes_ << std::endl;
|
880
|
+
oss << " seed : " << seed_ << std::endl;
|
881
|
+
oss << " is_dirty : " << (is_dirty_ ? "true" : "false") << std::endl;
|
882
|
+
oss << " bits_used : " << num_bits_set << std::endl;
|
883
|
+
oss << " fill % : " << (num_bits_set * 100.0) / get_capacity() << std::endl;
|
884
|
+
oss << "### End filter summary" << std::endl;
|
885
|
+
|
886
|
+
if (print_filter) {
|
887
|
+
uint64_t num_blocks = capacity_bits_ >> 6; // groups of 64 bits
|
888
|
+
for (uint64_t i = 0; i < num_blocks; ++i) {
|
889
|
+
oss << i << ": ";
|
890
|
+
for (uint64_t j = 0; j < 8; ++j) { // bytes w/in a block
|
891
|
+
for (uint64_t b = 0; b < 8; ++b) { // bits w/in a byte
|
892
|
+
oss << ((bit_array_[i * 8 + j] & (1 << b)) ? "1" : "0");
|
893
|
+
}
|
894
|
+
oss << " ";
|
895
|
+
}
|
896
|
+
oss << std::endl;
|
897
|
+
}
|
898
|
+
oss << std::endl;
|
899
|
+
}
|
900
|
+
|
901
|
+
oss << std::endl;
|
902
|
+
return string<A>(oss.str(), allocator_);
|
903
|
+
}
|
904
|
+
|
905
|
+
|
906
|
+
} // namespace datasketches
|
907
|
+
|
908
|
+
#endif // _BLOOM_FILTER_IMPL_HPP_
|