datasketches 0.4.2 → 0.4.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +3 -3
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +27 -9
@@ -0,0 +1,406 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
|
22
|
+
#include "bloom_filter.hpp"
|
23
|
+
|
24
|
+
#ifdef TEST_BINARY_INPUT_PATH
|
25
|
+
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
|
26
|
+
#else
|
27
|
+
static std::string testBinaryInputPath = "test/";
|
28
|
+
#endif
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
// Checks that the size-based builder rejects sizing arguments the test expects to be invalid.
TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") {
  // a filter with zero bits
  REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument);

  // a filter with 2^60 bits; the literal must be 64-bit because `long` is only
  // 32 bits wide on LLP64 platforms, where `1L << 60` would be undefined behavior
  REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1ULL << 60, 4), std::invalid_argument);

  // a filter with zero hash functions
  REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument);
}
|
37
|
+
|
38
|
+
TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
|
39
|
+
uint64_t num_items = 4000;
|
40
|
+
double fpp = 0.01;
|
41
|
+
|
42
|
+
uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
|
43
|
+
uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
|
44
|
+
uint64_t seed = 89023;
|
45
|
+
|
46
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
|
47
|
+
uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64
|
48
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
49
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
50
|
+
REQUIRE(bf.get_seed() == seed);
|
51
|
+
REQUIRE(bf.is_empty());
|
52
|
+
|
53
|
+
// should match above
|
54
|
+
bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
|
55
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
56
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
57
|
+
REQUIRE(bf.get_seed() == seed);
|
58
|
+
REQUIRE(bf.is_empty());
|
59
|
+
|
60
|
+
// same for initializing memory in-place
|
61
|
+
size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
|
62
|
+
uint8_t* bytes = new uint8_t[serialized_size_bytes];
|
63
|
+
|
64
|
+
bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
|
65
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
66
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
67
|
+
REQUIRE(bf.get_seed() == seed);
|
68
|
+
REQUIRE(bf.is_empty());
|
69
|
+
|
70
|
+
bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
|
71
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
72
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
73
|
+
REQUIRE(bf.get_seed() == seed);
|
74
|
+
REQUIRE(bf.is_empty());
|
75
|
+
|
76
|
+
delete [] bytes;
|
77
|
+
}
|
78
|
+
|
79
|
+
TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
|
80
|
+
uint64_t num_items = 5000;
|
81
|
+
double fpp = 0.01;
|
82
|
+
uint64_t seed = 4897301548054ULL;
|
83
|
+
|
84
|
+
auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
|
85
|
+
REQUIRE(bf.is_empty());
|
86
|
+
REQUIRE(bf.get_bits_used() == 0);
|
87
|
+
|
88
|
+
for (uint64_t i = 0; i < num_items; ++i) {
|
89
|
+
bf.query_and_update(i);
|
90
|
+
}
|
91
|
+
|
92
|
+
REQUIRE(!bf.is_empty());
|
93
|
+
// filter is about 50% full at target capacity
|
94
|
+
// since seed is fixed we expect an exact value every time
|
95
|
+
// but leaving the approximate test in since that's more the "expectation"
|
96
|
+
REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
|
97
|
+
REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
|
98
|
+
|
99
|
+
uint32_t num_found = 0;
|
100
|
+
for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
|
101
|
+
if (bf.query(i)) {
|
102
|
+
++num_found;
|
103
|
+
}
|
104
|
+
}
|
105
|
+
// fpp is average with significant variance -- even at 12% it would fail occasionally
|
106
|
+
REQUIRE(num_found == 423);
|
107
|
+
//REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
|
108
|
+
auto bytes = bf.serialize();
|
109
|
+
|
110
|
+
// initialize in memory and run the same tests
|
111
|
+
// also checking against the results from the first part
|
112
|
+
uint8_t* bf_memory = new uint8_t[bytes.size()];
|
113
|
+
auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
|
114
|
+
REQUIRE(bf2.is_empty());
|
115
|
+
REQUIRE(bf2.get_bits_used() == 0);
|
116
|
+
|
117
|
+
for (uint64_t i = 0; i < num_items; ++i) {
|
118
|
+
bf2.query_and_update(i);
|
119
|
+
}
|
120
|
+
|
121
|
+
REQUIRE(!bf2.is_empty());
|
122
|
+
REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
|
123
|
+
|
124
|
+
uint32_t num_found2 = 0;
|
125
|
+
for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
|
126
|
+
if (bf2.query(i)) {
|
127
|
+
++num_found2;
|
128
|
+
}
|
129
|
+
}
|
130
|
+
REQUIRE(num_found == num_found2); // should exactly match above
|
131
|
+
auto bytes2 = bf2.serialize();
|
132
|
+
|
133
|
+
REQUIRE(bytes.size() == bytes2.size());
|
134
|
+
for (size_t i = 0; i < bytes.size(); ++i) {
|
135
|
+
REQUIRE(bytes[i] == bytes2[i]);
|
136
|
+
}
|
137
|
+
|
138
|
+
// check that raw memory also matches serialized sketch
|
139
|
+
const uint8_t* bf_bytes = bf2.get_wrapped_memory();
|
140
|
+
REQUIRE(bf_bytes == bf_memory);
|
141
|
+
for (size_t i = 0; i < bytes.size(); ++i) {
|
142
|
+
REQUIRE(bf_bytes[i] == bytes[i]);
|
143
|
+
}
|
144
|
+
|
145
|
+
// ensure the filters reset properly
|
146
|
+
bf.reset();
|
147
|
+
REQUIRE(bf.is_empty());
|
148
|
+
REQUIRE(bf.get_bits_used() == 0);
|
149
|
+
|
150
|
+
bf2.reset();
|
151
|
+
REQUIRE(bf2.is_empty());
|
152
|
+
REQUIRE(bf2.get_bits_used() == 0);
|
153
|
+
|
154
|
+
delete [] bf_memory;
|
155
|
+
}
|
156
|
+
|
157
|
+
TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
|
158
|
+
uint64_t num_bits = 8192;
|
159
|
+
uint16_t num_hashes = 3;
|
160
|
+
|
161
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
162
|
+
|
163
|
+
uint64_t n = 500;
|
164
|
+
for (uint64_t i = 0; i < n; ++i) {
|
165
|
+
bf.update(i);
|
166
|
+
}
|
167
|
+
uint64_t num_bits_set = bf.get_bits_used();
|
168
|
+
bf.invert();
|
169
|
+
REQUIRE(bf.get_bits_used() == num_bits - num_bits_set);
|
170
|
+
|
171
|
+
// original items should be mostly not-present
|
172
|
+
uint32_t num_found = 0;
|
173
|
+
for (uint64_t i = 0; i < n; ++i) {
|
174
|
+
if (bf.query(i)) {
|
175
|
+
++num_found;
|
176
|
+
}
|
177
|
+
}
|
178
|
+
REQUIRE(num_found < n / 10);
|
179
|
+
|
180
|
+
// many other items should be "present"
|
181
|
+
num_found = 0;
|
182
|
+
for (uint64_t i = n; i < num_bits; ++i) {
|
183
|
+
if (bf.query(i)) {
|
184
|
+
++num_found;
|
185
|
+
}
|
186
|
+
}
|
187
|
+
REQUIRE(num_found > n);
|
188
|
+
}
|
189
|
+
|
190
|
+
// Set operations are expected to throw std::invalid_argument when the other filter
// differs from the receiver in number of bits, number of hashes, or seed.
TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") {
  uint64_t num_bits = 32768;
  uint16_t num_hashes = 4;

  auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);

  // The seed is passed explicitly to bf2 and bf3 so that each filter differs from bf1
  // in exactly one property; otherwise a seed mismatch alone could satisfy the check.

  // mismatched num bits
  auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes, bf1.get_seed());
  REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);

  // mismatched num hashes
  // (uses bf3 -- bf2 has the same hash count as bf1 and would only repeat the size check)
  auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes, bf1.get_seed());
  REQUIRE_THROWS_AS(bf1.intersect(bf3), std::invalid_argument);

  // mismatched seed
  auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
  REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
}
|
208
|
+
|
209
|
+
// Unions two filters holding overlapping item ranges and checks that every inserted
// item is reported present, while items never inserted are mostly reported absent.
TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
  const uint64_t num_bits = 12288;
  const uint16_t num_hashes = 4;

  auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
  auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());

  const uint64_t n = 1000;
  // largest item inserted into either filter: bf2 receives n/2 .. n/2 + n - 1
  const uint32_t max_item = 3 * n / 2 - 1;
  for (uint64_t i = 0; i < n; ++i) {
    bf1.query_and_update(i);
    bf2.update(n / 2 + i);
  }

  bf1.union_with(bf2);

  // every inserted item, including max_item itself, must be found
  for (uint64_t i = 0; i <= max_item; ++i) {
    REQUIRE(bf1.query(i));
  }

  // count hits among items that were never inserted, starting just past max_item
  uint32_t num_found = 0;
  for (uint64_t i = max_item + 1; i < num_bits; ++i) {
    if (bf1.query(i)) {
      ++num_found;
    }
  }
  REQUIRE(num_found < num_bits / 10); // not being super strict
}
|
236
|
+
|
237
|
+
// Intersects two filters holding overlapping item ranges and checks that the shared
// items are all reported present, while the remaining queried items are mostly absent.
TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") {
  const uint64_t num_bits = 8192;
  const uint16_t num_hashes = 5;

  auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
  auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());

  const uint64_t n = 1024;
  // largest item inserted into bf2, which receives n/2 .. n/2 + n - 1
  const uint32_t max_item = 3 * n / 2 - 1;
  for (uint64_t i = 0; i < n; ++i) {
    bf1.update(i);
    bf2.update(n / 2 + i);
  }

  bf1.intersect(bf2);

  // items inserted into both filters (n/2 .. n-1) should all be present
  for (uint64_t i = n / 2; i < n; ++i) {
    REQUIRE(bf1.query(i));
  }

  uint32_t num_found = 0;
  // items below n/2 were inserted into bf1 only
  for (uint64_t i = 0; i < n / 2; ++i) {
    if (bf1.query(i)) {
      ++num_found;
    }
  }
  // max_item was inserted into bf2 only; everything above it went into neither filter
  for (uint64_t i = max_item; i < num_bits; ++i) {
    if (bf1.query(i)) {
      ++num_found;
    }
  }

  REQUIRE(num_found < num_bits / 10); // not being super strict
}
|
271
|
+
|
272
|
+
TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
|
273
|
+
const uint64_t num_bits = 32769;
|
274
|
+
const uint16_t num_hashes = 7;
|
275
|
+
|
276
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
277
|
+
auto bytes = bf.serialize();
|
278
|
+
REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
|
279
|
+
|
280
|
+
auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
|
281
|
+
REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
|
282
|
+
REQUIRE(bf.get_seed() == bf_bytes.get_seed());
|
283
|
+
REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
|
284
|
+
REQUIRE(bf_bytes.is_empty());
|
285
|
+
|
286
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
287
|
+
bf.serialize(ss);
|
288
|
+
auto bf_stream = bloom_filter::deserialize(ss);
|
289
|
+
REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
|
290
|
+
REQUIRE(bf.get_seed() == bf_stream.get_seed());
|
291
|
+
REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
|
292
|
+
REQUIRE(bf_stream.is_empty());
|
293
|
+
|
294
|
+
// read-only wrap should work
|
295
|
+
auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
|
296
|
+
REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
|
297
|
+
REQUIRE(bf.get_seed() == bf_wrap.get_seed());
|
298
|
+
REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
|
299
|
+
REQUIRE(bf_wrap.is_empty());
|
300
|
+
|
301
|
+
// writable wrap should not
|
302
|
+
REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument);
|
303
|
+
}
|
304
|
+
|
305
|
+
// Serializes a populated filter and checks that four ways of reading it back
// (deserialize from bytes, deserialize from a stream, read-only wrap, writable wrap)
// agree with the original on configuration, on the inserted items, and on the
// number of false positives over the same set of query items.
TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
  const uint64_t num_bits = 32768;
  const uint16_t num_hashes = 5;

  auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
  const uint64_t n = 1000;
  for (uint64_t i = 0; i < n; ++i) {
    bf.update(0.5 + i); // testing floats
  }

  // test more items without updating, assuming some false positives
  // so we can check that we get the same number of false positives
  // with the same query items
  uint64_t fp_count = 0;
  for (uint64_t i = n; i < num_bits; ++i) {
    fp_count += bf.query(0.5 + i) ? 1 : 0;
  }

  auto bytes = bf.serialize();
  REQUIRE(bytes.size() == bf.get_serialized_size_bytes());

  // deserialize from bytes
  auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
  REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
  REQUIRE(bf.get_seed() == bf_bytes.get_seed());
  REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
  REQUIRE(!bf_bytes.is_empty());
  // check the deserialized filter here, mirroring the bf_stream check below
  REQUIRE(bf_bytes.is_memory_owned());
  uint64_t fp_count_bytes = 0;
  for (uint64_t i = 0; i < num_bits; ++i) {
    bool val = bf_bytes.query(0.5 + i);
    if (i < n)
      REQUIRE(val);
    else if (val)
      ++fp_count_bytes;
  }
  REQUIRE(fp_count_bytes == fp_count);

  // deserialize from a stream
  std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
  bf.serialize(ss);
  auto bf_stream = bloom_filter::deserialize(ss);
  REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
  REQUIRE(bf.get_seed() == bf_stream.get_seed());
  REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
  REQUIRE(!bf_stream.is_empty());
  REQUIRE(bf_stream.is_memory_owned());
  uint64_t fp_count_stream = 0;
  for (uint64_t i = 0; i < num_bits; ++i) {
    bool val = bf_stream.query(0.5 + i);
    if (i < n)
      REQUIRE(val);
    else if (val)
      ++fp_count_stream;
  }
  REQUIRE(fp_count_stream == fp_count);

  // read-only wrap
  auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
  REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
  REQUIRE(bf.get_seed() == bf_wrap.get_seed());
  REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
  REQUIRE(!bf_wrap.is_empty());
  REQUIRE(!bf_wrap.is_memory_owned());
  uint64_t fp_count_wrap = 0;
  for (uint64_t i = 0; i < num_bits; ++i) {
    bool val = bf_wrap.query(0.5 + i);
    if (i < n)
      REQUIRE(val);
    else if (val)
      ++fp_count_wrap;
  }
  REQUIRE(fp_count_wrap == fp_count);
  // mutating calls on the read-only wrap are expected to throw
  REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
  REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
  REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);

  // writable wrap
  auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
  REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
  REQUIRE(bf.get_seed() == bf_writable.get_seed());
  REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
  REQUIRE(!bf_writable.is_empty());
  REQUIRE(!bf_writable.is_memory_owned());
  uint64_t fp_count_writable = 0;
  for (uint64_t i = 0; i < num_bits; ++i) {
    bool val = bf_writable.query(0.5 + i);
    if (i < n)
      REQUIRE(val);
    else if (val)
      ++fp_count_writable;
  }
  REQUIRE(fp_count_writable == fp_count);

  REQUIRE(!bf_writable.query(-1.0));
  bf_writable.update(-1.0);
  REQUIRE(bf_writable.query(-1.0));

  // not good memory management to do this, but because we wrapped the same bytes as both
  // read-only and writable, that update should have changed the read-only version, too
  REQUIRE(bf_wrap.query(-1.0));
}
|
405
|
+
|
406
|
+
} // namespace datasketches
|
@@ -31,11 +31,14 @@ using alloc = test_allocator<test_type>;
|
|
31
31
|
|
32
32
|
TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
33
33
|
|
34
|
-
// setup section
|
35
34
|
test_allocator_total_bytes = 0;
|
35
|
+
test_allocator_net_allocations = 0;
|
36
36
|
|
37
37
|
SECTION("compact level zero") {
|
38
38
|
kll_test_type_sketch sketch(8, test_type_less(), 0);
|
39
|
+
REQUIRE(test_allocator_total_bytes != 0);
|
40
|
+
REQUIRE(test_allocator_net_allocations != 0);
|
41
|
+
|
39
42
|
REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);
|
40
43
|
REQUIRE_THROWS_AS(sketch.get_min_item(), std::runtime_error);
|
41
44
|
REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
|
@@ -146,10 +149,8 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
146
149
|
REQUIRE(sketch2.get_n() == 11);
|
147
150
|
}
|
148
151
|
|
149
|
-
|
150
|
-
|
151
|
-
REQUIRE(test_allocator_total_bytes == 0);
|
152
|
-
}
|
152
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
153
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
153
154
|
}
|
154
155
|
|
155
156
|
} /* namespace datasketches */
|
@@ -37,14 +37,14 @@ class ebpps_sample {
|
|
37
37
|
public:
|
38
38
|
explicit ebpps_sample(uint32_t k, const A& allocator = A());
|
39
39
|
|
40
|
-
// constructor used to create a sample to merge one itme
|
41
|
-
template<typename TT>
|
42
|
-
ebpps_sample(TT&& item, double theta, const A& allocator = A());
|
43
|
-
|
44
40
|
// for deserialization
|
45
41
|
class items_deleter;
|
46
42
|
ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator = A());
|
47
43
|
|
44
|
+
// used instead of having a single-item constructor for update/merge calls
|
45
|
+
template<typename TT>
|
46
|
+
void replace_content(TT&& item, double theta);
|
47
|
+
|
48
48
|
void reset();
|
49
49
|
void downsample(double theta);
|
50
50
|
|
@@ -41,22 +41,6 @@ ebpps_sample<T,A>::ebpps_sample(uint32_t reserved_size, const A& allocator) :
|
|
41
41
|
data_.reserve(reserved_size);
|
42
42
|
}
|
43
43
|
|
44
|
-
template<typename T, typename A>
|
45
|
-
template<typename TT>
|
46
|
-
ebpps_sample<T,A>::ebpps_sample(TT&& item, double theta, const A& allocator) :
|
47
|
-
allocator_(allocator),
|
48
|
-
c_(theta),
|
49
|
-
partial_item_(),
|
50
|
-
data_(allocator)
|
51
|
-
{
|
52
|
-
if (theta == 1.0) {
|
53
|
-
data_.reserve(1);
|
54
|
-
data_.emplace_back(std::forward<TT>(item));
|
55
|
-
} else {
|
56
|
-
partial_item_.emplace(std::forward<TT>(item));
|
57
|
-
}
|
58
|
-
}
|
59
|
-
|
60
44
|
template<typename T, typename A>
|
61
45
|
ebpps_sample<T,A>::ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator) :
|
62
46
|
allocator_(allocator),
|
@@ -65,6 +49,19 @@ ebpps_sample<T,A>::ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_
|
|
65
49
|
data_(data, allocator)
|
66
50
|
{}
|
67
51
|
|
52
|
+
template<typename T, typename A>
|
53
|
+
template<typename TT>
|
54
|
+
void ebpps_sample<T,A>::replace_content(TT&& item, double theta) {
|
55
|
+
c_ = theta;
|
56
|
+
data_.clear();
|
57
|
+
partial_item_.reset();
|
58
|
+
if (theta == 1.0) {
|
59
|
+
data_.emplace_back(std::forward<TT>(item));
|
60
|
+
} else {
|
61
|
+
partial_item_.emplace(std::forward<TT>(item));
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
68
65
|
template<typename T, typename A>
|
69
66
|
auto ebpps_sample<T,A>::get_sample() const -> result_type {
|
70
67
|
double unused;
|
@@ -43,7 +43,7 @@ namespace ebpps_constants {
|
|
43
43
|
* From: "Exact PPS Sampling with Bounded Sample Size",
|
44
44
|
* B. Hentschel, P. J. Haas, Y. Tian. Information Processing Letters, 2023.
|
45
45
|
*
|
46
|
-
* This sketch samples data from a stream of items
|
46
|
+
* This sketch samples data from a stream of items proportional to the weight of each item.
|
47
47
|
* The sample guarantees the presence of an item in the result is proportional to that item's
|
48
48
|
* portion of the total weight seen by the sketch, and returns a sample no larger than size k.
|
49
49
|
*
|
@@ -256,6 +256,8 @@ class ebpps_sketch {
|
|
256
256
|
|
257
257
|
ebpps_sample<T,A> sample_; // Object holding the current state of the sample
|
258
258
|
|
259
|
+
ebpps_sample<T,A> tmp_; // Temporary sample of size 1 used in updates
|
260
|
+
|
259
261
|
// handles merge after ensuring other.cumulative_wt_ <= this->cumulative_wt_
|
260
262
|
// so we can send items in individually
|
261
263
|
template<typename O>
|
@@ -40,7 +40,8 @@ ebpps_sketch<T, A>::ebpps_sketch(uint32_t k, const A& allocator) :
|
|
40
40
|
cumulative_wt_(0.0),
|
41
41
|
wt_max_(0.0),
|
42
42
|
rho_(1.0),
|
43
|
-
sample_(check_k(k), allocator)
|
43
|
+
sample_(check_k(k), allocator),
|
44
|
+
tmp_(1, allocator)
|
44
45
|
{}
|
45
46
|
|
46
47
|
template<typename T, typename A>
|
@@ -53,7 +54,8 @@ ebpps_sketch<T,A>::ebpps_sketch(uint32_t k, uint64_t n, double cumulative_wt,
|
|
53
54
|
cumulative_wt_(cumulative_wt),
|
54
55
|
wt_max_(wt_max),
|
55
56
|
rho_(rho),
|
56
|
-
sample_(sample)
|
57
|
+
sample_(sample),
|
58
|
+
tmp_(1, allocator)
|
57
59
|
{}
|
58
60
|
|
59
61
|
template<typename T, typename A>
|
@@ -148,9 +150,8 @@ void ebpps_sketch<T, A>::internal_update(FwdItem&& item, double weight) {
|
|
148
150
|
if (cumulative_wt_ > 0.0)
|
149
151
|
sample_.downsample(new_rho / rho_);
|
150
152
|
|
151
|
-
|
152
|
-
|
153
|
-
sample_.merge(tmp);
|
153
|
+
tmp_.replace_content(conditional_forward<FwdItem>(item), new_rho * weight);
|
154
|
+
sample_.merge(tmp_);
|
154
155
|
|
155
156
|
cumulative_wt_ = new_cum_wt;
|
156
157
|
wt_max_ = new_wt_max;
|
@@ -240,9 +241,8 @@ void ebpps_sketch<T, A>::internal_merge(O&& sk) {
|
|
240
241
|
if (cumulative_wt_ > 0.0)
|
241
242
|
sample_.downsample(new_rho / rho_);
|
242
243
|
|
243
|
-
|
244
|
-
|
245
|
-
sample_.merge(tmp);
|
244
|
+
tmp_.replace_content(conditional_forward<O>(items[i]), new_rho * avg_wt);
|
245
|
+
sample_.merge(tmp_);
|
246
246
|
|
247
247
|
cumulative_wt_ = new_cum_wt;
|
248
248
|
rho_ = new_rho;
|
@@ -259,9 +259,8 @@ void ebpps_sketch<T, A>::internal_merge(O&& sk) {
|
|
259
259
|
if (cumulative_wt_ > 0.0)
|
260
260
|
sample_.downsample(new_rho / rho_);
|
261
261
|
|
262
|
-
|
263
|
-
|
264
|
-
sample_.merge(tmp);
|
262
|
+
tmp_.replace_content(conditional_forward<O>(other_sample.get_partial_item()), new_rho * other_c_frac * avg_wt);
|
263
|
+
sample_.merge(tmp_);
|
265
264
|
|
266
265
|
cumulative_wt_ = new_cum_wt;
|
267
266
|
rho_ = new_rho;
|
@@ -42,14 +42,15 @@ TEST_CASE("ebpps sample: basic initialization", "[ebpps_sketch]") {
|
|
42
42
|
|
43
43
|
TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
|
44
44
|
double theta = 1.0;
|
45
|
-
ebpps_sample<int> sample
|
45
|
+
ebpps_sample<int> sample(1);
|
46
|
+
sample.replace_content(-1, theta);
|
46
47
|
REQUIRE(sample.get_c() == theta);
|
47
48
|
REQUIRE(sample.get_num_retained_items() == 1);
|
48
49
|
REQUIRE(sample.get_sample().size() == 1);
|
49
50
|
REQUIRE(sample.has_partial_item() == false);
|
50
51
|
|
51
52
|
theta = 1e-300;
|
52
|
-
sample
|
53
|
+
sample.replace_content(-1, theta);
|
53
54
|
REQUIRE(sample.get_c() == theta);
|
54
55
|
REQUIRE(sample.get_num_retained_items() == 1);
|
55
56
|
REQUIRE(sample.get_sample().size() == 0); // assuming the random number is > 1e-300
|
@@ -57,7 +58,8 @@ TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
|
|
57
58
|
}
|
58
59
|
|
59
60
|
TEST_CASE("ebpps sample: downsampling", "[ebpps_sketch]") {
|
60
|
-
ebpps_sample<char> sample
|
61
|
+
ebpps_sample<char> sample(1);
|
62
|
+
sample.replace_content('a', 1.0);
|
61
63
|
|
62
64
|
sample.downsample(2.0); // no-op
|
63
65
|
REQUIRE(sample.get_c() == 1.0);
|
@@ -121,8 +123,9 @@ TEST_CASE("ebpps sample: merge unit samples", "[ebpps_sketch]") {
|
|
121
123
|
uint32_t k = 8;
|
122
124
|
ebpps_sample<int> sample = ebpps_sample<int>(k);
|
123
125
|
|
126
|
+
ebpps_sample<int> s(1);
|
124
127
|
for (uint32_t i = 1; i <= k; ++i) {
|
125
|
-
|
128
|
+
s.replace_content(i, 1.0);
|
126
129
|
sample.merge(s);
|
127
130
|
REQUIRE(sample.get_c() == static_cast<double>(i));
|
128
131
|
REQUIRE(sample.get_num_retained_items() == i);
|