datasketches 0.4.2 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/NOTICE +1 -1
- data/README.md +1 -1
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +3 -3
- data/vendor/datasketches-cpp/README.md +2 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
- data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
- data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
- data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +27 -9
@@ -0,0 +1,406 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
|
22
|
+
#include "bloom_filter.hpp"
|
23
|
+
|
24
|
+
#ifdef TEST_BINARY_INPUT_PATH
|
25
|
+
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
|
26
|
+
#else
|
27
|
+
static std::string testBinaryInputPath = "test/";
|
28
|
+
#endif
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") {
|
33
|
+
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument);
|
34
|
+
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1L << 60, 4), std::invalid_argument);
|
35
|
+
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument);
|
36
|
+
}
|
37
|
+
|
38
|
+
TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
|
39
|
+
uint64_t num_items = 4000;
|
40
|
+
double fpp = 0.01;
|
41
|
+
|
42
|
+
uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
|
43
|
+
uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
|
44
|
+
uint64_t seed = 89023;
|
45
|
+
|
46
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
|
47
|
+
uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64
|
48
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
49
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
50
|
+
REQUIRE(bf.get_seed() == seed);
|
51
|
+
REQUIRE(bf.is_empty());
|
52
|
+
|
53
|
+
// should match above
|
54
|
+
bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
|
55
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
56
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
57
|
+
REQUIRE(bf.get_seed() == seed);
|
58
|
+
REQUIRE(bf.is_empty());
|
59
|
+
|
60
|
+
// same for initializing memory in-place
|
61
|
+
size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
|
62
|
+
uint8_t* bytes = new uint8_t[serialized_size_bytes];
|
63
|
+
|
64
|
+
bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
|
65
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
66
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
67
|
+
REQUIRE(bf.get_seed() == seed);
|
68
|
+
REQUIRE(bf.is_empty());
|
69
|
+
|
70
|
+
bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
|
71
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
72
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
73
|
+
REQUIRE(bf.get_seed() == seed);
|
74
|
+
REQUIRE(bf.is_empty());
|
75
|
+
|
76
|
+
delete [] bytes;
|
77
|
+
}
|
78
|
+
|
79
|
+
TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
|
80
|
+
uint64_t num_items = 5000;
|
81
|
+
double fpp = 0.01;
|
82
|
+
uint64_t seed = 4897301548054ULL;
|
83
|
+
|
84
|
+
auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
|
85
|
+
REQUIRE(bf.is_empty());
|
86
|
+
REQUIRE(bf.get_bits_used() == 0);
|
87
|
+
|
88
|
+
for (uint64_t i = 0; i < num_items; ++i) {
|
89
|
+
bf.query_and_update(i);
|
90
|
+
}
|
91
|
+
|
92
|
+
REQUIRE(!bf.is_empty());
|
93
|
+
// filter is about 50% full at target capacity
|
94
|
+
// since seed is fixed we expect an exact value every time
|
95
|
+
// but leaving the approximate test in since that's more the "expectation"
|
96
|
+
REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
|
97
|
+
REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
|
98
|
+
|
99
|
+
uint32_t num_found = 0;
|
100
|
+
for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
|
101
|
+
if (bf.query(i)) {
|
102
|
+
++num_found;
|
103
|
+
}
|
104
|
+
}
|
105
|
+
// fpp is average with significant variance -- even at 12% it would fail occasionally
|
106
|
+
REQUIRE(num_found == 423);
|
107
|
+
//REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
|
108
|
+
auto bytes = bf.serialize();
|
109
|
+
|
110
|
+
// initialize in memory and run the same tests
|
111
|
+
// also checking against the results from the first part
|
112
|
+
uint8_t* bf_memory = new uint8_t[bytes.size()];
|
113
|
+
auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
|
114
|
+
REQUIRE(bf2.is_empty());
|
115
|
+
REQUIRE(bf2.get_bits_used() == 0);
|
116
|
+
|
117
|
+
for (uint64_t i = 0; i < num_items; ++i) {
|
118
|
+
bf2.query_and_update(i);
|
119
|
+
}
|
120
|
+
|
121
|
+
REQUIRE(!bf2.is_empty());
|
122
|
+
REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
|
123
|
+
|
124
|
+
uint32_t num_found2 = 0;
|
125
|
+
for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
|
126
|
+
if (bf2.query(i)) {
|
127
|
+
++num_found2;
|
128
|
+
}
|
129
|
+
}
|
130
|
+
REQUIRE(num_found == num_found2); // should exactly match above
|
131
|
+
auto bytes2 = bf2.serialize();
|
132
|
+
|
133
|
+
REQUIRE(bytes.size() == bytes2.size());
|
134
|
+
for (size_t i = 0; i < bytes.size(); ++i) {
|
135
|
+
REQUIRE(bytes[i] == bytes2[i]);
|
136
|
+
}
|
137
|
+
|
138
|
+
// check that raw memory also matches serialized sketch
|
139
|
+
const uint8_t* bf_bytes = bf2.get_wrapped_memory();
|
140
|
+
REQUIRE(bf_bytes == bf_memory);
|
141
|
+
for (size_t i = 0; i < bytes.size(); ++i) {
|
142
|
+
REQUIRE(bf_bytes[i] == bytes[i]);
|
143
|
+
}
|
144
|
+
|
145
|
+
// ensure the filters reset properly
|
146
|
+
bf.reset();
|
147
|
+
REQUIRE(bf.is_empty());
|
148
|
+
REQUIRE(bf.get_bits_used() == 0);
|
149
|
+
|
150
|
+
bf2.reset();
|
151
|
+
REQUIRE(bf2.is_empty());
|
152
|
+
REQUIRE(bf2.get_bits_used() == 0);
|
153
|
+
|
154
|
+
delete [] bf_memory;
|
155
|
+
}
|
156
|
+
|
157
|
+
TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
|
158
|
+
uint64_t num_bits = 8192;
|
159
|
+
uint16_t num_hashes = 3;
|
160
|
+
|
161
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
162
|
+
|
163
|
+
uint64_t n = 500;
|
164
|
+
for (uint64_t i = 0; i < n; ++i) {
|
165
|
+
bf.update(i);
|
166
|
+
}
|
167
|
+
uint64_t num_bits_set = bf.get_bits_used();
|
168
|
+
bf.invert();
|
169
|
+
REQUIRE(bf.get_bits_used() == num_bits - num_bits_set);
|
170
|
+
|
171
|
+
// original items should be mostly not-present
|
172
|
+
uint32_t num_found = 0;
|
173
|
+
for (uint64_t i = 0; i < n; ++i) {
|
174
|
+
if (bf.query(i)) {
|
175
|
+
++num_found;
|
176
|
+
}
|
177
|
+
}
|
178
|
+
REQUIRE(num_found < n / 10);
|
179
|
+
|
180
|
+
// many other items should be "present"
|
181
|
+
num_found = 0;
|
182
|
+
for (uint64_t i = n; i < num_bits; ++i) {
|
183
|
+
if (bf.query(i)) {
|
184
|
+
++num_found;
|
185
|
+
}
|
186
|
+
}
|
187
|
+
REQUIRE(num_found > n);
|
188
|
+
}
|
189
|
+
|
190
|
+
TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") {
|
191
|
+
uint64_t num_bits = 32768;
|
192
|
+
uint16_t num_hashes = 4;
|
193
|
+
|
194
|
+
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
195
|
+
|
196
|
+
// mismatched num bits
|
197
|
+
auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes);
|
198
|
+
REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);
|
199
|
+
|
200
|
+
// mismatched num hashes
|
201
|
+
auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes);
|
202
|
+
REQUIRE_THROWS_AS(bf1.intersect(bf2), std::invalid_argument);
|
203
|
+
|
204
|
+
// mismatched seed
|
205
|
+
auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
|
206
|
+
REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
|
207
|
+
}
|
208
|
+
|
209
|
+
TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
|
210
|
+
const uint64_t num_bits = 12288;
|
211
|
+
const uint16_t num_hashes = 4;
|
212
|
+
|
213
|
+
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
214
|
+
auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
|
215
|
+
|
216
|
+
const uint64_t n = 1000;
|
217
|
+
const uint32_t max_item = 3 * n / 2 - 1;
|
218
|
+
for (uint64_t i = 0; i < n; ++i) {
|
219
|
+
bf1.query_and_update(i);
|
220
|
+
bf2.update(n / 2 + i);
|
221
|
+
}
|
222
|
+
|
223
|
+
bf1.union_with(bf2);
|
224
|
+
for (uint64_t i = 0; i < max_item; ++i) {
|
225
|
+
REQUIRE(bf1.query(i));
|
226
|
+
}
|
227
|
+
|
228
|
+
uint32_t num_found = 0;
|
229
|
+
for (uint64_t i = max_item; i < num_bits; ++i) {
|
230
|
+
if (bf1.query(i)) {
|
231
|
+
++num_found;
|
232
|
+
}
|
233
|
+
}
|
234
|
+
REQUIRE(num_found < num_bits / 10); // not being super strict
|
235
|
+
}
|
236
|
+
|
237
|
+
TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") {
|
238
|
+
const uint64_t num_bits = 8192;
|
239
|
+
const uint16_t num_hahes = 5;
|
240
|
+
|
241
|
+
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hahes);
|
242
|
+
auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hahes, bf1.get_seed());
|
243
|
+
|
244
|
+
const uint64_t n = 1024;
|
245
|
+
const uint32_t max_item = 3 * n / 2 - 1;
|
246
|
+
for (uint64_t i = 0; i < n; ++i) {
|
247
|
+
bf1.update(i);
|
248
|
+
bf2.update(n / 2 + i);
|
249
|
+
}
|
250
|
+
|
251
|
+
bf1.intersect(bf2);
|
252
|
+
// overlapping bits should all be set
|
253
|
+
for (uint64_t i = n / 2; i < n; ++i) {
|
254
|
+
REQUIRE(bf1.query(i));
|
255
|
+
}
|
256
|
+
|
257
|
+
uint32_t num_found = 0;
|
258
|
+
for (uint64_t i = 0; i < n / 2; ++i) {
|
259
|
+
if (bf1.query(i)) {
|
260
|
+
++num_found;
|
261
|
+
}
|
262
|
+
}
|
263
|
+
for (uint64_t i = max_item; i < num_bits; ++i) {
|
264
|
+
if (bf1.query(i)) {
|
265
|
+
++num_found;
|
266
|
+
}
|
267
|
+
}
|
268
|
+
|
269
|
+
REQUIRE(num_found < num_bits / 10); // not being super strict
|
270
|
+
}
|
271
|
+
|
272
|
+
TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
|
273
|
+
const uint64_t num_bits = 32769;
|
274
|
+
const uint16_t num_hashes = 7;
|
275
|
+
|
276
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
277
|
+
auto bytes = bf.serialize();
|
278
|
+
REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
|
279
|
+
|
280
|
+
auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
|
281
|
+
REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
|
282
|
+
REQUIRE(bf.get_seed() == bf_bytes.get_seed());
|
283
|
+
REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
|
284
|
+
REQUIRE(bf_bytes.is_empty());
|
285
|
+
|
286
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
287
|
+
bf.serialize(ss);
|
288
|
+
auto bf_stream = bloom_filter::deserialize(ss);
|
289
|
+
REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
|
290
|
+
REQUIRE(bf.get_seed() == bf_stream.get_seed());
|
291
|
+
REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
|
292
|
+
REQUIRE(bf_stream.is_empty());
|
293
|
+
|
294
|
+
// read-only wrap should work
|
295
|
+
auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
|
296
|
+
REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
|
297
|
+
REQUIRE(bf.get_seed() == bf_wrap.get_seed());
|
298
|
+
REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
|
299
|
+
REQUIRE(bf_wrap.is_empty());
|
300
|
+
|
301
|
+
// writable wrap should not
|
302
|
+
REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument);
|
303
|
+
}
|
304
|
+
|
305
|
+
TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
|
306
|
+
const uint64_t num_bits = 32768;
|
307
|
+
const uint16_t num_hashes = 5;
|
308
|
+
|
309
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
310
|
+
const uint64_t n = 1000;
|
311
|
+
for (uint64_t i = 0; i < n; ++i) {
|
312
|
+
bf.update(0.5 + i); // testing floats
|
313
|
+
}
|
314
|
+
|
315
|
+
// test more items without updating, assuming some false positives
|
316
|
+
// so we can check that we get the same number of false positives
|
317
|
+
// with the same query items
|
318
|
+
uint64_t fp_count = 0;
|
319
|
+
for (uint64_t i = n; i < num_bits; ++i) {
|
320
|
+
fp_count += bf.query(0.5 + i) ? 1 : 0;
|
321
|
+
}
|
322
|
+
|
323
|
+
auto bytes = bf.serialize();
|
324
|
+
REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
|
325
|
+
|
326
|
+
auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
|
327
|
+
REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
|
328
|
+
REQUIRE(bf.get_seed() == bf_bytes.get_seed());
|
329
|
+
REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
|
330
|
+
REQUIRE(!bf_bytes.is_empty());
|
331
|
+
REQUIRE(bf.is_memory_owned());
|
332
|
+
uint64_t fp_count_bytes = 0;
|
333
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
334
|
+
bool val = bf_bytes.query(0.5 + i);
|
335
|
+
if (i < n)
|
336
|
+
REQUIRE(val);
|
337
|
+
else if (val)
|
338
|
+
++fp_count_bytes;
|
339
|
+
}
|
340
|
+
REQUIRE(fp_count_bytes == fp_count);
|
341
|
+
|
342
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
343
|
+
bf.serialize(ss);
|
344
|
+
auto bf_stream = bloom_filter::deserialize(ss);
|
345
|
+
REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
|
346
|
+
REQUIRE(bf.get_seed() == bf_stream.get_seed());
|
347
|
+
REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
|
348
|
+
REQUIRE(!bf_stream.is_empty());
|
349
|
+
REQUIRE(bf_stream.is_memory_owned());
|
350
|
+
uint64_t fp_count_stream = 0;
|
351
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
352
|
+
bool val = bf_stream.query(0.5 + i);
|
353
|
+
if (i < n)
|
354
|
+
REQUIRE(val);
|
355
|
+
else if (val)
|
356
|
+
++fp_count_stream;
|
357
|
+
}
|
358
|
+
REQUIRE(fp_count_stream == fp_count);
|
359
|
+
|
360
|
+
// read-only wrap
|
361
|
+
auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
|
362
|
+
REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
|
363
|
+
REQUIRE(bf.get_seed() == bf_wrap.get_seed());
|
364
|
+
REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
|
365
|
+
REQUIRE(!bf_wrap.is_empty());
|
366
|
+
REQUIRE(!bf_wrap.is_memory_owned());
|
367
|
+
uint64_t fp_count_wrap = 0;
|
368
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
369
|
+
bool val = bf_wrap.query(0.5 + i);
|
370
|
+
if (i < n)
|
371
|
+
REQUIRE(val);
|
372
|
+
else if (val)
|
373
|
+
++fp_count_wrap;
|
374
|
+
}
|
375
|
+
REQUIRE(fp_count_wrap == fp_count);
|
376
|
+
REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
|
377
|
+
REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
|
378
|
+
REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);
|
379
|
+
|
380
|
+
// writable wrap
|
381
|
+
auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
|
382
|
+
REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
|
383
|
+
REQUIRE(bf.get_seed() == bf_writable.get_seed());
|
384
|
+
REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
|
385
|
+
REQUIRE(!bf_writable.is_empty());
|
386
|
+
REQUIRE(!bf_writable.is_memory_owned());
|
387
|
+
uint64_t fp_count_writable = 0;
|
388
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
389
|
+
bool val = bf_writable.query(0.5 + i);
|
390
|
+
if (i < n)
|
391
|
+
REQUIRE(val);
|
392
|
+
else if (val)
|
393
|
+
++fp_count_writable;
|
394
|
+
}
|
395
|
+
REQUIRE(fp_count_writable == fp_count);
|
396
|
+
|
397
|
+
REQUIRE(!bf_writable.query(-1.0));
|
398
|
+
bf_writable.update(-1.0);
|
399
|
+
REQUIRE(bf_writable.query(-1.0));
|
400
|
+
|
401
|
+
// not good memory management to do this, but because we wrapped the same bytes as both
|
402
|
+
// read-only and writable, that update should have changed the read-only version, too
|
403
|
+
REQUIRE(bf_wrap.query(-1.0));
|
404
|
+
}
|
405
|
+
|
406
|
+
} // namespace datasketches
|
@@ -31,11 +31,14 @@ using alloc = test_allocator<test_type>;
|
|
31
31
|
|
32
32
|
TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
33
33
|
|
34
|
-
// setup section
|
35
34
|
test_allocator_total_bytes = 0;
|
35
|
+
test_allocator_net_allocations = 0;
|
36
36
|
|
37
37
|
SECTION("compact level zero") {
|
38
38
|
kll_test_type_sketch sketch(8, test_type_less(), 0);
|
39
|
+
REQUIRE(test_allocator_total_bytes != 0);
|
40
|
+
REQUIRE(test_allocator_net_allocations != 0);
|
41
|
+
|
39
42
|
REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);
|
40
43
|
REQUIRE_THROWS_AS(sketch.get_min_item(), std::runtime_error);
|
41
44
|
REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
|
@@ -146,10 +149,8 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
|
|
146
149
|
REQUIRE(sketch2.get_n() == 11);
|
147
150
|
}
|
148
151
|
|
149
|
-
|
150
|
-
|
151
|
-
REQUIRE(test_allocator_total_bytes == 0);
|
152
|
-
}
|
152
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
153
|
+
REQUIRE(test_allocator_net_allocations == 0);
|
153
154
|
}
|
154
155
|
|
155
156
|
} /* namespace datasketches */
|
@@ -37,14 +37,14 @@ class ebpps_sample {
|
|
37
37
|
public:
|
38
38
|
explicit ebpps_sample(uint32_t k, const A& allocator = A());
|
39
39
|
|
40
|
-
// constructor used to create a sample to merge one item
|
41
|
-
template<typename TT>
|
42
|
-
ebpps_sample(TT&& item, double theta, const A& allocator = A());
|
43
|
-
|
44
40
|
// for deserialization
|
45
41
|
class items_deleter;
|
46
42
|
ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator = A());
|
47
43
|
|
44
|
+
// used instead of having a single-item constructor for update/merge calls
|
45
|
+
template<typename TT>
|
46
|
+
void replace_content(TT&& item, double theta);
|
47
|
+
|
48
48
|
void reset();
|
49
49
|
void downsample(double theta);
|
50
50
|
|
@@ -41,22 +41,6 @@ ebpps_sample<T,A>::ebpps_sample(uint32_t reserved_size, const A& allocator) :
|
|
41
41
|
data_.reserve(reserved_size);
|
42
42
|
}
|
43
43
|
|
44
|
-
template<typename T, typename A>
|
45
|
-
template<typename TT>
|
46
|
-
ebpps_sample<T,A>::ebpps_sample(TT&& item, double theta, const A& allocator) :
|
47
|
-
allocator_(allocator),
|
48
|
-
c_(theta),
|
49
|
-
partial_item_(),
|
50
|
-
data_(allocator)
|
51
|
-
{
|
52
|
-
if (theta == 1.0) {
|
53
|
-
data_.reserve(1);
|
54
|
-
data_.emplace_back(std::forward<TT>(item));
|
55
|
-
} else {
|
56
|
-
partial_item_.emplace(std::forward<TT>(item));
|
57
|
-
}
|
58
|
-
}
|
59
|
-
|
60
44
|
template<typename T, typename A>
|
61
45
|
ebpps_sample<T,A>::ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator) :
|
62
46
|
allocator_(allocator),
|
@@ -65,6 +49,19 @@ ebpps_sample<T,A>::ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_
|
|
65
49
|
data_(data, allocator)
|
66
50
|
{}
|
67
51
|
|
52
|
+
template<typename T, typename A>
|
53
|
+
template<typename TT>
|
54
|
+
void ebpps_sample<T,A>::replace_content(TT&& item, double theta) {
|
55
|
+
c_ = theta;
|
56
|
+
data_.clear();
|
57
|
+
partial_item_.reset();
|
58
|
+
if (theta == 1.0) {
|
59
|
+
data_.emplace_back(std::forward<TT>(item));
|
60
|
+
} else {
|
61
|
+
partial_item_.emplace(std::forward<TT>(item));
|
62
|
+
}
|
63
|
+
}
|
64
|
+
|
68
65
|
template<typename T, typename A>
|
69
66
|
auto ebpps_sample<T,A>::get_sample() const -> result_type {
|
70
67
|
double unused;
|
@@ -43,7 +43,7 @@ namespace ebpps_constants {
|
|
43
43
|
* From: "Exact PPS Sampling with Bounded Sample Size",
|
44
44
|
* B. Hentschel, P. J. Haas, Y. Tian. Information Processing Letters, 2023.
|
45
45
|
*
|
46
|
-
* This sketch samples data from a stream of items
|
46
|
+
* This sketch samples data from a stream of items proportional to the weight of each item.
|
47
47
|
* The sample guarantees the presence of an item in the result is proportional to that item's
|
48
48
|
* portion of the total weight seen by the sketch, and returns a sample no larger than size k.
|
49
49
|
*
|
@@ -256,6 +256,8 @@ class ebpps_sketch {
|
|
256
256
|
|
257
257
|
ebpps_sample<T,A> sample_; // Object holding the current state of the sample
|
258
258
|
|
259
|
+
ebpps_sample<T,A> tmp_; // Temporary sample of size 1 used in updates
|
260
|
+
|
259
261
|
// handles merge after ensuring other.cumulative_wt_ <= this->cumulative_wt_
|
260
262
|
// so we can send items in individually
|
261
263
|
template<typename O>
|
@@ -40,7 +40,8 @@ ebpps_sketch<T, A>::ebpps_sketch(uint32_t k, const A& allocator) :
|
|
40
40
|
cumulative_wt_(0.0),
|
41
41
|
wt_max_(0.0),
|
42
42
|
rho_(1.0),
|
43
|
-
sample_(check_k(k), allocator)
|
43
|
+
sample_(check_k(k), allocator),
|
44
|
+
tmp_(1, allocator)
|
44
45
|
{}
|
45
46
|
|
46
47
|
template<typename T, typename A>
|
@@ -53,7 +54,8 @@ ebpps_sketch<T,A>::ebpps_sketch(uint32_t k, uint64_t n, double cumulative_wt,
|
|
53
54
|
cumulative_wt_(cumulative_wt),
|
54
55
|
wt_max_(wt_max),
|
55
56
|
rho_(rho),
|
56
|
-
sample_(sample)
|
57
|
+
sample_(sample),
|
58
|
+
tmp_(1, allocator)
|
57
59
|
{}
|
58
60
|
|
59
61
|
template<typename T, typename A>
|
@@ -148,9 +150,8 @@ void ebpps_sketch<T, A>::internal_update(FwdItem&& item, double weight) {
|
|
148
150
|
if (cumulative_wt_ > 0.0)
|
149
151
|
sample_.downsample(new_rho / rho_);
|
150
152
|
|
151
|
-
|
152
|
-
|
153
|
-
sample_.merge(tmp);
|
153
|
+
tmp_.replace_content(conditional_forward<FwdItem>(item), new_rho * weight);
|
154
|
+
sample_.merge(tmp_);
|
154
155
|
|
155
156
|
cumulative_wt_ = new_cum_wt;
|
156
157
|
wt_max_ = new_wt_max;
|
@@ -240,9 +241,8 @@ void ebpps_sketch<T, A>::internal_merge(O&& sk) {
|
|
240
241
|
if (cumulative_wt_ > 0.0)
|
241
242
|
sample_.downsample(new_rho / rho_);
|
242
243
|
|
243
|
-
|
244
|
-
|
245
|
-
sample_.merge(tmp);
|
244
|
+
tmp_.replace_content(conditional_forward<O>(items[i]), new_rho * avg_wt);
|
245
|
+
sample_.merge(tmp_);
|
246
246
|
|
247
247
|
cumulative_wt_ = new_cum_wt;
|
248
248
|
rho_ = new_rho;
|
@@ -259,9 +259,8 @@ void ebpps_sketch<T, A>::internal_merge(O&& sk) {
|
|
259
259
|
if (cumulative_wt_ > 0.0)
|
260
260
|
sample_.downsample(new_rho / rho_);
|
261
261
|
|
262
|
-
|
263
|
-
|
264
|
-
sample_.merge(tmp);
|
262
|
+
tmp_.replace_content(conditional_forward<O>(other_sample.get_partial_item()), new_rho * other_c_frac * avg_wt);
|
263
|
+
sample_.merge(tmp_);
|
265
264
|
|
266
265
|
cumulative_wt_ = new_cum_wt;
|
267
266
|
rho_ = new_rho;
|
@@ -42,14 +42,15 @@ TEST_CASE("ebpps sample: basic initialization", "[ebpps_sketch]") {
|
|
42
42
|
|
43
43
|
TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
|
44
44
|
double theta = 1.0;
|
45
|
-
ebpps_sample<int> sample
|
45
|
+
ebpps_sample<int> sample(1);
|
46
|
+
sample.replace_content(-1, theta);
|
46
47
|
REQUIRE(sample.get_c() == theta);
|
47
48
|
REQUIRE(sample.get_num_retained_items() == 1);
|
48
49
|
REQUIRE(sample.get_sample().size() == 1);
|
49
50
|
REQUIRE(sample.has_partial_item() == false);
|
50
51
|
|
51
52
|
theta = 1e-300;
|
52
|
-
sample
|
53
|
+
sample.replace_content(-1, theta);
|
53
54
|
REQUIRE(sample.get_c() == theta);
|
54
55
|
REQUIRE(sample.get_num_retained_items() == 1);
|
55
56
|
REQUIRE(sample.get_sample().size() == 0); // assuming the random number is > 1e-300
|
@@ -57,7 +58,8 @@ TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
|
|
57
58
|
}
|
58
59
|
|
59
60
|
TEST_CASE("ebpps sample: downsampling", "[ebpps_sketch]") {
|
60
|
-
ebpps_sample<char> sample
|
61
|
+
ebpps_sample<char> sample(1);
|
62
|
+
sample.replace_content('a', 1.0);
|
61
63
|
|
62
64
|
sample.downsample(2.0); // no-op
|
63
65
|
REQUIRE(sample.get_c() == 1.0);
|
@@ -121,8 +123,9 @@ TEST_CASE("ebpps sample: merge unit samples", "[ebpps_sketch]") {
|
|
121
123
|
uint32_t k = 8;
|
122
124
|
ebpps_sample<int> sample = ebpps_sample<int>(k);
|
123
125
|
|
126
|
+
ebpps_sample<int> s(1);
|
124
127
|
for (uint32_t i = 1; i <= k; ++i) {
|
125
|
-
|
128
|
+
s.replace_content(i, 1.0);
|
126
129
|
sample.merge(s);
|
127
130
|
REQUIRE(sample.get_c() == static_cast<double>(i));
|
128
131
|
REQUIRE(sample.get_num_retained_items() == i);
|