datasketches 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +18 -10
@@ -0,0 +1,406 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
|
22
|
+
#include "bloom_filter.hpp"
|
23
|
+
|
24
|
+
#ifdef TEST_BINARY_INPUT_PATH
|
25
|
+
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
|
26
|
+
#else
|
27
|
+
static std::string testBinaryInputPath = "test/";
|
28
|
+
#endif
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") {
|
33
|
+
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument);
|
34
|
+
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1L << 60, 4), std::invalid_argument);
|
35
|
+
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument);
|
36
|
+
}
|
37
|
+
|
38
|
+
TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
|
39
|
+
uint64_t num_items = 4000;
|
40
|
+
double fpp = 0.01;
|
41
|
+
|
42
|
+
uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
|
43
|
+
uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
|
44
|
+
uint64_t seed = 89023;
|
45
|
+
|
46
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
|
47
|
+
uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64
|
48
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
49
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
50
|
+
REQUIRE(bf.get_seed() == seed);
|
51
|
+
REQUIRE(bf.is_empty());
|
52
|
+
|
53
|
+
// should match above
|
54
|
+
bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
|
55
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
56
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
57
|
+
REQUIRE(bf.get_seed() == seed);
|
58
|
+
REQUIRE(bf.is_empty());
|
59
|
+
|
60
|
+
// same for initializing memory in-place
|
61
|
+
size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
|
62
|
+
uint8_t* bytes = new uint8_t[serialized_size_bytes];
|
63
|
+
|
64
|
+
bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
|
65
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
66
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
67
|
+
REQUIRE(bf.get_seed() == seed);
|
68
|
+
REQUIRE(bf.is_empty());
|
69
|
+
|
70
|
+
bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
|
71
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
72
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
73
|
+
REQUIRE(bf.get_seed() == seed);
|
74
|
+
REQUIRE(bf.is_empty());
|
75
|
+
|
76
|
+
delete [] bytes;
|
77
|
+
}
|
78
|
+
|
79
|
+
TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
|
80
|
+
uint64_t num_items = 5000;
|
81
|
+
double fpp = 0.01;
|
82
|
+
uint64_t seed = 4897301548054ULL;
|
83
|
+
|
84
|
+
auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
|
85
|
+
REQUIRE(bf.is_empty());
|
86
|
+
REQUIRE(bf.get_bits_used() == 0);
|
87
|
+
|
88
|
+
for (uint64_t i = 0; i < num_items; ++i) {
|
89
|
+
bf.query_and_update(i);
|
90
|
+
}
|
91
|
+
|
92
|
+
REQUIRE(!bf.is_empty());
|
93
|
+
// filter is about 50% full at target capacity
|
94
|
+
// since seed is fixed we expect an exact value every time
|
95
|
+
// but leaving the approximate test in since that's more the "expectation"
|
96
|
+
REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
|
97
|
+
REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
|
98
|
+
|
99
|
+
uint32_t num_found = 0;
|
100
|
+
for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
|
101
|
+
if (bf.query(i)) {
|
102
|
+
++num_found;
|
103
|
+
}
|
104
|
+
}
|
105
|
+
// fpp is average with significant variance -- even at 12% it would fail occasionally
|
106
|
+
REQUIRE(num_found == 423);
|
107
|
+
//REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
|
108
|
+
auto bytes = bf.serialize();
|
109
|
+
|
110
|
+
// initialize in memory and run the same tests
|
111
|
+
// also checking against the results from the first part
|
112
|
+
uint8_t* bf_memory = new uint8_t[bytes.size()];
|
113
|
+
auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
|
114
|
+
REQUIRE(bf2.is_empty());
|
115
|
+
REQUIRE(bf2.get_bits_used() == 0);
|
116
|
+
|
117
|
+
for (uint64_t i = 0; i < num_items; ++i) {
|
118
|
+
bf2.query_and_update(i);
|
119
|
+
}
|
120
|
+
|
121
|
+
REQUIRE(!bf2.is_empty());
|
122
|
+
REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
|
123
|
+
|
124
|
+
uint32_t num_found2 = 0;
|
125
|
+
for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
|
126
|
+
if (bf2.query(i)) {
|
127
|
+
++num_found2;
|
128
|
+
}
|
129
|
+
}
|
130
|
+
REQUIRE(num_found == num_found2); // should exactly match above
|
131
|
+
auto bytes2 = bf2.serialize();
|
132
|
+
|
133
|
+
REQUIRE(bytes.size() == bytes2.size());
|
134
|
+
for (size_t i = 0; i < bytes.size(); ++i) {
|
135
|
+
REQUIRE(bytes[i] == bytes2[i]);
|
136
|
+
}
|
137
|
+
|
138
|
+
// check that raw memory also matches serialized sketch
|
139
|
+
const uint8_t* bf_bytes = bf2.get_wrapped_memory();
|
140
|
+
REQUIRE(bf_bytes == bf_memory);
|
141
|
+
for (size_t i = 0; i < bytes.size(); ++i) {
|
142
|
+
REQUIRE(bf_bytes[i] == bytes[i]);
|
143
|
+
}
|
144
|
+
|
145
|
+
// ensure the filters reset properly
|
146
|
+
bf.reset();
|
147
|
+
REQUIRE(bf.is_empty());
|
148
|
+
REQUIRE(bf.get_bits_used() == 0);
|
149
|
+
|
150
|
+
bf2.reset();
|
151
|
+
REQUIRE(bf2.is_empty());
|
152
|
+
REQUIRE(bf2.get_bits_used() == 0);
|
153
|
+
|
154
|
+
delete [] bf_memory;
|
155
|
+
}
|
156
|
+
|
157
|
+
TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
|
158
|
+
uint64_t num_bits = 8192;
|
159
|
+
uint16_t num_hashes = 3;
|
160
|
+
|
161
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
162
|
+
|
163
|
+
uint64_t n = 500;
|
164
|
+
for (uint64_t i = 0; i < n; ++i) {
|
165
|
+
bf.update(i);
|
166
|
+
}
|
167
|
+
uint64_t num_bits_set = bf.get_bits_used();
|
168
|
+
bf.invert();
|
169
|
+
REQUIRE(bf.get_bits_used() == num_bits - num_bits_set);
|
170
|
+
|
171
|
+
// original items should be mostly not-present
|
172
|
+
uint32_t num_found = 0;
|
173
|
+
for (uint64_t i = 0; i < n; ++i) {
|
174
|
+
if (bf.query(i)) {
|
175
|
+
++num_found;
|
176
|
+
}
|
177
|
+
}
|
178
|
+
REQUIRE(num_found < n / 10);
|
179
|
+
|
180
|
+
// many other items should be "present"
|
181
|
+
num_found = 0;
|
182
|
+
for (uint64_t i = n; i < num_bits; ++i) {
|
183
|
+
if (bf.query(i)) {
|
184
|
+
++num_found;
|
185
|
+
}
|
186
|
+
}
|
187
|
+
REQUIRE(num_found > n);
|
188
|
+
}
|
189
|
+
|
190
|
+
TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") {
|
191
|
+
uint64_t num_bits = 32768;
|
192
|
+
uint16_t num_hashes = 4;
|
193
|
+
|
194
|
+
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
195
|
+
|
196
|
+
// mismatched num bits
|
197
|
+
auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes);
|
198
|
+
REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);
|
199
|
+
|
200
|
+
// mismatched num hashes
|
201
|
+
auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes);
|
202
|
+
REQUIRE_THROWS_AS(bf1.intersect(bf2), std::invalid_argument);
|
203
|
+
|
204
|
+
// mismatched seed
|
205
|
+
auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
|
206
|
+
REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
|
207
|
+
}
|
208
|
+
|
209
|
+
TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
|
210
|
+
const uint64_t num_bits = 12288;
|
211
|
+
const uint16_t num_hashes = 4;
|
212
|
+
|
213
|
+
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
214
|
+
auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
|
215
|
+
|
216
|
+
const uint64_t n = 1000;
|
217
|
+
const uint32_t max_item = 3 * n / 2 - 1;
|
218
|
+
for (uint64_t i = 0; i < n; ++i) {
|
219
|
+
bf1.query_and_update(i);
|
220
|
+
bf2.update(n / 2 + i);
|
221
|
+
}
|
222
|
+
|
223
|
+
bf1.union_with(bf2);
|
224
|
+
for (uint64_t i = 0; i < max_item; ++i) {
|
225
|
+
REQUIRE(bf1.query(i));
|
226
|
+
}
|
227
|
+
|
228
|
+
uint32_t num_found = 0;
|
229
|
+
for (uint64_t i = max_item; i < num_bits; ++i) {
|
230
|
+
if (bf1.query(i)) {
|
231
|
+
++num_found;
|
232
|
+
}
|
233
|
+
}
|
234
|
+
REQUIRE(num_found < num_bits / 10); // not being super strict
|
235
|
+
}
|
236
|
+
|
237
|
+
TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") {
|
238
|
+
const uint64_t num_bits = 8192;
|
239
|
+
const uint16_t num_hahes = 5;
|
240
|
+
|
241
|
+
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hahes);
|
242
|
+
auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hahes, bf1.get_seed());
|
243
|
+
|
244
|
+
const uint64_t n = 1024;
|
245
|
+
const uint32_t max_item = 3 * n / 2 - 1;
|
246
|
+
for (uint64_t i = 0; i < n; ++i) {
|
247
|
+
bf1.update(i);
|
248
|
+
bf2.update(n / 2 + i);
|
249
|
+
}
|
250
|
+
|
251
|
+
bf1.intersect(bf2);
|
252
|
+
// overlap bits should all be set
|
253
|
+
for (uint64_t i = n / 2; i < n; ++i) {
|
254
|
+
REQUIRE(bf1.query(i));
|
255
|
+
}
|
256
|
+
|
257
|
+
uint32_t num_found = 0;
|
258
|
+
for (uint64_t i = 0; i < n / 2; ++i) {
|
259
|
+
if (bf1.query(i)) {
|
260
|
+
++num_found;
|
261
|
+
}
|
262
|
+
}
|
263
|
+
for (uint64_t i = max_item; i < num_bits; ++i) {
|
264
|
+
if (bf1.query(i)) {
|
265
|
+
++num_found;
|
266
|
+
}
|
267
|
+
}
|
268
|
+
|
269
|
+
REQUIRE(num_found < num_bits / 10); // not being super strict
|
270
|
+
}
|
271
|
+
|
272
|
+
TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
|
273
|
+
const uint64_t num_bits = 32769;
|
274
|
+
const uint16_t num_hashes = 7;
|
275
|
+
|
276
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
277
|
+
auto bytes = bf.serialize();
|
278
|
+
REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
|
279
|
+
|
280
|
+
auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
|
281
|
+
REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
|
282
|
+
REQUIRE(bf.get_seed() == bf_bytes.get_seed());
|
283
|
+
REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
|
284
|
+
REQUIRE(bf_bytes.is_empty());
|
285
|
+
|
286
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
287
|
+
bf.serialize(ss);
|
288
|
+
auto bf_stream = bloom_filter::deserialize(ss);
|
289
|
+
REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
|
290
|
+
REQUIRE(bf.get_seed() == bf_stream.get_seed());
|
291
|
+
REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
|
292
|
+
REQUIRE(bf_stream.is_empty());
|
293
|
+
|
294
|
+
// read-only wrap should work
|
295
|
+
auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
|
296
|
+
REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
|
297
|
+
REQUIRE(bf.get_seed() == bf_wrap.get_seed());
|
298
|
+
REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
|
299
|
+
REQUIRE(bf_wrap.is_empty());
|
300
|
+
|
301
|
+
// writable wrap should not
|
302
|
+
REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument);
|
303
|
+
}
|
304
|
+
|
305
|
+
TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
|
306
|
+
const uint64_t num_bits = 32768;
|
307
|
+
const uint16_t num_hashes = 5;
|
308
|
+
|
309
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
310
|
+
const uint64_t n = 1000;
|
311
|
+
for (uint64_t i = 0; i < n; ++i) {
|
312
|
+
bf.update(0.5 + i); // testing floats
|
313
|
+
}
|
314
|
+
|
315
|
+
// test more items without updating, assuming some false positives
|
316
|
+
// so we can check that we get the same number of false positives
|
317
|
+
// with the same query items
|
318
|
+
uint64_t fp_count = 0;
|
319
|
+
for (uint64_t i = n; i < num_bits; ++i) {
|
320
|
+
fp_count += bf.query(0.5 + i) ? 1 : 0;
|
321
|
+
}
|
322
|
+
|
323
|
+
auto bytes = bf.serialize();
|
324
|
+
REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
|
325
|
+
|
326
|
+
auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
|
327
|
+
REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
|
328
|
+
REQUIRE(bf.get_seed() == bf_bytes.get_seed());
|
329
|
+
REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
|
330
|
+
REQUIRE(!bf_bytes.is_empty());
|
331
|
+
REQUIRE(bf.is_memory_owned());
|
332
|
+
uint64_t fp_count_bytes = 0;
|
333
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
334
|
+
bool val = bf_bytes.query(0.5 + i);
|
335
|
+
if (i < n)
|
336
|
+
REQUIRE(val);
|
337
|
+
else if (val)
|
338
|
+
++fp_count_bytes;
|
339
|
+
}
|
340
|
+
REQUIRE(fp_count_bytes == fp_count);
|
341
|
+
|
342
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
343
|
+
bf.serialize(ss);
|
344
|
+
auto bf_stream = bloom_filter::deserialize(ss);
|
345
|
+
REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
|
346
|
+
REQUIRE(bf.get_seed() == bf_stream.get_seed());
|
347
|
+
REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
|
348
|
+
REQUIRE(!bf_stream.is_empty());
|
349
|
+
REQUIRE(bf_stream.is_memory_owned());
|
350
|
+
uint64_t fp_count_stream = 0;
|
351
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
352
|
+
bool val = bf_stream.query(0.5 + i);
|
353
|
+
if (i < n)
|
354
|
+
REQUIRE(val);
|
355
|
+
else if (val)
|
356
|
+
++fp_count_stream;
|
357
|
+
}
|
358
|
+
REQUIRE(fp_count_stream == fp_count);
|
359
|
+
|
360
|
+
// read-only wrap
|
361
|
+
auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
|
362
|
+
REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
|
363
|
+
REQUIRE(bf.get_seed() == bf_wrap.get_seed());
|
364
|
+
REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
|
365
|
+
REQUIRE(!bf_wrap.is_empty());
|
366
|
+
REQUIRE(!bf_wrap.is_memory_owned());
|
367
|
+
uint64_t fp_count_wrap = 0;
|
368
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
369
|
+
bool val = bf_wrap.query(0.5 + i);
|
370
|
+
if (i < n)
|
371
|
+
REQUIRE(val);
|
372
|
+
else if (val)
|
373
|
+
++fp_count_wrap;
|
374
|
+
}
|
375
|
+
REQUIRE(fp_count_wrap == fp_count);
|
376
|
+
REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
|
377
|
+
REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
|
378
|
+
REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);
|
379
|
+
|
380
|
+
// writable wrap
|
381
|
+
auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
|
382
|
+
REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
|
383
|
+
REQUIRE(bf.get_seed() == bf_writable.get_seed());
|
384
|
+
REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
|
385
|
+
REQUIRE(!bf_writable.is_empty());
|
386
|
+
REQUIRE(!bf_writable.is_memory_owned());
|
387
|
+
uint64_t fp_count_writable = 0;
|
388
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
389
|
+
bool val = bf_writable.query(0.5 + i);
|
390
|
+
if (i < n)
|
391
|
+
REQUIRE(val);
|
392
|
+
else if (val)
|
393
|
+
++fp_count_writable;
|
394
|
+
}
|
395
|
+
REQUIRE(fp_count_writable == fp_count);
|
396
|
+
|
397
|
+
REQUIRE(!bf_writable.query(-1.0));
|
398
|
+
bf_writable.update(-1.0);
|
399
|
+
REQUIRE(bf_writable.query(-1.0));
|
400
|
+
|
401
|
+
// not good memory management to do this, but because we wrapped the same bytes as both
|
402
|
+
// read-only and writable, that update should have changed the read-only version, too
|
403
|
+
REQUIRE(bf_wrap.query(-1.0));
|
404
|
+
}
|
405
|
+
|
406
|
+
} // namespace datasketches
|
@@ -89,6 +89,7 @@ public:
|
|
89
89
|
using vector_t = std::vector<T, Allocator>;
|
90
90
|
using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
|
91
91
|
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
|
92
|
+
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
|
92
93
|
|
93
94
|
struct centroid_cmp {
|
94
95
|
centroid_cmp() {}
|
@@ -115,7 +116,7 @@ public:
|
|
115
116
|
* Merge the given t-Digest into this one
|
116
117
|
* @param other t-Digest to merge
|
117
118
|
*/
|
118
|
-
void merge(tdigest& other);
|
119
|
+
void merge(const tdigest& other);
|
119
120
|
|
120
121
|
/**
|
121
122
|
* Process buffered values and merge centroids if needed
|
@@ -142,8 +143,17 @@ public:
|
|
142
143
|
*/
|
143
144
|
uint64_t get_total_weight() const;
|
144
145
|
|
146
|
+
/**
|
147
|
+
* Returns an instance of the allocator for this t-Digest.
|
148
|
+
* @return allocator
|
149
|
+
*/
|
150
|
+
Allocator get_allocator() const;
|
151
|
+
|
145
152
|
/**
|
146
153
|
* Compute approximate normalized rank of the given value.
|
154
|
+
*
|
155
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
156
|
+
*
|
147
157
|
* @param value to be ranked
|
148
158
|
* @return normalized rank (from 0 to 1 inclusive)
|
149
159
|
*/
|
@@ -151,11 +161,49 @@ public:
|
|
151
161
|
|
152
162
|
/**
|
153
163
|
* Compute approximate quantile value corresponding to the given normalized rank
|
164
|
+
*
|
165
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
166
|
+
*
|
154
167
|
* @param rank normalized rank (from 0 to 1 inclusive)
|
155
168
|
* @return quantile value corresponding to the given rank
|
156
169
|
*/
|
157
170
|
T get_quantile(double rank) const;
|
158
171
|
|
172
|
+
/**
|
173
|
+
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
|
174
|
+
* given a set of split points.
|
175
|
+
*
|
176
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
177
|
+
*
|
178
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing values
|
179
|
+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
|
180
|
+
*
|
181
|
+
* @param size the number of split points in the array
|
182
|
+
*
|
183
|
+
* @return an array of m+1 doubles each of which is an approximation
|
184
|
+
* to the fraction of the input stream values (the mass) that fall into one of those intervals.
|
185
|
+
*/
|
186
|
+
vector_double get_PMF(const T* split_points, uint32_t size) const;
|
187
|
+
|
188
|
+
/**
|
189
|
+
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
|
190
|
+
* cumulative analog of the PMF, of the input stream given a set of split points.
|
191
|
+
*
|
192
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
193
|
+
*
|
194
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing values
|
195
|
+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
|
196
|
+
*
|
197
|
+
* @param size the number of split points in the array
|
198
|
+
*
|
199
|
+
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
|
200
|
+
* of the input stream given the split_points. The value at array position j of the returned
|
201
|
+
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
|
202
|
+
* array. This can be viewed as an array of ranks of the given split points plus one more value
|
203
|
+
* that is always 1.
|
204
|
+
*/
|
205
|
+
vector_double get_CDF(const T* split_points, uint32_t size) const;
|
206
|
+
|
159
207
|
/**
|
160
208
|
* @return parameter k (compression) that was used to configure this t-Digest
|
161
209
|
*/
|
@@ -245,6 +293,8 @@ private:
|
|
245
293
|
// for compatibility with format of the reference implementation
|
246
294
|
static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
|
247
295
|
static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
|
296
|
+
|
297
|
+
static inline void check_split_points(const T* values, uint32_t size);
|
248
298
|
};
|
249
299
|
|
250
300
|
} /* namespace datasketches */
|
@@ -20,6 +20,7 @@
|
|
20
20
|
#ifndef _TDIGEST_IMPL_HPP_
|
21
21
|
#define _TDIGEST_IMPL_HPP_
|
22
22
|
|
23
|
+
#include <algorithm>
|
23
24
|
#include <cmath>
|
24
25
|
#include <sstream>
|
25
26
|
|
@@ -43,7 +44,7 @@ void tdigest<T, A>::update(T value) {
|
|
43
44
|
}
|
44
45
|
|
45
46
|
template<typename T, typename A>
|
46
|
-
void tdigest<T, A>::merge(tdigest& other) {
|
47
|
+
void tdigest<T, A>::merge(const tdigest& other) {
|
47
48
|
if (other.is_empty()) return;
|
48
49
|
vector_centroid tmp(buffer_.get_allocator());
|
49
50
|
tmp.reserve(buffer_.size() + centroids_.size() + other.buffer_.size() + other.centroids_.size());
|
@@ -84,6 +85,11 @@ uint64_t tdigest<T, A>::get_total_weight() const {
|
|
84
85
|
return centroids_weight_ + buffer_.size();
|
85
86
|
}
|
86
87
|
|
88
|
+
template<typename T, typename A>
|
89
|
+
A tdigest<T, A>::get_allocator() const {
|
90
|
+
return buffer_.get_allocator();
|
91
|
+
}
|
92
|
+
|
87
93
|
template<typename T, typename A>
|
88
94
|
double tdigest<T, A>::get_rank(T value) const {
|
89
95
|
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
@@ -190,6 +196,25 @@ T tdigest<T, A>::get_quantile(double rank) const {
|
|
190
196
|
return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
|
191
197
|
}
|
192
198
|
|
199
|
+
template<typename T, typename A>
|
200
|
+
auto tdigest<T, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
|
201
|
+
auto buckets = get_CDF(split_points, size);
|
202
|
+
for (uint32_t i = size; i > 0; --i) {
|
203
|
+
buckets[i] -= buckets[i - 1];
|
204
|
+
}
|
205
|
+
return buckets;
|
206
|
+
}
|
207
|
+
|
208
|
+
template<typename T, typename A>
|
209
|
+
auto tdigest<T, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
|
210
|
+
check_split_points(split_points, size);
|
211
|
+
vector_double ranks(get_allocator());
|
212
|
+
ranks.reserve(size + 1);
|
213
|
+
for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i]));
|
214
|
+
ranks.push_back(1);
|
215
|
+
return ranks;
|
216
|
+
}
|
217
|
+
|
193
218
|
template<typename T, typename A>
|
194
219
|
uint16_t tdigest<T, A>::get_k() const {
|
195
220
|
return k_;
|
@@ -590,6 +615,18 @@ buffer_(std::move(buffer))
|
|
590
615
|
buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
|
591
616
|
}
|
592
617
|
|
618
|
+
template<typename T, typename A>
|
619
|
+
void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
|
620
|
+
for (uint32_t i = 0; i < size ; i++) {
|
621
|
+
if (std::isnan(values[i])) {
|
622
|
+
throw std::invalid_argument("Values must not be NaN");
|
623
|
+
}
|
624
|
+
if ((i < (size - 1)) && !(values[i] < values[i + 1])) {
|
625
|
+
throw std::invalid_argument("Values must be unique and monotonically increasing");
|
626
|
+
}
|
627
|
+
}
|
628
|
+
}
|
629
|
+
|
593
630
|
} /* namespace datasketches */
|
594
631
|
|
595
632
|
#endif // _TDIGEST_IMPL_HPP_
|
@@ -35,6 +35,9 @@ TEST_CASE("empty", "[tdigest]") {
|
|
35
35
|
REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
|
36
36
|
REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
|
37
37
|
REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
|
38
|
+
const double split_points[1] {0};
|
39
|
+
REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
|
40
|
+
REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
|
38
41
|
}
|
39
42
|
|
40
43
|
TEST_CASE("one value", "[tdigest]") {
|
@@ -56,9 +59,6 @@ TEST_CASE("many values", "[tdigest]") {
|
|
56
59
|
const size_t n = 10000;
|
57
60
|
tdigest_double td;
|
58
61
|
for (size_t i = 0; i < n; ++i) td.update(i);
|
59
|
-
// std::cout << td.to_string(true);
|
60
|
-
// td.compress();
|
61
|
-
// std::cout << td.to_string(true);
|
62
62
|
REQUIRE_FALSE(td.is_empty());
|
63
63
|
REQUIRE(td.get_total_weight() == n);
|
64
64
|
REQUIRE(td.get_min_value() == 0);
|
@@ -73,6 +73,15 @@ TEST_CASE("many values", "[tdigest]") {
|
|
73
73
|
REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
|
74
74
|
REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
|
75
75
|
REQUIRE(td.get_quantile(1) == n - 1);
|
76
|
+
const double split_points[1] {n / 2};
|
77
|
+
const auto pmf = td.get_PMF(split_points, 1);
|
78
|
+
REQUIRE(pmf.size() == 2);
|
79
|
+
REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
|
80
|
+
REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
|
81
|
+
const auto cdf = td.get_CDF(split_points, 1);
|
82
|
+
REQUIRE(cdf.size() == 2);
|
83
|
+
REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
|
84
|
+
REQUIRE(cdf[1] == 1);
|
76
85
|
}
|
77
86
|
|
78
87
|
TEST_CASE("rank - two values", "[tdigest]") {
|
@@ -329,7 +329,7 @@ static inline void pack_bits_13(const uint64_t* values, uint8_t* ptr) {
|
|
329
329
|
|
330
330
|
*ptr++ = static_cast<uint8_t>(values[3] >> 4);
|
331
331
|
|
332
|
-
*ptr = static_cast<uint8_t>(values[3]
|
332
|
+
*ptr = static_cast<uint8_t>(values[3] << 4);
|
333
333
|
*ptr++ |= static_cast<uint8_t>(values[4] >> 9);
|
334
334
|
|
335
335
|
*ptr++ = static_cast<uint8_t>(values[4] >> 1);
|
@@ -4227,7 +4227,7 @@ static inline void unpack_bits_33(uint64_t* values, const uint8_t* ptr) {
|
|
4227
4227
|
values[6] |= *ptr >> 1;
|
4228
4228
|
|
4229
4229
|
values[7] = static_cast<uint64_t>(*ptr++ & 1) << 32;
|
4230
|
-
values[7] |= *ptr++ << 24;
|
4230
|
+
values[7] |= static_cast<uint64_t>(*ptr++) << 24;
|
4231
4231
|
values[7] |= *ptr++ << 16;
|
4232
4232
|
values[7] |= *ptr++ << 8;
|
4233
4233
|
values[7] |= *ptr;
|
@@ -4296,7 +4296,7 @@ static inline void unpack_bits_35(uint64_t* values, const uint8_t* ptr) {
|
|
4296
4296
|
values[1] |= *ptr++ << 6;
|
4297
4297
|
values[1] |= *ptr >> 2;
|
4298
4298
|
|
4299
|
-
values[2] = static_cast<uint64_t>(*ptr++ &
|
4299
|
+
values[2] = static_cast<uint64_t>(*ptr++ & 3) << 33;
|
4300
4300
|
values[2] |= static_cast<uint64_t>(*ptr++) << 25;
|
4301
4301
|
values[2] |= *ptr++ << 17;
|
4302
4302
|
values[2] |= *ptr++ << 9;
|
@@ -6201,7 +6201,7 @@ static inline void pack_bits_block8(const uint64_t* values, uint8_t* ptr, uint8_
|
|
6201
6201
|
case 61: pack_bits_61(values, ptr); break;
|
6202
6202
|
case 62: pack_bits_62(values, ptr); break;
|
6203
6203
|
case 63: pack_bits_63(values, ptr); break;
|
6204
|
-
default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
|
6204
|
+
default: throw std::logic_error("wrong number of bits in pack_bits_block8: " + std::to_string(bits));
|
6205
6205
|
}
|
6206
6206
|
}
|
6207
6207
|
|
@@ -6270,7 +6270,7 @@ static inline void unpack_bits_block8(uint64_t* values, const uint8_t* ptr, uint
|
|
6270
6270
|
case 61: unpack_bits_61(values, ptr); break;
|
6271
6271
|
case 62: unpack_bits_62(values, ptr); break;
|
6272
6272
|
case 63: unpack_bits_63(values, ptr); break;
|
6273
|
-
default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
|
6273
|
+
default: throw std::logic_error("wrong number of bits in unpack_bits_block8: " + std::to_string(bits));
|
6274
6274
|
}
|
6275
6275
|
}
|
6276
6276
|
|