datasketches 0.4.3 → 0.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/datasketches/vo_wrapper.cpp +1 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/LICENSE +35 -7
- data/vendor/datasketches-cpp/NOTICE +2 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
- data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
- data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
- data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
- data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
- data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
- data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
- data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
- data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
- data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +17 -9
@@ -0,0 +1,406 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <catch2/catch.hpp>
|
21
|
+
|
22
|
+
#include "bloom_filter.hpp"
|
23
|
+
|
24
|
+
#ifdef TEST_BINARY_INPUT_PATH
|
25
|
+
static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
|
26
|
+
#else
|
27
|
+
static std::string testBinaryInputPath = "test/";
|
28
|
+
#endif
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") {
|
33
|
+
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument);
|
34
|
+
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1L << 60, 4), std::invalid_argument);
|
35
|
+
REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument);
|
36
|
+
}
|
37
|
+
|
38
|
+
TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
|
39
|
+
uint64_t num_items = 4000;
|
40
|
+
double fpp = 0.01;
|
41
|
+
|
42
|
+
uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
|
43
|
+
uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
|
44
|
+
uint64_t seed = 89023;
|
45
|
+
|
46
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
|
47
|
+
uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64
|
48
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
49
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
50
|
+
REQUIRE(bf.get_seed() == seed);
|
51
|
+
REQUIRE(bf.is_empty());
|
52
|
+
|
53
|
+
// should match above
|
54
|
+
bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
|
55
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
56
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
57
|
+
REQUIRE(bf.get_seed() == seed);
|
58
|
+
REQUIRE(bf.is_empty());
|
59
|
+
|
60
|
+
// same for initializing memory in-place
|
61
|
+
size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
|
62
|
+
uint8_t* bytes = new uint8_t[serialized_size_bytes];
|
63
|
+
|
64
|
+
bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
|
65
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
66
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
67
|
+
REQUIRE(bf.get_seed() == seed);
|
68
|
+
REQUIRE(bf.is_empty());
|
69
|
+
|
70
|
+
bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
|
71
|
+
REQUIRE(bf.get_capacity() == adjusted_num_bits);
|
72
|
+
REQUIRE(bf.get_num_hashes() == num_hashes);
|
73
|
+
REQUIRE(bf.get_seed() == seed);
|
74
|
+
REQUIRE(bf.is_empty());
|
75
|
+
|
76
|
+
delete [] bytes;
|
77
|
+
}
|
78
|
+
|
79
|
+
TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
|
80
|
+
uint64_t num_items = 5000;
|
81
|
+
double fpp = 0.01;
|
82
|
+
uint64_t seed = 4897301548054ULL;
|
83
|
+
|
84
|
+
auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
|
85
|
+
REQUIRE(bf.is_empty());
|
86
|
+
REQUIRE(bf.get_bits_used() == 0);
|
87
|
+
|
88
|
+
for (uint64_t i = 0; i < num_items; ++i) {
|
89
|
+
bf.query_and_update(i);
|
90
|
+
}
|
91
|
+
|
92
|
+
REQUIRE(!bf.is_empty());
|
93
|
+
// filter is about 50% full at target capacity
|
94
|
+
// since seed is fixed we expect an exact value every time
|
95
|
+
// but leaving the approximate test in since that's more the "expectation"
|
96
|
+
REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
|
97
|
+
REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
|
98
|
+
|
99
|
+
uint32_t num_found = 0;
|
100
|
+
for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
|
101
|
+
if (bf.query(i)) {
|
102
|
+
++num_found;
|
103
|
+
}
|
104
|
+
}
|
105
|
+
// fpp is average with significant variance -- even at 12% it would fail occasionally
|
106
|
+
REQUIRE(num_found == 423);
|
107
|
+
//REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
|
108
|
+
auto bytes = bf.serialize();
|
109
|
+
|
110
|
+
// initialize in memory and run the same tests
|
111
|
+
// also checking against the results from the first part
|
112
|
+
uint8_t* bf_memory = new uint8_t[bytes.size()];
|
113
|
+
auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
|
114
|
+
REQUIRE(bf2.is_empty());
|
115
|
+
REQUIRE(bf2.get_bits_used() == 0);
|
116
|
+
|
117
|
+
for (uint64_t i = 0; i < num_items; ++i) {
|
118
|
+
bf2.query_and_update(i);
|
119
|
+
}
|
120
|
+
|
121
|
+
REQUIRE(!bf2.is_empty());
|
122
|
+
REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
|
123
|
+
|
124
|
+
uint32_t num_found2 = 0;
|
125
|
+
for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
|
126
|
+
if (bf2.query(i)) {
|
127
|
+
++num_found2;
|
128
|
+
}
|
129
|
+
}
|
130
|
+
REQUIRE(num_found == num_found2); // should exactly match above
|
131
|
+
auto bytes2 = bf2.serialize();
|
132
|
+
|
133
|
+
REQUIRE(bytes.size() == bytes2.size());
|
134
|
+
for (size_t i = 0; i < bytes.size(); ++i) {
|
135
|
+
REQUIRE(bytes[i] == bytes2[i]);
|
136
|
+
}
|
137
|
+
|
138
|
+
// check that raw memory also matches serialized sketch
|
139
|
+
const uint8_t* bf_bytes = bf2.get_wrapped_memory();
|
140
|
+
REQUIRE(bf_bytes == bf_memory);
|
141
|
+
for (size_t i = 0; i < bytes.size(); ++i) {
|
142
|
+
REQUIRE(bf_bytes[i] == bytes[i]);
|
143
|
+
}
|
144
|
+
|
145
|
+
// ensure the filters reset properly
|
146
|
+
bf.reset();
|
147
|
+
REQUIRE(bf.is_empty());
|
148
|
+
REQUIRE(bf.get_bits_used() == 0);
|
149
|
+
|
150
|
+
bf2.reset();
|
151
|
+
REQUIRE(bf2.is_empty());
|
152
|
+
REQUIRE(bf2.get_bits_used() == 0);
|
153
|
+
|
154
|
+
delete [] bf_memory;
|
155
|
+
}
|
156
|
+
|
157
|
+
TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
|
158
|
+
uint64_t num_bits = 8192;
|
159
|
+
uint16_t num_hashes = 3;
|
160
|
+
|
161
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
162
|
+
|
163
|
+
uint64_t n = 500;
|
164
|
+
for (uint64_t i = 0; i < n; ++i) {
|
165
|
+
bf.update(i);
|
166
|
+
}
|
167
|
+
uint64_t num_bits_set = bf.get_bits_used();
|
168
|
+
bf.invert();
|
169
|
+
REQUIRE(bf.get_bits_used() == num_bits - num_bits_set);
|
170
|
+
|
171
|
+
// original items should be mostly not-present
|
172
|
+
uint32_t num_found = 0;
|
173
|
+
for (uint64_t i = 0; i < n; ++i) {
|
174
|
+
if (bf.query(i)) {
|
175
|
+
++num_found;
|
176
|
+
}
|
177
|
+
}
|
178
|
+
REQUIRE(num_found < n / 10);
|
179
|
+
|
180
|
+
// many other items should be "present"
|
181
|
+
num_found = 0;
|
182
|
+
for (uint64_t i = n; i < num_bits; ++i) {
|
183
|
+
if (bf.query(i)) {
|
184
|
+
++num_found;
|
185
|
+
}
|
186
|
+
}
|
187
|
+
REQUIRE(num_found > n);
|
188
|
+
}
|
189
|
+
|
190
|
+
TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") {
|
191
|
+
uint64_t num_bits = 32768;
|
192
|
+
uint16_t num_hashes = 4;
|
193
|
+
|
194
|
+
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
195
|
+
|
196
|
+
// mismatched num bits
|
197
|
+
auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes);
|
198
|
+
REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);
|
199
|
+
|
200
|
+
// mismatched num hashes
|
201
|
+
auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes);
|
202
|
+
REQUIRE_THROWS_AS(bf1.intersect(bf2), std::invalid_argument);
|
203
|
+
|
204
|
+
// mismatched seed
|
205
|
+
auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
|
206
|
+
REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
|
207
|
+
}
|
208
|
+
|
209
|
+
TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
|
210
|
+
const uint64_t num_bits = 12288;
|
211
|
+
const uint16_t num_hashes = 4;
|
212
|
+
|
213
|
+
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
214
|
+
auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
|
215
|
+
|
216
|
+
const uint64_t n = 1000;
|
217
|
+
const uint32_t max_item = 3 * n / 2 - 1;
|
218
|
+
for (uint64_t i = 0; i < n; ++i) {
|
219
|
+
bf1.query_and_update(i);
|
220
|
+
bf2.update(n / 2 + i);
|
221
|
+
}
|
222
|
+
|
223
|
+
bf1.union_with(bf2);
|
224
|
+
for (uint64_t i = 0; i < max_item; ++i) {
|
225
|
+
REQUIRE(bf1.query(i));
|
226
|
+
}
|
227
|
+
|
228
|
+
uint32_t num_found = 0;
|
229
|
+
for (uint64_t i = max_item; i < num_bits; ++i) {
|
230
|
+
if (bf1.query(i)) {
|
231
|
+
++num_found;
|
232
|
+
}
|
233
|
+
}
|
234
|
+
REQUIRE(num_found < num_bits / 10); // not being super strict
|
235
|
+
}
|
236
|
+
|
237
|
+
TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") {
|
238
|
+
const uint64_t num_bits = 8192;
|
239
|
+
const uint16_t num_hahes = 5;
|
240
|
+
|
241
|
+
auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hahes);
|
242
|
+
auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hahes, bf1.get_seed());
|
243
|
+
|
244
|
+
const uint64_t n = 1024;
|
245
|
+
const uint32_t max_item = 3 * n / 2 - 1;
|
246
|
+
for (uint64_t i = 0; i < n; ++i) {
|
247
|
+
bf1.update(i);
|
248
|
+
bf2.update(n / 2 + i);
|
249
|
+
}
|
250
|
+
|
251
|
+
bf1.intersect(bf2);
|
252
|
+
// overlap bits should all be set
|
253
|
+
for (uint64_t i = n / 2; i < n; ++i) {
|
254
|
+
REQUIRE(bf1.query(i));
|
255
|
+
}
|
256
|
+
|
257
|
+
uint32_t num_found = 0;
|
258
|
+
for (uint64_t i = 0; i < n / 2; ++i) {
|
259
|
+
if (bf1.query(i)) {
|
260
|
+
++num_found;
|
261
|
+
}
|
262
|
+
}
|
263
|
+
for (uint64_t i = max_item; i < num_bits; ++i) {
|
264
|
+
if (bf1.query(i)) {
|
265
|
+
++num_found;
|
266
|
+
}
|
267
|
+
}
|
268
|
+
|
269
|
+
REQUIRE(num_found < num_bits / 10); // not being super strict
|
270
|
+
}
|
271
|
+
|
272
|
+
TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
|
273
|
+
const uint64_t num_bits = 32769;
|
274
|
+
const uint16_t num_hashes = 7;
|
275
|
+
|
276
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
277
|
+
auto bytes = bf.serialize();
|
278
|
+
REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
|
279
|
+
|
280
|
+
auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
|
281
|
+
REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
|
282
|
+
REQUIRE(bf.get_seed() == bf_bytes.get_seed());
|
283
|
+
REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
|
284
|
+
REQUIRE(bf_bytes.is_empty());
|
285
|
+
|
286
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
287
|
+
bf.serialize(ss);
|
288
|
+
auto bf_stream = bloom_filter::deserialize(ss);
|
289
|
+
REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
|
290
|
+
REQUIRE(bf.get_seed() == bf_stream.get_seed());
|
291
|
+
REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
|
292
|
+
REQUIRE(bf_stream.is_empty());
|
293
|
+
|
294
|
+
// read-only wrap should work
|
295
|
+
auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
|
296
|
+
REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
|
297
|
+
REQUIRE(bf.get_seed() == bf_wrap.get_seed());
|
298
|
+
REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
|
299
|
+
REQUIRE(bf_wrap.is_empty());
|
300
|
+
|
301
|
+
// writable wrap should not
|
302
|
+
REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument);
|
303
|
+
}
|
304
|
+
|
305
|
+
TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
|
306
|
+
const uint64_t num_bits = 32768;
|
307
|
+
const uint16_t num_hashes = 5;
|
308
|
+
|
309
|
+
auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
|
310
|
+
const uint64_t n = 1000;
|
311
|
+
for (uint64_t i = 0; i < n; ++i) {
|
312
|
+
bf.update(0.5 + i); // testing floats
|
313
|
+
}
|
314
|
+
|
315
|
+
// test more items without updating, assuming some false positives
|
316
|
+
// so we can check that we get the same number of false positives
|
317
|
+
// with the same query items
|
318
|
+
uint64_t fp_count = 0;
|
319
|
+
for (uint64_t i = n; i < num_bits; ++i) {
|
320
|
+
fp_count += bf.query(0.5 + i) ? 1 : 0;
|
321
|
+
}
|
322
|
+
|
323
|
+
auto bytes = bf.serialize();
|
324
|
+
REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
|
325
|
+
|
326
|
+
auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
|
327
|
+
REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
|
328
|
+
REQUIRE(bf.get_seed() == bf_bytes.get_seed());
|
329
|
+
REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
|
330
|
+
REQUIRE(!bf_bytes.is_empty());
|
331
|
+
REQUIRE(bf.is_memory_owned());
|
332
|
+
uint64_t fp_count_bytes = 0;
|
333
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
334
|
+
bool val = bf_bytes.query(0.5 + i);
|
335
|
+
if (i < n)
|
336
|
+
REQUIRE(val);
|
337
|
+
else if (val)
|
338
|
+
++fp_count_bytes;
|
339
|
+
}
|
340
|
+
REQUIRE(fp_count_bytes == fp_count);
|
341
|
+
|
342
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
343
|
+
bf.serialize(ss);
|
344
|
+
auto bf_stream = bloom_filter::deserialize(ss);
|
345
|
+
REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
|
346
|
+
REQUIRE(bf.get_seed() == bf_stream.get_seed());
|
347
|
+
REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
|
348
|
+
REQUIRE(!bf_stream.is_empty());
|
349
|
+
REQUIRE(bf_stream.is_memory_owned());
|
350
|
+
uint64_t fp_count_stream = 0;
|
351
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
352
|
+
bool val = bf_stream.query(0.5 + i);
|
353
|
+
if (i < n)
|
354
|
+
REQUIRE(val);
|
355
|
+
else if (val)
|
356
|
+
++fp_count_stream;
|
357
|
+
}
|
358
|
+
REQUIRE(fp_count_stream == fp_count);
|
359
|
+
|
360
|
+
// read-only wrap
|
361
|
+
auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
|
362
|
+
REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
|
363
|
+
REQUIRE(bf.get_seed() == bf_wrap.get_seed());
|
364
|
+
REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
|
365
|
+
REQUIRE(!bf_wrap.is_empty());
|
366
|
+
REQUIRE(!bf_wrap.is_memory_owned());
|
367
|
+
uint64_t fp_count_wrap = 0;
|
368
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
369
|
+
bool val = bf_wrap.query(0.5 + i);
|
370
|
+
if (i < n)
|
371
|
+
REQUIRE(val);
|
372
|
+
else if (val)
|
373
|
+
++fp_count_wrap;
|
374
|
+
}
|
375
|
+
REQUIRE(fp_count_wrap == fp_count);
|
376
|
+
REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
|
377
|
+
REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
|
378
|
+
REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);
|
379
|
+
|
380
|
+
// writable wrap
|
381
|
+
auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
|
382
|
+
REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
|
383
|
+
REQUIRE(bf.get_seed() == bf_writable.get_seed());
|
384
|
+
REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
|
385
|
+
REQUIRE(!bf_writable.is_empty());
|
386
|
+
REQUIRE(!bf_writable.is_memory_owned());
|
387
|
+
uint64_t fp_count_writable = 0;
|
388
|
+
for (uint64_t i = 0; i < num_bits; ++i) {
|
389
|
+
bool val = bf_writable.query(0.5 + i);
|
390
|
+
if (i < n)
|
391
|
+
REQUIRE(val);
|
392
|
+
else if (val)
|
393
|
+
++fp_count_writable;
|
394
|
+
}
|
395
|
+
REQUIRE(fp_count_writable == fp_count);
|
396
|
+
|
397
|
+
REQUIRE(!bf_writable.query(-1.0));
|
398
|
+
bf_writable.update(-1.0);
|
399
|
+
REQUIRE(bf_writable.query(-1.0));
|
400
|
+
|
401
|
+
// not good memory management to do this, but because we wrapped the same bytes as both
|
402
|
+
// read-only and writable, that update should have changed the read-only version, too
|
403
|
+
REQUIRE(bf_wrap.query(-1.0));
|
404
|
+
}
|
405
|
+
|
406
|
+
} // namespace datasketches
|
@@ -89,6 +89,7 @@ public:
|
|
89
89
|
using vector_t = std::vector<T, Allocator>;
|
90
90
|
using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
|
91
91
|
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
|
92
|
+
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
|
92
93
|
|
93
94
|
struct centroid_cmp {
|
94
95
|
centroid_cmp() {}
|
@@ -115,7 +116,7 @@ public:
|
|
115
116
|
* Merge the given t-Digest into this one
|
116
117
|
* @param other t-Digest to merge
|
117
118
|
*/
|
118
|
-
void merge(tdigest& other);
|
119
|
+
void merge(const tdigest& other);
|
119
120
|
|
120
121
|
/**
|
121
122
|
* Process buffered values and merge centroids if needed
|
@@ -142,8 +143,17 @@ public:
|
|
142
143
|
*/
|
143
144
|
uint64_t get_total_weight() const;
|
144
145
|
|
146
|
+
/**
|
147
|
+
* Returns an instance of the allocator for this t-Digest.
|
148
|
+
* @return allocator
|
149
|
+
*/
|
150
|
+
Allocator get_allocator() const;
|
151
|
+
|
145
152
|
/**
|
146
153
|
* Compute approximate normalized rank of the given value.
|
154
|
+
*
|
155
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
156
|
+
*
|
147
157
|
* @param value to be ranked
|
148
158
|
* @return normalized rank (from 0 to 1 inclusive)
|
149
159
|
*/
|
@@ -151,11 +161,49 @@ public:
|
|
151
161
|
|
152
162
|
/**
|
153
163
|
* Compute approximate quantile value corresponding to the given normalized rank
|
164
|
+
*
|
165
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
166
|
+
*
|
154
167
|
* @param rank normalized rank (from 0 to 1 inclusive)
|
155
168
|
* @return quantile value corresponding to the given rank
|
156
169
|
*/
|
157
170
|
T get_quantile(double rank) const;
|
158
171
|
|
172
|
+
/**
|
173
|
+
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
|
174
|
+
* given a set of split points.
|
175
|
+
*
|
176
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
177
|
+
*
|
178
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing values
|
179
|
+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
|
180
|
+
*
|
181
|
+
* @param size the number of split points in the array
|
182
|
+
*
|
183
|
+
* @return an array of m+1 doubles each of which is an approximation
|
184
|
+
* to the fraction of the input stream values (the mass) that fall into one of those intervals.
|
185
|
+
*/
|
186
|
+
vector_double get_PMF(const T* split_points, uint32_t size) const;
|
187
|
+
|
188
|
+
/**
|
189
|
+
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
|
190
|
+
* cumulative analog of the PMF, of the input stream given a set of split points.
|
191
|
+
*
|
192
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
193
|
+
*
|
194
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing values
|
195
|
+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
|
196
|
+
*
|
197
|
+
* @param size the number of split points in the array
|
198
|
+
*
|
199
|
+
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
|
200
|
+
* of the input stream given the split_points. The value at array position j of the returned
|
201
|
+
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
|
202
|
+
* array. This can be viewed as an array of ranks of the given split points plus one more value
|
203
|
+
* that is always 1.
|
204
|
+
*/
|
205
|
+
vector_double get_CDF(const T* split_points, uint32_t size) const;
|
206
|
+
|
159
207
|
/**
|
160
208
|
* @return parameter k (compression) that was used to configure this t-Digest
|
161
209
|
*/
|
@@ -245,6 +293,8 @@ private:
|
|
245
293
|
// for compatibility with format of the reference implementation
|
246
294
|
static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
|
247
295
|
static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
|
296
|
+
|
297
|
+
static inline void check_split_points(const T* values, uint32_t size);
|
248
298
|
};
|
249
299
|
|
250
300
|
} /* namespace datasketches */
|
@@ -20,6 +20,7 @@
|
|
20
20
|
#ifndef _TDIGEST_IMPL_HPP_
|
21
21
|
#define _TDIGEST_IMPL_HPP_
|
22
22
|
|
23
|
+
#include <algorithm>
|
23
24
|
#include <cmath>
|
24
25
|
#include <sstream>
|
25
26
|
|
@@ -43,7 +44,7 @@ void tdigest<T, A>::update(T value) {
|
|
43
44
|
}
|
44
45
|
|
45
46
|
template<typename T, typename A>
|
46
|
-
void tdigest<T, A>::merge(tdigest& other) {
|
47
|
+
void tdigest<T, A>::merge(const tdigest& other) {
|
47
48
|
if (other.is_empty()) return;
|
48
49
|
vector_centroid tmp(buffer_.get_allocator());
|
49
50
|
tmp.reserve(buffer_.size() + centroids_.size() + other.buffer_.size() + other.centroids_.size());
|
@@ -84,6 +85,11 @@ uint64_t tdigest<T, A>::get_total_weight() const {
|
|
84
85
|
return centroids_weight_ + buffer_.size();
|
85
86
|
}
|
86
87
|
|
88
|
+
template<typename T, typename A>
|
89
|
+
A tdigest<T, A>::get_allocator() const {
|
90
|
+
return buffer_.get_allocator();
|
91
|
+
}
|
92
|
+
|
87
93
|
template<typename T, typename A>
|
88
94
|
double tdigest<T, A>::get_rank(T value) const {
|
89
95
|
if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
|
@@ -190,6 +196,25 @@ T tdigest<T, A>::get_quantile(double rank) const {
|
|
190
196
|
return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
|
191
197
|
}
|
192
198
|
|
199
|
+
template<typename T, typename A>
|
200
|
+
auto tdigest<T, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
|
201
|
+
auto buckets = get_CDF(split_points, size);
|
202
|
+
for (uint32_t i = size; i > 0; --i) {
|
203
|
+
buckets[i] -= buckets[i - 1];
|
204
|
+
}
|
205
|
+
return buckets;
|
206
|
+
}
|
207
|
+
|
208
|
+
template<typename T, typename A>
|
209
|
+
auto tdigest<T, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
|
210
|
+
check_split_points(split_points, size);
|
211
|
+
vector_double ranks(get_allocator());
|
212
|
+
ranks.reserve(size + 1);
|
213
|
+
for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i]));
|
214
|
+
ranks.push_back(1);
|
215
|
+
return ranks;
|
216
|
+
}
|
217
|
+
|
193
218
|
template<typename T, typename A>
|
194
219
|
uint16_t tdigest<T, A>::get_k() const {
|
195
220
|
return k_;
|
@@ -590,6 +615,18 @@ buffer_(std::move(buffer))
|
|
590
615
|
buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
|
591
616
|
}
|
592
617
|
|
618
|
+
template<typename T, typename A>
|
619
|
+
void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
|
620
|
+
for (uint32_t i = 0; i < size ; i++) {
|
621
|
+
if (std::isnan(values[i])) {
|
622
|
+
throw std::invalid_argument("Values must not be NaN");
|
623
|
+
}
|
624
|
+
if ((i < (size - 1)) && !(values[i] < values[i + 1])) {
|
625
|
+
throw std::invalid_argument("Values must be unique and monotonically increasing");
|
626
|
+
}
|
627
|
+
}
|
628
|
+
}
|
629
|
+
|
593
630
|
} /* namespace datasketches */
|
594
631
|
|
595
632
|
#endif // _TDIGEST_IMPL_HPP_
|
@@ -35,6 +35,9 @@ TEST_CASE("empty", "[tdigest]") {
|
|
35
35
|
REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
|
36
36
|
REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
|
37
37
|
REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
|
38
|
+
const double split_points[1] {0};
|
39
|
+
REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
|
40
|
+
REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
|
38
41
|
}
|
39
42
|
|
40
43
|
TEST_CASE("one value", "[tdigest]") {
|
@@ -56,9 +59,6 @@ TEST_CASE("many values", "[tdigest]") {
|
|
56
59
|
const size_t n = 10000;
|
57
60
|
tdigest_double td;
|
58
61
|
for (size_t i = 0; i < n; ++i) td.update(i);
|
59
|
-
// std::cout << td.to_string(true);
|
60
|
-
// td.compress();
|
61
|
-
// std::cout << td.to_string(true);
|
62
62
|
REQUIRE_FALSE(td.is_empty());
|
63
63
|
REQUIRE(td.get_total_weight() == n);
|
64
64
|
REQUIRE(td.get_min_value() == 0);
|
@@ -73,6 +73,15 @@ TEST_CASE("many values", "[tdigest]") {
|
|
73
73
|
REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
|
74
74
|
REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
|
75
75
|
REQUIRE(td.get_quantile(1) == n - 1);
|
76
|
+
const double split_points[1] {n / 2};
|
77
|
+
const auto pmf = td.get_PMF(split_points, 1);
|
78
|
+
REQUIRE(pmf.size() == 2);
|
79
|
+
REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
|
80
|
+
REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
|
81
|
+
const auto cdf = td.get_CDF(split_points, 1);
|
82
|
+
REQUIRE(cdf.size() == 2);
|
83
|
+
REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
|
84
|
+
REQUIRE(cdf[1] == 1);
|
76
85
|
}
|
77
86
|
|
78
87
|
TEST_CASE("rank - two values", "[tdigest]") {
|
@@ -329,7 +329,7 @@ static inline void pack_bits_13(const uint64_t* values, uint8_t* ptr) {
|
|
329
329
|
|
330
330
|
*ptr++ = static_cast<uint8_t>(values[3] >> 4);
|
331
331
|
|
332
|
-
*ptr = static_cast<uint8_t>(values[3]
|
332
|
+
*ptr = static_cast<uint8_t>(values[3] << 4);
|
333
333
|
*ptr++ |= static_cast<uint8_t>(values[4] >> 9);
|
334
334
|
|
335
335
|
*ptr++ = static_cast<uint8_t>(values[4] >> 1);
|
@@ -4227,7 +4227,7 @@ static inline void unpack_bits_33(uint64_t* values, const uint8_t* ptr) {
|
|
4227
4227
|
values[6] |= *ptr >> 1;
|
4228
4228
|
|
4229
4229
|
values[7] = static_cast<uint64_t>(*ptr++ & 1) << 32;
|
4230
|
-
values[7] |= *ptr++ << 24;
|
4230
|
+
values[7] |= static_cast<uint64_t>(*ptr++) << 24;
|
4231
4231
|
values[7] |= *ptr++ << 16;
|
4232
4232
|
values[7] |= *ptr++ << 8;
|
4233
4233
|
values[7] |= *ptr;
|
@@ -4296,7 +4296,7 @@ static inline void unpack_bits_35(uint64_t* values, const uint8_t* ptr) {
|
|
4296
4296
|
values[1] |= *ptr++ << 6;
|
4297
4297
|
values[1] |= *ptr >> 2;
|
4298
4298
|
|
4299
|
-
values[2] = static_cast<uint64_t>(*ptr++ &
|
4299
|
+
values[2] = static_cast<uint64_t>(*ptr++ & 3) << 33;
|
4300
4300
|
values[2] |= static_cast<uint64_t>(*ptr++) << 25;
|
4301
4301
|
values[2] |= *ptr++ << 17;
|
4302
4302
|
values[2] |= *ptr++ << 9;
|
@@ -6201,7 +6201,7 @@ static inline void pack_bits_block8(const uint64_t* values, uint8_t* ptr, uint8_
|
|
6201
6201
|
case 61: pack_bits_61(values, ptr); break;
|
6202
6202
|
case 62: pack_bits_62(values, ptr); break;
|
6203
6203
|
case 63: pack_bits_63(values, ptr); break;
|
6204
|
-
default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
|
6204
|
+
default: throw std::logic_error("wrong number of bits in pack_bits_block8: " + std::to_string(bits));
|
6205
6205
|
}
|
6206
6206
|
}
|
6207
6207
|
|
@@ -6270,7 +6270,7 @@ static inline void unpack_bits_block8(uint64_t* values, const uint8_t* ptr, uint
|
|
6270
6270
|
case 61: unpack_bits_61(values, ptr); break;
|
6271
6271
|
case 62: unpack_bits_62(values, ptr); break;
|
6272
6272
|
case 63: unpack_bits_63(values, ptr); break;
|
6273
|
-
default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
|
6273
|
+
default: throw std::logic_error("wrong number of bits in unpack_bits_block8: " + std::to_string(bits));
|
6274
6274
|
}
|
6275
6275
|
}
|
6276
6276
|
|