datasketches 0.4.2 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/ext/datasketches/vo_wrapper.cpp +1 -1
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
  8. data/vendor/datasketches-cpp/LICENSE +35 -7
  9. data/vendor/datasketches-cpp/NOTICE +3 -3
  10. data/vendor/datasketches-cpp/README.md +2 -3
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
  12. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  13. data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  15. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  16. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  20. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  21. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  23. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  24. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  25. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  26. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  27. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  28. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  29. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  30. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  31. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  32. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  34. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  36. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  37. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  38. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  39. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  40. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  41. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  42. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  43. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  44. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  45. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
  46. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
  47. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  48. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  49. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  50. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  51. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  52. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  53. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
  54. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  55. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  56. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  57. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  58. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  59. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
  60. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  61. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  62. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  63. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  64. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  65. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  66. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  67. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  68. metadata +27 -9
@@ -0,0 +1,406 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+
22
+ #include "bloom_filter.hpp"
23
+
24
+ #ifdef TEST_BINARY_INPUT_PATH
25
+ static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
26
+ #else
27
+ static std::string testBinaryInputPath = "test/";
28
+ #endif
29
+
30
+ namespace datasketches {
31
+
32
+ TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") {
33
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument);
34
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1L << 60, 4), std::invalid_argument);
35
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument);
36
+ }
37
+
38
+ TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
39
+ uint64_t num_items = 4000;
40
+ double fpp = 0.01;
41
+
42
+ uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
43
+ uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
44
+ uint64_t seed = 89023;
45
+
46
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
47
+ uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64
48
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
49
+ REQUIRE(bf.get_num_hashes() == num_hashes);
50
+ REQUIRE(bf.get_seed() == seed);
51
+ REQUIRE(bf.is_empty());
52
+
53
+ // should match above
54
+ bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
55
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
56
+ REQUIRE(bf.get_num_hashes() == num_hashes);
57
+ REQUIRE(bf.get_seed() == seed);
58
+ REQUIRE(bf.is_empty());
59
+
60
+ // same for initializing memory in-place
61
+ size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
62
+ uint8_t* bytes = new uint8_t[serialized_size_bytes];
63
+
64
+ bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
65
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
66
+ REQUIRE(bf.get_num_hashes() == num_hashes);
67
+ REQUIRE(bf.get_seed() == seed);
68
+ REQUIRE(bf.is_empty());
69
+
70
+ bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
71
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
72
+ REQUIRE(bf.get_num_hashes() == num_hashes);
73
+ REQUIRE(bf.get_seed() == seed);
74
+ REQUIRE(bf.is_empty());
75
+
76
+ delete [] bytes;
77
+ }
78
+
79
+ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
80
+ uint64_t num_items = 5000;
81
+ double fpp = 0.01;
82
+ uint64_t seed = 4897301548054ULL;
83
+
84
+ auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
85
+ REQUIRE(bf.is_empty());
86
+ REQUIRE(bf.get_bits_used() == 0);
87
+
88
+ for (uint64_t i = 0; i < num_items; ++i) {
89
+ bf.query_and_update(i);
90
+ }
91
+
92
+ REQUIRE(!bf.is_empty());
93
+ // filter is about 50% full at target capacity
94
+ // since seed is fixed we expect an exact value every time
95
+ // but leaving the approximate test in since that's more the "expectation"
96
+ REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
97
+ REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
98
+
99
+ uint32_t num_found = 0;
100
+ for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
101
+ if (bf.query(i)) {
102
+ ++num_found;
103
+ }
104
+ }
105
+ // fpp is average with significant variance -- even at 12% it would fail occasionally
106
+ REQUIRE(num_found == 423);
107
+ //REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
108
+ auto bytes = bf.serialize();
109
+
110
+ // initialize in memory and run the same tests
111
+ // also checking against the results from the first part
112
+ uint8_t* bf_memory = new uint8_t[bytes.size()];
113
+ auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
114
+ REQUIRE(bf2.is_empty());
115
+ REQUIRE(bf2.get_bits_used() == 0);
116
+
117
+ for (uint64_t i = 0; i < num_items; ++i) {
118
+ bf2.query_and_update(i);
119
+ }
120
+
121
+ REQUIRE(!bf2.is_empty());
122
+ REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
123
+
124
+ uint32_t num_found2 = 0;
125
+ for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
126
+ if (bf2.query(i)) {
127
+ ++num_found2;
128
+ }
129
+ }
130
+ REQUIRE(num_found == num_found2); // should exactly match above
131
+ auto bytes2 = bf2.serialize();
132
+
133
+ REQUIRE(bytes.size() == bytes2.size());
134
+ for (size_t i = 0; i < bytes.size(); ++i) {
135
+ REQUIRE(bytes[i] == bytes2[i]);
136
+ }
137
+
138
+ // check that raw memory also matches serialized sketch
139
+ const uint8_t* bf_bytes = bf2.get_wrapped_memory();
140
+ REQUIRE(bf_bytes == bf_memory);
141
+ for (size_t i = 0; i < bytes.size(); ++i) {
142
+ REQUIRE(bf_bytes[i] == bytes[i]);
143
+ }
144
+
145
+ // ensure the filters reset properly
146
+ bf.reset();
147
+ REQUIRE(bf.is_empty());
148
+ REQUIRE(bf.get_bits_used() == 0);
149
+
150
+ bf2.reset();
151
+ REQUIRE(bf2.is_empty());
152
+ REQUIRE(bf2.get_bits_used() == 0);
153
+
154
+ delete [] bf_memory;
155
+ }
156
+
157
+ TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
158
+ uint64_t num_bits = 8192;
159
+ uint16_t num_hashes = 3;
160
+
161
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
162
+
163
+ uint64_t n = 500;
164
+ for (uint64_t i = 0; i < n; ++i) {
165
+ bf.update(i);
166
+ }
167
+ uint64_t num_bits_set = bf.get_bits_used();
168
+ bf.invert();
169
+ REQUIRE(bf.get_bits_used() == num_bits - num_bits_set);
170
+
171
+ // original items should be mostly not-present
172
+ uint32_t num_found = 0;
173
+ for (uint64_t i = 0; i < n; ++i) {
174
+ if (bf.query(i)) {
175
+ ++num_found;
176
+ }
177
+ }
178
+ REQUIRE(num_found < n / 10);
179
+
180
+ // many other items should be "present"
181
+ num_found = 0;
182
+ for (uint64_t i = n; i < num_bits; ++i) {
183
+ if (bf.query(i)) {
184
+ ++num_found;
185
+ }
186
+ }
187
+ REQUIRE(num_found > n);
188
+ }
189
+
190
+ TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") {
191
+ uint64_t num_bits = 32768;
192
+ uint16_t num_hashes = 4;
193
+
194
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
195
+
196
+ // mismatched num bits
197
+ auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes);
198
+ REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);
199
+
200
+ // mismatched num hashes
201
+ auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes);
202
+ REQUIRE_THROWS_AS(bf1.intersect(bf2), std::invalid_argument);
203
+
204
+ // mismatched seed
205
+ auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
206
+ REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
207
+ }
208
+
209
+ TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
210
+ const uint64_t num_bits = 12288;
211
+ const uint16_t num_hashes = 4;
212
+
213
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
214
+ auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
215
+
216
+ const uint64_t n = 1000;
217
+ const uint32_t max_item = 3 * n / 2 - 1;
218
+ for (uint64_t i = 0; i < n; ++i) {
219
+ bf1.query_and_update(i);
220
+ bf2.update(n / 2 + i);
221
+ }
222
+
223
+ bf1.union_with(bf2);
224
+ for (uint64_t i = 0; i < max_item; ++i) {
225
+ REQUIRE(bf1.query(i));
226
+ }
227
+
228
+ uint32_t num_found = 0;
229
+ for (uint64_t i = max_item; i < num_bits; ++i) {
230
+ if (bf1.query(i)) {
231
+ ++num_found;
232
+ }
233
+ }
234
+ REQUIRE(num_found < num_bits / 10); // not being super strict
235
+ }
236
+
237
+ TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") {
238
+ const uint64_t num_bits = 8192;
239
+ const uint16_t num_hahes = 5;
240
+
241
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hahes);
242
+ auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hahes, bf1.get_seed());
243
+
244
+ const uint64_t n = 1024;
245
+ const uint32_t max_item = 3 * n / 2 - 1;
246
+ for (uint64_t i = 0; i < n; ++i) {
247
+ bf1.update(i);
248
+ bf2.update(n / 2 + i);
249
+ }
250
+
251
+ bf1.intersect(bf2);
252
+ // overlap bits should all be set
253
+ for (uint64_t i = n / 2; i < n; ++i) {
254
+ REQUIRE(bf1.query(i));
255
+ }
256
+
257
+ uint32_t num_found = 0;
258
+ for (uint64_t i = 0; i < n / 2; ++i) {
259
+ if (bf1.query(i)) {
260
+ ++num_found;
261
+ }
262
+ }
263
+ for (uint64_t i = max_item; i < num_bits; ++i) {
264
+ if (bf1.query(i)) {
265
+ ++num_found;
266
+ }
267
+ }
268
+
269
+ REQUIRE(num_found < num_bits / 10); // not being super strict
270
+ }
271
+
272
+ TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
273
+ const uint64_t num_bits = 32769;
274
+ const uint16_t num_hashes = 7;
275
+
276
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
277
+ auto bytes = bf.serialize();
278
+ REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
279
+
280
+ auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
281
+ REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
282
+ REQUIRE(bf.get_seed() == bf_bytes.get_seed());
283
+ REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
284
+ REQUIRE(bf_bytes.is_empty());
285
+
286
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
287
+ bf.serialize(ss);
288
+ auto bf_stream = bloom_filter::deserialize(ss);
289
+ REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
290
+ REQUIRE(bf.get_seed() == bf_stream.get_seed());
291
+ REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
292
+ REQUIRE(bf_stream.is_empty());
293
+
294
+ // read-only wrap should work
295
+ auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
296
+ REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
297
+ REQUIRE(bf.get_seed() == bf_wrap.get_seed());
298
+ REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
299
+ REQUIRE(bf_wrap.is_empty());
300
+
301
+ // writable wrap should not
302
+ REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument);
303
+ }
304
+
305
+ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
306
+ const uint64_t num_bits = 32768;
307
+ const uint16_t num_hashes = 5;
308
+
309
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
310
+ const uint64_t n = 1000;
311
+ for (uint64_t i = 0; i < n; ++i) {
312
+ bf.update(0.5 + i); // testing floats
313
+ }
314
+
315
+ // test more items without updating, assuming some false positives
316
+ // so we can check that we get the same number of false positives
317
+ // with the same query items
318
+ uint64_t fp_count = 0;
319
+ for (uint64_t i = n; i < num_bits; ++i) {
320
+ fp_count += bf.query(0.5 + i) ? 1 : 0;
321
+ }
322
+
323
+ auto bytes = bf.serialize();
324
+ REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
325
+
326
+ auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
327
+ REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
328
+ REQUIRE(bf.get_seed() == bf_bytes.get_seed());
329
+ REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
330
+ REQUIRE(!bf_bytes.is_empty());
331
+ REQUIRE(bf.is_memory_owned());
332
+ uint64_t fp_count_bytes = 0;
333
+ for (uint64_t i = 0; i < num_bits; ++i) {
334
+ bool val = bf_bytes.query(0.5 + i);
335
+ if (i < n)
336
+ REQUIRE(val);
337
+ else if (val)
338
+ ++fp_count_bytes;
339
+ }
340
+ REQUIRE(fp_count_bytes == fp_count);
341
+
342
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
343
+ bf.serialize(ss);
344
+ auto bf_stream = bloom_filter::deserialize(ss);
345
+ REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
346
+ REQUIRE(bf.get_seed() == bf_stream.get_seed());
347
+ REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
348
+ REQUIRE(!bf_stream.is_empty());
349
+ REQUIRE(bf_stream.is_memory_owned());
350
+ uint64_t fp_count_stream = 0;
351
+ for (uint64_t i = 0; i < num_bits; ++i) {
352
+ bool val = bf_stream.query(0.5 + i);
353
+ if (i < n)
354
+ REQUIRE(val);
355
+ else if (val)
356
+ ++fp_count_stream;
357
+ }
358
+ REQUIRE(fp_count_stream == fp_count);
359
+
360
+ // read-only wrap
361
+ auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
362
+ REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
363
+ REQUIRE(bf.get_seed() == bf_wrap.get_seed());
364
+ REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
365
+ REQUIRE(!bf_wrap.is_empty());
366
+ REQUIRE(!bf_wrap.is_memory_owned());
367
+ uint64_t fp_count_wrap = 0;
368
+ for (uint64_t i = 0; i < num_bits; ++i) {
369
+ bool val = bf_wrap.query(0.5 + i);
370
+ if (i < n)
371
+ REQUIRE(val);
372
+ else if (val)
373
+ ++fp_count_wrap;
374
+ }
375
+ REQUIRE(fp_count_wrap == fp_count);
376
+ REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
377
+ REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
378
+ REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);
379
+
380
+ // writable wrap
381
+ auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
382
+ REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
383
+ REQUIRE(bf.get_seed() == bf_writable.get_seed());
384
+ REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
385
+ REQUIRE(!bf_writable.is_empty());
386
+ REQUIRE(!bf_writable.is_memory_owned());
387
+ uint64_t fp_count_writable = 0;
388
+ for (uint64_t i = 0; i < num_bits; ++i) {
389
+ bool val = bf_writable.query(0.5 + i);
390
+ if (i < n)
391
+ REQUIRE(val);
392
+ else if (val)
393
+ ++fp_count_writable;
394
+ }
395
+ REQUIRE(fp_count_writable == fp_count);
396
+
397
+ REQUIRE(!bf_writable.query(-1.0));
398
+ bf_writable.update(-1.0);
399
+ REQUIRE(bf_writable.query(-1.0));
400
+
401
+ // not good memory management to do this, but because we wrapped the same bytes as both
402
+ // read-only and writable, that update should have changed the read-only version, too
403
+ REQUIRE(bf_wrap.query(-1.0));
404
+ }
405
+
406
+ } // namespace datasketches
@@ -30,7 +30,6 @@ target_include_directories(hll
30
30
  )
31
31
 
32
32
  target_link_libraries(hll INTERFACE common)
33
- target_compile_features(hll INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS hll
36
35
  EXPORT ${PROJECT_NAME}
@@ -30,7 +30,6 @@ target_include_directories(kll
30
30
  )
31
31
 
32
32
  target_link_libraries(kll INTERFACE common)
33
- target_compile_features(kll INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS kll
36
35
  EXPORT ${PROJECT_NAME}
@@ -31,11 +31,14 @@ using alloc = test_allocator<test_type>;
31
31
 
32
32
  TEST_CASE("kll sketch custom type", "[kll_sketch]") {
33
33
 
34
- // setup section
35
34
  test_allocator_total_bytes = 0;
35
+ test_allocator_net_allocations = 0;
36
36
 
37
37
  SECTION("compact level zero") {
38
38
  kll_test_type_sketch sketch(8, test_type_less(), 0);
39
+ REQUIRE(test_allocator_total_bytes != 0);
40
+ REQUIRE(test_allocator_net_allocations != 0);
41
+
39
42
  REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);
40
43
  REQUIRE_THROWS_AS(sketch.get_min_item(), std::runtime_error);
41
44
  REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
@@ -146,10 +149,8 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
146
149
  REQUIRE(sketch2.get_n() == 11);
147
150
  }
148
151
 
149
- // cleanup
150
- if (test_allocator_total_bytes != 0) {
151
- REQUIRE(test_allocator_total_bytes == 0);
152
- }
152
+ REQUIRE(test_allocator_total_bytes == 0);
153
+ REQUIRE(test_allocator_net_allocations == 0);
153
154
  }
154
155
 
155
156
  } /* namespace datasketches */
@@ -30,7 +30,6 @@ target_include_directories(quantiles
30
30
  )
31
31
 
32
32
  target_link_libraries(quantiles INTERFACE common)
33
- target_compile_features(quantiles INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS quantiles
36
35
  EXPORT ${PROJECT_NAME}
@@ -30,7 +30,6 @@ target_include_directories(req
30
30
  )
31
31
 
32
32
  target_link_libraries(req INTERFACE common)
33
- target_compile_features(req INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS req
36
35
  EXPORT ${PROJECT_NAME}
@@ -30,7 +30,6 @@ target_include_directories(sampling
30
30
  )
31
31
 
32
32
  target_link_libraries(sampling INTERFACE common)
33
- target_compile_features(sampling INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS sampling
36
35
  EXPORT ${PROJECT_NAME}
@@ -37,14 +37,14 @@ class ebpps_sample {
37
37
  public:
38
38
  explicit ebpps_sample(uint32_t k, const A& allocator = A());
39
39
 
40
- // constructor used to create a sample to merge one itme
41
- template<typename TT>
42
- ebpps_sample(TT&& item, double theta, const A& allocator = A());
43
-
44
40
  // for deserialization
45
41
  class items_deleter;
46
42
  ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator = A());
47
43
 
44
+ // used instead of having a single-item constructor for update/merge calls
45
+ template<typename TT>
46
+ void replace_content(TT&& item, double theta);
47
+
48
48
  void reset();
49
49
  void downsample(double theta);
50
50
 
@@ -41,22 +41,6 @@ ebpps_sample<T,A>::ebpps_sample(uint32_t reserved_size, const A& allocator) :
41
41
  data_.reserve(reserved_size);
42
42
  }
43
43
 
44
- template<typename T, typename A>
45
- template<typename TT>
46
- ebpps_sample<T,A>::ebpps_sample(TT&& item, double theta, const A& allocator) :
47
- allocator_(allocator),
48
- c_(theta),
49
- partial_item_(),
50
- data_(allocator)
51
- {
52
- if (theta == 1.0) {
53
- data_.reserve(1);
54
- data_.emplace_back(std::forward<TT>(item));
55
- } else {
56
- partial_item_.emplace(std::forward<TT>(item));
57
- }
58
- }
59
-
60
44
  template<typename T, typename A>
61
45
  ebpps_sample<T,A>::ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator) :
62
46
  allocator_(allocator),
@@ -65,6 +49,19 @@ ebpps_sample<T,A>::ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_
65
49
  data_(data, allocator)
66
50
  {}
67
51
 
52
+ template<typename T, typename A>
53
+ template<typename TT>
54
+ void ebpps_sample<T,A>::replace_content(TT&& item, double theta) {
55
+ c_ = theta;
56
+ data_.clear();
57
+ partial_item_.reset();
58
+ if (theta == 1.0) {
59
+ data_.emplace_back(std::forward<TT>(item));
60
+ } else {
61
+ partial_item_.emplace(std::forward<TT>(item));
62
+ }
63
+ }
64
+
68
65
  template<typename T, typename A>
69
66
  auto ebpps_sample<T,A>::get_sample() const -> result_type {
70
67
  double unused;
@@ -43,7 +43,7 @@ namespace ebpps_constants {
43
43
  * From: "Exact PPS Sampling with Bounded Sample Size",
44
44
  * B. Hentschel, P. J. Haas, Y. Tian. Information Processing Letters, 2023.
45
45
  *
46
- * This sketch samples data from a stream of items propotional to the weight of each item.
46
+ * This sketch samples data from a stream of items proportional to the weight of each item.
47
47
  * The sample guarantees the presence of an item in the result is proportional to that item's
48
48
  * portion of the total weight seen by the sketch, and returns a sample no larger than size k.
49
49
  *
@@ -256,6 +256,8 @@ class ebpps_sketch {
256
256
 
257
257
  ebpps_sample<T,A> sample_; // Object holding the current state of the sample
258
258
 
259
+ ebpps_sample<T,A> tmp_; // Temporary sample of size 1 used in updates
260
+
259
261
  // handles merge after ensuring other.cumulative_wt_ <= this->cumulative_wt_
260
262
  // so we can send items in individually
261
263
  template<typename O>
@@ -40,7 +40,8 @@ ebpps_sketch<T, A>::ebpps_sketch(uint32_t k, const A& allocator) :
40
40
  cumulative_wt_(0.0),
41
41
  wt_max_(0.0),
42
42
  rho_(1.0),
43
- sample_(check_k(k), allocator)
43
+ sample_(check_k(k), allocator),
44
+ tmp_(1, allocator)
44
45
  {}
45
46
 
46
47
  template<typename T, typename A>
@@ -53,7 +54,8 @@ ebpps_sketch<T,A>::ebpps_sketch(uint32_t k, uint64_t n, double cumulative_wt,
53
54
  cumulative_wt_(cumulative_wt),
54
55
  wt_max_(wt_max),
55
56
  rho_(rho),
56
- sample_(sample)
57
+ sample_(sample),
58
+ tmp_(1, allocator)
57
59
  {}
58
60
 
59
61
  template<typename T, typename A>
@@ -148,9 +150,8 @@ void ebpps_sketch<T, A>::internal_update(FwdItem&& item, double weight) {
148
150
  if (cumulative_wt_ > 0.0)
149
151
  sample_.downsample(new_rho / rho_);
150
152
 
151
- ebpps_sample<T,A> tmp(conditional_forward<FwdItem>(item), new_rho * weight, allocator_);
152
-
153
- sample_.merge(tmp);
153
+ tmp_.replace_content(conditional_forward<FwdItem>(item), new_rho * weight);
154
+ sample_.merge(tmp_);
154
155
 
155
156
  cumulative_wt_ = new_cum_wt;
156
157
  wt_max_ = new_wt_max;
@@ -240,9 +241,8 @@ void ebpps_sketch<T, A>::internal_merge(O&& sk) {
240
241
  if (cumulative_wt_ > 0.0)
241
242
  sample_.downsample(new_rho / rho_);
242
243
 
243
- ebpps_sample<T,A> tmp(conditional_forward<O>(items[i]), new_rho * avg_wt, allocator_);
244
-
245
- sample_.merge(tmp);
244
+ tmp_.replace_content(conditional_forward<O>(items[i]), new_rho * avg_wt);
245
+ sample_.merge(tmp_);
246
246
 
247
247
  cumulative_wt_ = new_cum_wt;
248
248
  rho_ = new_rho;
@@ -259,9 +259,8 @@ void ebpps_sketch<T, A>::internal_merge(O&& sk) {
259
259
  if (cumulative_wt_ > 0.0)
260
260
  sample_.downsample(new_rho / rho_);
261
261
 
262
- ebpps_sample<T,A> tmp(conditional_forward<O>(other_sample.get_partial_item()), new_rho * other_c_frac * avg_wt, allocator_);
263
-
264
- sample_.merge(tmp);
262
+ tmp_.replace_content(conditional_forward<O>(other_sample.get_partial_item()), new_rho * other_c_frac * avg_wt);
263
+ sample_.merge(tmp_);
265
264
 
266
265
  cumulative_wt_ = new_cum_wt;
267
266
  rho_ = new_rho;
@@ -42,14 +42,15 @@ TEST_CASE("ebpps sample: basic initialization", "[ebpps_sketch]") {
42
42
 
43
43
  TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
44
44
  double theta = 1.0;
45
- ebpps_sample<int> sample = ebpps_sample<int>(-1, theta);
45
+ ebpps_sample<int> sample(1);
46
+ sample.replace_content(-1, theta);
46
47
  REQUIRE(sample.get_c() == theta);
47
48
  REQUIRE(sample.get_num_retained_items() == 1);
48
49
  REQUIRE(sample.get_sample().size() == 1);
49
50
  REQUIRE(sample.has_partial_item() == false);
50
51
 
51
52
  theta = 1e-300;
52
- sample = ebpps_sample<int>(-1, theta);
53
+ sample.replace_content(-1, theta);
53
54
  REQUIRE(sample.get_c() == theta);
54
55
  REQUIRE(sample.get_num_retained_items() == 1);
55
56
  REQUIRE(sample.get_sample().size() == 0); // assuming the random number is > 1e-300
@@ -57,7 +58,8 @@ TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
57
58
  }
58
59
 
59
60
  TEST_CASE("ebpps sample: downsampling", "[ebpps_sketch]") {
60
- ebpps_sample<char> sample = ebpps_sample<char>('a', 1.0);
61
+ ebpps_sample<char> sample(1);
62
+ sample.replace_content('a', 1.0);
61
63
 
62
64
  sample.downsample(2.0); // no-op
63
65
  REQUIRE(sample.get_c() == 1.0);
@@ -121,8 +123,9 @@ TEST_CASE("ebpps sample: merge unit samples", "[ebpps_sketch]") {
121
123
  uint32_t k = 8;
122
124
  ebpps_sample<int> sample = ebpps_sample<int>(k);
123
125
 
126
+ ebpps_sample<int> s(1);
124
127
  for (uint32_t i = 1; i <= k; ++i) {
125
- ebpps_sample<int> s = ebpps_sample<int>(i, 1.0);
128
+ s.replace_content(i, 1.0);
126
129
  sample.merge(s);
127
130
  REQUIRE(sample.get_c() == static_cast<double>(i));
128
131
  REQUIRE(sample.get_num_retained_items() == i);