datasketches 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/NOTICE +1 -1
  4. data/README.md +1 -1
  5. data/ext/datasketches/vo_wrapper.cpp +1 -1
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +2 -0
  8. data/vendor/datasketches-cpp/LICENSE +35 -7
  9. data/vendor/datasketches-cpp/NOTICE +3 -3
  10. data/vendor/datasketches-cpp/README.md +2 -3
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -3
  12. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +5 -6
  13. data/vendor/datasketches-cpp/common/include/common_defs.hpp +18 -0
  14. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  15. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  16. data/vendor/datasketches-cpp/count/CMakeLists.txt +0 -1
  17. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +0 -1
  18. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -1
  19. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +10 -0
  20. data/vendor/datasketches-cpp/density/CMakeLists.txt +0 -1
  21. data/vendor/datasketches-cpp/fi/CMakeLists.txt +0 -1
  22. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  23. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  24. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  25. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  26. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  27. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  28. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  29. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  30. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  31. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  32. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +0 -1
  34. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -1
  35. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +6 -5
  36. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +0 -1
  37. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -1
  38. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +0 -1
  39. data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +4 -4
  40. data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +13 -16
  41. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +3 -1
  42. data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +10 -11
  43. data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +7 -4
  44. data/vendor/datasketches-cpp/tdigest/CMakeLists.txt +41 -0
  45. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +304 -0
  46. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +632 -0
  47. data/vendor/datasketches-cpp/tdigest/test/CMakeLists.txt +56 -0
  48. data/vendor/datasketches-cpp/tdigest/test/tdigest_custom_allocator_test.cpp +43 -0
  49. data/vendor/datasketches-cpp/tdigest/test/tdigest_deserialize_from_java_test.cpp +54 -0
  50. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_double.sk +0 -0
  51. data/vendor/datasketches-cpp/tdigest/test/tdigest_ref_k100_n10000_float.sk +0 -0
  52. data/vendor/datasketches-cpp/tdigest/test/tdigest_serialize_for_java.cpp +67 -0
  53. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +456 -0
  54. data/vendor/datasketches-cpp/theta/CMakeLists.txt +0 -1
  55. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  56. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
  57. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +18 -1
  58. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +45 -21
  59. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -38
  60. data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +17 -0
  61. data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +1 -1
  62. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +73 -2
  63. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +0 -1
  64. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +28 -1
  65. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -0
  66. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +61 -0
  67. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  68. metadata +27 -9
@@ -0,0 +1,406 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+
22
+ #include "bloom_filter.hpp"
23
+
24
+ #ifdef TEST_BINARY_INPUT_PATH
25
+ static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
26
+ #else
27
+ static std::string testBinaryInputPath = "test/";
28
+ #endif
29
+
30
+ namespace datasketches {
31
+
32
+ TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") { // each call below must be rejected with std::invalid_argument
33
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument); // zero bits requested
34
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1ULL << 60, 4), std::invalid_argument); // 2^60 bits; ULL keeps the shift well-defined where long is only 32 bits wide
35
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument); // zero hash functions requested
36
+ }
37
+
38
+ TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") { // all four builder entry points must yield an empty filter with the same capacity, hash count and seed
39
+ uint64_t num_items = 4000;
40
+ double fpp = 0.01;
41
+
42
+ uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
43
+ uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
44
+ uint64_t seed = 89023;
45
+
46
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
47
+ uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // expected capacity: num_bits rounded up to the nearest multiple of 64
48
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
49
+ REQUIRE(bf.get_num_hashes() == num_hashes);
50
+ REQUIRE(bf.get_seed() == seed);
51
+ REQUIRE(bf.is_empty());
52
+
53
+ // building from (num_items, fpp) should give the same configuration as the explicit sizes above
54
+ bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
55
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
56
+ REQUIRE(bf.get_num_hashes() == num_hashes);
57
+ REQUIRE(bf.get_seed() == seed);
58
+ REQUIRE(bf.is_empty());
59
+
60
+ // same checks when the filter is initialized in a buffer supplied by the test
61
+ size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
62
+ uint8_t* bytes = new uint8_t[serialized_size_bytes];
63
+
64
+ bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
65
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
66
+ REQUIRE(bf.get_num_hashes() == num_hashes);
67
+ REQUIRE(bf.get_seed() == seed);
68
+ REQUIRE(bf.is_empty());
69
+
70
+ bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
71
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
72
+ REQUIRE(bf.get_num_hashes() == num_hashes);
73
+ REQUIRE(bf.get_seed() == seed);
74
+ REQUIRE(bf.is_empty());
75
+
76
+ delete [] bytes; // the buffer was allocated by this test with new[], so it is released here
77
+ }
78
+
79
+ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") { // fills a filter, counts false positives, then repeats with a filter initialized in test-owned memory and compares the two
80
+ uint64_t num_items = 5000;
81
+ double fpp = 0.01;
82
+ uint64_t seed = 4897301548054ULL;
83
+
84
+ auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
85
+ REQUIRE(bf.is_empty());
86
+ REQUIRE(bf.get_bits_used() == 0);
87
+
88
+ for (uint64_t i = 0; i < num_items; ++i) {
89
+ bf.query_and_update(i);
90
+ }
91
+
92
+ REQUIRE(!bf.is_empty());
93
+ // the filter is about 50% full at target capacity;
94
+ // since the seed is fixed we expect an exact value every time,
95
+ // but the approximate check is kept because it expresses the real "expectation"
96
+ REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
97
+ REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
98
+
99
+ uint32_t num_found = 0;
100
+ for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
101
+ if (bf.query(i)) {
102
+ ++num_found;
103
+ }
104
+ }
105
+ // fpp is an average with significant variance -- even at 12% the approximate check would fail occasionally
106
+ REQUIRE(num_found == 423);
107
+ //REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
108
+ auto bytes = bf.serialize();
109
+
110
+ // initialize a second filter in test-owned memory and run the same tests,
111
+ // also checking against the results from the first part
112
+ uint8_t* bf_memory = new uint8_t[bytes.size()];
113
+ auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
114
+ REQUIRE(bf2.is_empty());
115
+ REQUIRE(bf2.get_bits_used() == 0);
116
+
117
+ for (uint64_t i = 0; i < num_items; ++i) {
118
+ bf2.query_and_update(i);
119
+ }
120
+
121
+ REQUIRE(!bf2.is_empty());
122
+ REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
123
+
124
+ uint32_t num_found2 = 0;
125
+ for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
126
+ if (bf2.query(i)) {
127
+ ++num_found2;
128
+ }
129
+ }
130
+ REQUIRE(num_found == num_found2); // should exactly match above
131
+ auto bytes2 = bf2.serialize();
132
+
133
+ REQUIRE(bytes.size() == bytes2.size());
134
+ for (size_t i = 0; i < bytes.size(); ++i) {
135
+ REQUIRE(bytes[i] == bytes2[i]);
136
+ }
137
+
138
+ // the wrapped raw memory must be the test's buffer and match the serialized filter byte for byte
139
+ const uint8_t* bf_bytes = bf2.get_wrapped_memory();
140
+ REQUIRE(bf_bytes == bf_memory);
141
+ for (size_t i = 0; i < bytes.size(); ++i) {
142
+ REQUIRE(bf_bytes[i] == bytes[i]);
143
+ }
144
+
145
+ // ensure both filters reset properly
146
+ bf.reset();
147
+ REQUIRE(bf.is_empty());
148
+ REQUIRE(bf.get_bits_used() == 0);
149
+
150
+ bf2.reset();
151
+ REQUIRE(bf2.is_empty());
152
+ REQUIRE(bf2.get_bits_used() == 0);
153
+
154
+ delete [] bf_memory; // the buffer was allocated by this test with new[], so it is released here
155
+ }
156
+
157
+ TEST_CASE("bloom_filter: inversion", "[bloom_filter]") { // after invert() the used-bit count must be the complement, and membership answers should mostly flip
158
+ uint64_t num_bits = 8192;
159
+ uint16_t num_hashes = 3;
160
+
161
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
162
+
163
+ uint64_t n = 500;
164
+ for (uint64_t i = 0; i < n; ++i) {
165
+ bf.update(i);
166
+ }
167
+ uint64_t num_bits_set = bf.get_bits_used(); // bit count before inverting, used to derive the expected count afterwards
168
+ bf.invert();
169
+ REQUIRE(bf.get_bits_used() == num_bits - num_bits_set);
170
+
171
+ // the originally inserted items should now be mostly reported as not present
172
+ uint32_t num_found = 0;
173
+ for (uint64_t i = 0; i < n; ++i) {
174
+ if (bf.query(i)) {
175
+ ++num_found;
176
+ }
177
+ }
178
+ REQUIRE(num_found < n / 10);
179
+
180
+ // many items that were never inserted should now be reported as "present"
181
+ num_found = 0;
182
+ for (uint64_t i = n; i < num_bits; ++i) {
183
+ if (bf.query(i)) {
184
+ ++num_found;
185
+ }
186
+ }
187
+ REQUIRE(num_found > n);
188
+ }
189
+
190
+ TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") { // set operations must throw std::invalid_argument when the other filter differs in size, hash count or seed
190
+ uint64_t num_bits = 32768;
191
+ uint16_t num_hashes = 4;
192
+
193
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
194
+
195
+ // mismatched num bits
196
+ auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes);
197
+ REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);
198
+
199
+ // mismatched num hashes
200
+ auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes);
201
+ REQUIRE_THROWS_AS(bf1.intersect(bf3), std::invalid_argument); // must use bf3: bf2 would only repeat the size mismatch above
202
+
203
+ // mismatched seed
204
+ auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
205
+ REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
206
+ }
208
+
209
+ TEST_CASE("bloom_filter: basic union", "[bloom_filter]") { // after union_with() every item inserted into either filter must be reported as present
209
+ const uint64_t num_bits = 12288;
210
+ const uint16_t num_hashes = 4;
211
+
212
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
213
+ auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
214
+
215
+ const uint64_t n = 1000;
216
+ const uint32_t max_item = 3 * n / 2 - 1; // largest item inserted below (n / 2 + n - 1, via bf2)
217
+ for (uint64_t i = 0; i < n; ++i) {
218
+ bf1.query_and_update(i);
219
+ bf2.update(n / 2 + i);
220
+ }
221
+
222
+ bf1.union_with(bf2);
223
+ for (uint64_t i = 0; i <= max_item; ++i) { // inclusive bound: max_item itself was inserted
224
+ REQUIRE(bf1.query(i));
225
+ }
226
+
227
+ uint32_t num_found = 0;
228
+ for (uint64_t i = max_item + 1; i < num_bits; ++i) { // only items that were never inserted count as false positives
229
+ if (bf1.query(i)) {
230
+ ++num_found;
231
+ }
232
+ }
233
+ REQUIRE(num_found < num_bits / 10); // not being super strict
234
+ }
236
+
237
+ TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") { // after intersect() items inserted into both filters must remain present; other items may only appear as false positives
237
+ const uint64_t num_bits = 8192;
238
+ const uint16_t num_hashes = 5;
239
+
240
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
241
+ auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
242
+
243
+ const uint64_t n = 1024;
244
+ const uint32_t max_item = 3 * n / 2 - 1; // largest item inserted below (n / 2 + n - 1, via bf2)
245
+ for (uint64_t i = 0; i < n; ++i) {
246
+ bf1.update(i);
247
+ bf2.update(n / 2 + i);
248
+ }
249
+
250
+ bf1.intersect(bf2);
251
+ // items in the overlap [n / 2, n) were inserted into both filters and must all be present
252
+ for (uint64_t i = n / 2; i < n; ++i) {
253
+ REQUIRE(bf1.query(i));
254
+ }
255
+
256
+ uint32_t num_found = 0;
257
+ for (uint64_t i = 0; i < n / 2; ++i) { // items inserted into bf1 only
258
+ if (bf1.query(i)) {
259
+ ++num_found;
260
+ }
261
+ }
262
+ for (uint64_t i = max_item; i < num_bits; ++i) { // max_item (bf2 only) and items never inserted into either filter
263
+ if (bf1.query(i)) {
264
+ ++num_found;
265
+ }
266
+ }
267
+
268
+ REQUIRE(num_found < num_bits / 10); // not being super strict
269
+ }
271
+
272
+ TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") { // an empty filter must round-trip through bytes, a stream and a read-only wrap; a writable wrap is expected to throw
272
+ const uint64_t num_bits = 32769;
273
+ const uint16_t num_hashes = 7;
274
+
275
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
276
+ auto bytes = bf.serialize();
277
+ REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
278
+
279
+ auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size()); // round trip through a byte buffer
280
+ REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
281
+ REQUIRE(bf.get_seed() == bf_bytes.get_seed());
282
+ REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
283
+ REQUIRE(bf_bytes.is_empty());
284
+
285
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary); // round trip through a binary stream
286
+ bf.serialize(ss);
287
+ auto bf_stream = bloom_filter::deserialize(ss);
288
+ REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
289
+ REQUIRE(bf.get_seed() == bf_stream.get_seed());
290
+ REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
291
+ REQUIRE(bf_stream.is_empty());
292
+
293
+ // a read-only wrap of the serialized empty filter should work
294
+ auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
295
+ REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
296
+ REQUIRE(bf.get_seed() == bf_wrap.get_seed());
297
+ REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
298
+ REQUIRE(bf_wrap.is_empty());
299
+
300
+ // a writable wrap of the same bytes should be rejected
301
+ REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument);
302
+ }
304
+
305
+ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { // a populated filter must answer queries identically after a bytes/stream round trip and through read-only and writable wraps
305
+ const uint64_t num_bits = 32768;
306
+ const uint16_t num_hashes = 5;
307
+
308
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
309
+ const uint64_t n = 1000;
310
+ for (uint64_t i = 0; i < n; ++i) {
311
+ bf.update(0.5 + i); // testing floats
312
+ }
313
+
314
+ // test more items without updating, assuming some false positives
315
+ // so we can check that we get the same number of false positives
316
+ // with the same query items
317
+ uint64_t fp_count = 0;
318
+ for (uint64_t i = n; i < num_bits; ++i) {
319
+ fp_count += bf.query(0.5 + i) ? 1 : 0;
320
+ }
321
+
322
+ auto bytes = bf.serialize();
323
+ REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
324
+
325
+ auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
326
+ REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
327
+ REQUIRE(bf.get_seed() == bf_bytes.get_seed());
328
+ REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
329
+ REQUIRE(!bf_bytes.is_empty());
330
+ REQUIRE(bf_bytes.is_memory_owned()); // check the deserialized filter, mirroring the bf_stream check below
331
+ uint64_t fp_count_bytes = 0;
332
+ for (uint64_t i = 0; i < num_bits; ++i) {
333
+ bool val = bf_bytes.query(0.5 + i);
334
+ if (i < n)
335
+ REQUIRE(val);
336
+ else if (val)
337
+ ++fp_count_bytes;
338
+ }
339
+ REQUIRE(fp_count_bytes == fp_count);
340
+
341
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
342
+ bf.serialize(ss);
343
+ auto bf_stream = bloom_filter::deserialize(ss);
344
+ REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
345
+ REQUIRE(bf.get_seed() == bf_stream.get_seed());
346
+ REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
347
+ REQUIRE(!bf_stream.is_empty());
348
+ REQUIRE(bf_stream.is_memory_owned());
349
+ uint64_t fp_count_stream = 0;
350
+ for (uint64_t i = 0; i < num_bits; ++i) {
351
+ bool val = bf_stream.query(0.5 + i);
352
+ if (i < n)
353
+ REQUIRE(val);
354
+ else if (val)
355
+ ++fp_count_stream;
356
+ }
357
+ REQUIRE(fp_count_stream == fp_count);
358
+
359
+ // read-only wrap
360
+ auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
361
+ REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
362
+ REQUIRE(bf.get_seed() == bf_wrap.get_seed());
363
+ REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
364
+ REQUIRE(!bf_wrap.is_empty());
365
+ REQUIRE(!bf_wrap.is_memory_owned());
366
+ uint64_t fp_count_wrap = 0;
367
+ for (uint64_t i = 0; i < num_bits; ++i) {
368
+ bool val = bf_wrap.query(0.5 + i);
369
+ if (i < n)
370
+ REQUIRE(val);
371
+ else if (val)
372
+ ++fp_count_wrap;
373
+ }
374
+ REQUIRE(fp_count_wrap == fp_count);
375
+ REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
376
+ REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
377
+ REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);
378
+
379
+ // writable wrap
380
+ auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
381
+ REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
382
+ REQUIRE(bf.get_seed() == bf_writable.get_seed());
383
+ REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
384
+ REQUIRE(!bf_writable.is_empty());
385
+ REQUIRE(!bf_writable.is_memory_owned());
386
+ uint64_t fp_count_writable = 0;
387
+ for (uint64_t i = 0; i < num_bits; ++i) {
388
+ bool val = bf_writable.query(0.5 + i);
389
+ if (i < n)
390
+ REQUIRE(val);
391
+ else if (val)
392
+ ++fp_count_writable;
393
+ }
394
+ REQUIRE(fp_count_writable == fp_count);
395
+
396
+ REQUIRE(!bf_writable.query(-1.0));
397
+ bf_writable.update(-1.0);
398
+ REQUIRE(bf_writable.query(-1.0));
399
+
400
+ // not good memory management to do this, but because we wrapped the same bytes as both
401
+ // read-only and writable, that update should have changed the read-only version, too
402
+ REQUIRE(bf_wrap.query(-1.0));
403
+ }
405
+
406
+ } // namespace datasketches
@@ -30,7 +30,6 @@ target_include_directories(hll
30
30
  )
31
31
 
32
32
  target_link_libraries(hll INTERFACE common)
33
- target_compile_features(hll INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS hll
36
35
  EXPORT ${PROJECT_NAME}
@@ -30,7 +30,6 @@ target_include_directories(kll
30
30
  )
31
31
 
32
32
  target_link_libraries(kll INTERFACE common)
33
- target_compile_features(kll INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS kll
36
35
  EXPORT ${PROJECT_NAME}
@@ -31,11 +31,14 @@ using alloc = test_allocator<test_type>;
31
31
 
32
32
  TEST_CASE("kll sketch custom type", "[kll_sketch]") {
33
33
 
34
- // setup section
35
34
  test_allocator_total_bytes = 0;
35
+ test_allocator_net_allocations = 0;
36
36
 
37
37
  SECTION("compact level zero") {
38
38
  kll_test_type_sketch sketch(8, test_type_less(), 0);
39
+ REQUIRE(test_allocator_total_bytes != 0);
40
+ REQUIRE(test_allocator_net_allocations != 0);
41
+
39
42
  REQUIRE_THROWS_AS(sketch.get_quantile(0), std::runtime_error);
40
43
  REQUIRE_THROWS_AS(sketch.get_min_item(), std::runtime_error);
41
44
  REQUIRE_THROWS_AS(sketch.get_max_item(), std::runtime_error);
@@ -146,10 +149,8 @@ TEST_CASE("kll sketch custom type", "[kll_sketch]") {
146
149
  REQUIRE(sketch2.get_n() == 11);
147
150
  }
148
151
 
149
- // cleanup
150
- if (test_allocator_total_bytes != 0) {
151
- REQUIRE(test_allocator_total_bytes == 0);
152
- }
152
+ REQUIRE(test_allocator_total_bytes == 0);
153
+ REQUIRE(test_allocator_net_allocations == 0);
153
154
  }
154
155
 
155
156
  } /* namespace datasketches */
@@ -30,7 +30,6 @@ target_include_directories(quantiles
30
30
  )
31
31
 
32
32
  target_link_libraries(quantiles INTERFACE common)
33
- target_compile_features(quantiles INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS quantiles
36
35
  EXPORT ${PROJECT_NAME}
@@ -30,7 +30,6 @@ target_include_directories(req
30
30
  )
31
31
 
32
32
  target_link_libraries(req INTERFACE common)
33
- target_compile_features(req INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS req
36
35
  EXPORT ${PROJECT_NAME}
@@ -30,7 +30,6 @@ target_include_directories(sampling
30
30
  )
31
31
 
32
32
  target_link_libraries(sampling INTERFACE common)
33
- target_compile_features(sampling INTERFACE cxx_std_11)
34
33
 
35
34
  install(TARGETS sampling
36
35
  EXPORT ${PROJECT_NAME}
@@ -37,14 +37,14 @@ class ebpps_sample {
37
37
  public:
38
38
  explicit ebpps_sample(uint32_t k, const A& allocator = A());
39
39
 
40
- // constructor used to create a sample to merge one itme
41
- template<typename TT>
42
- ebpps_sample(TT&& item, double theta, const A& allocator = A());
43
-
44
40
  // for deserialization
45
41
  class items_deleter;
46
42
  ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator = A());
47
43
 
44
+ // used instead of having a single-item constructor for update/merge calls
45
+ template<typename TT>
46
+ void replace_content(TT&& item, double theta);
47
+
48
48
  void reset();
49
49
  void downsample(double theta);
50
50
 
@@ -41,22 +41,6 @@ ebpps_sample<T,A>::ebpps_sample(uint32_t reserved_size, const A& allocator) :
41
41
  data_.reserve(reserved_size);
42
42
  }
43
43
 
44
- template<typename T, typename A>
45
- template<typename TT>
46
- ebpps_sample<T,A>::ebpps_sample(TT&& item, double theta, const A& allocator) :
47
- allocator_(allocator),
48
- c_(theta),
49
- partial_item_(),
50
- data_(allocator)
51
- {
52
- if (theta == 1.0) {
53
- data_.reserve(1);
54
- data_.emplace_back(std::forward<TT>(item));
55
- } else {
56
- partial_item_.emplace(std::forward<TT>(item));
57
- }
58
- }
59
-
60
44
  template<typename T, typename A>
61
45
  ebpps_sample<T,A>::ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_item, double c, const A& allocator) :
62
46
  allocator_(allocator),
@@ -65,6 +49,19 @@ ebpps_sample<T,A>::ebpps_sample(std::vector<T, A>&& data, optional<T>&& partial_
65
49
  data_(data, allocator)
66
50
  {}
67
51
 
52
+ template<typename T, typename A>
53
+ template<typename TT>
54
+ void ebpps_sample<T,A>::replace_content(TT&& item, double theta) { // discards the current content and stores item as the only entry, with c_ set to theta
55
+ c_ = theta;
56
+ data_.clear(); // drop any previously held items
57
+ partial_item_.reset(); // drop any previously held partial item
58
+ if (theta == 1.0) { // theta of exactly 1.0: item is kept as a regular entry in data_
59
+ data_.emplace_back(std::forward<TT>(item));
60
+ } else { // any other theta: item is kept as the partial item
61
+ partial_item_.emplace(std::forward<TT>(item));
62
+ }
63
+ }
64
+
68
65
  template<typename T, typename A>
69
66
  auto ebpps_sample<T,A>::get_sample() const -> result_type {
70
67
  double unused;
@@ -43,7 +43,7 @@ namespace ebpps_constants {
43
43
  * From: "Exact PPS Sampling with Bounded Sample Size",
44
44
  * B. Hentschel, P. J. Haas, Y. Tian. Information Processing Letters, 2023.
45
45
  *
46
- * This sketch samples data from a stream of items propotional to the weight of each item.
46
+ * This sketch samples data from a stream of items proportional to the weight of each item.
47
47
  * The sample guarantees the presence of an item in the result is proportional to that item's
48
48
  * portion of the total weight seen by the sketch, and returns a sample no larger than size k.
49
49
  *
@@ -256,6 +256,8 @@ class ebpps_sketch {
256
256
 
257
257
  ebpps_sample<T,A> sample_; // Object holding the current state of the sample
258
258
 
259
+ ebpps_sample<T,A> tmp_; // Temporary sample of size 1 used in updates
260
+
259
261
  // handles merge after ensuring other.cumulative_wt_ <= this->cumulative_wt_
260
262
  // so we can send items in individually
261
263
  template<typename O>
@@ -40,7 +40,8 @@ ebpps_sketch<T, A>::ebpps_sketch(uint32_t k, const A& allocator) :
40
40
  cumulative_wt_(0.0),
41
41
  wt_max_(0.0),
42
42
  rho_(1.0),
43
- sample_(check_k(k), allocator)
43
+ sample_(check_k(k), allocator),
44
+ tmp_(1, allocator)
44
45
  {}
45
46
 
46
47
  template<typename T, typename A>
@@ -53,7 +54,8 @@ ebpps_sketch<T,A>::ebpps_sketch(uint32_t k, uint64_t n, double cumulative_wt,
53
54
  cumulative_wt_(cumulative_wt),
54
55
  wt_max_(wt_max),
55
56
  rho_(rho),
56
- sample_(sample)
57
+ sample_(sample),
58
+ tmp_(1, allocator)
57
59
  {}
58
60
 
59
61
  template<typename T, typename A>
@@ -148,9 +150,8 @@ void ebpps_sketch<T, A>::internal_update(FwdItem&& item, double weight) {
148
150
  if (cumulative_wt_ > 0.0)
149
151
  sample_.downsample(new_rho / rho_);
150
152
 
151
- ebpps_sample<T,A> tmp(conditional_forward<FwdItem>(item), new_rho * weight, allocator_);
152
-
153
- sample_.merge(tmp);
153
+ tmp_.replace_content(conditional_forward<FwdItem>(item), new_rho * weight);
154
+ sample_.merge(tmp_);
154
155
 
155
156
  cumulative_wt_ = new_cum_wt;
156
157
  wt_max_ = new_wt_max;
@@ -240,9 +241,8 @@ void ebpps_sketch<T, A>::internal_merge(O&& sk) {
240
241
  if (cumulative_wt_ > 0.0)
241
242
  sample_.downsample(new_rho / rho_);
242
243
 
243
- ebpps_sample<T,A> tmp(conditional_forward<O>(items[i]), new_rho * avg_wt, allocator_);
244
-
245
- sample_.merge(tmp);
244
+ tmp_.replace_content(conditional_forward<O>(items[i]), new_rho * avg_wt);
245
+ sample_.merge(tmp_);
246
246
 
247
247
  cumulative_wt_ = new_cum_wt;
248
248
  rho_ = new_rho;
@@ -259,9 +259,8 @@ void ebpps_sketch<T, A>::internal_merge(O&& sk) {
259
259
  if (cumulative_wt_ > 0.0)
260
260
  sample_.downsample(new_rho / rho_);
261
261
 
262
- ebpps_sample<T,A> tmp(conditional_forward<O>(other_sample.get_partial_item()), new_rho * other_c_frac * avg_wt, allocator_);
263
-
264
- sample_.merge(tmp);
262
+ tmp_.replace_content(conditional_forward<O>(other_sample.get_partial_item()), new_rho * other_c_frac * avg_wt);
263
+ sample_.merge(tmp_);
265
264
 
266
265
  cumulative_wt_ = new_cum_wt;
267
266
  rho_ = new_rho;
@@ -42,14 +42,15 @@ TEST_CASE("ebpps sample: basic initialization", "[ebpps_sketch]") {
42
42
 
43
43
  TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
44
44
  double theta = 1.0;
45
- ebpps_sample<int> sample = ebpps_sample<int>(-1, theta);
45
+ ebpps_sample<int> sample(1);
46
+ sample.replace_content(-1, theta);
46
47
  REQUIRE(sample.get_c() == theta);
47
48
  REQUIRE(sample.get_num_retained_items() == 1);
48
49
  REQUIRE(sample.get_sample().size() == 1);
49
50
  REQUIRE(sample.has_partial_item() == false);
50
51
 
51
52
  theta = 1e-300;
52
- sample = ebpps_sample<int>(-1, theta);
53
+ sample.replace_content(-1, theta);
53
54
  REQUIRE(sample.get_c() == theta);
54
55
  REQUIRE(sample.get_num_retained_items() == 1);
55
56
  REQUIRE(sample.get_sample().size() == 0); // assuming the random number is > 1e-300
@@ -57,7 +58,8 @@ TEST_CASE("ebpps sample: pre-initialized", "[ebpps_sketch]") {
57
58
  }
58
59
 
59
60
  TEST_CASE("ebpps sample: downsampling", "[ebpps_sketch]") {
60
- ebpps_sample<char> sample = ebpps_sample<char>('a', 1.0);
61
+ ebpps_sample<char> sample(1);
62
+ sample.replace_content('a', 1.0);
61
63
 
62
64
  sample.downsample(2.0); // no-op
63
65
  REQUIRE(sample.get_c() == 1.0);
@@ -121,8 +123,9 @@ TEST_CASE("ebpps sample: merge unit samples", "[ebpps_sketch]") {
121
123
  uint32_t k = 8;
122
124
  ebpps_sample<int> sample = ebpps_sample<int>(k);
123
125
 
126
+ ebpps_sample<int> s(1);
124
127
  for (uint32_t i = 1; i <= k; ++i) {
125
- ebpps_sample<int> s = ebpps_sample<int>(i, 1.0);
128
+ s.replace_content(i, 1.0);
126
129
  sample.merge(s);
127
130
  REQUIRE(sample.get_c() == static_cast<double>(i));
128
131
  REQUIRE(sample.get_num_retained_items() == i);