datasketches 0.4.3 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +8 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +18 -10
@@ -0,0 +1,406 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+
22
+ #include "bloom_filter.hpp"
23
+
24
+ #ifdef TEST_BINARY_INPUT_PATH
25
+ static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
26
+ #else
27
+ static std::string testBinaryInputPath = "test/";
28
+ #endif
29
+
30
+ namespace datasketches {
31
+
32
+ TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") { // each malformed size/hash-count pair must be rejected with std::invalid_argument
33
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument); // zero bits
34
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1ULL << 60, 4), std::invalid_argument); // oversized; 1ULL because 1L << 60 is undefined where long is 32 bits
35
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument); // zero hash functions
36
+ }
37
+
38
+ TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
39
+ uint64_t num_items = 4000;
40
+ double fpp = 0.01;
41
+
42
+ uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
43
+ uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
44
+ uint64_t seed = 89023;
45
+
46
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
47
+ uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64
48
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
49
+ REQUIRE(bf.get_num_hashes() == num_hashes);
50
+ REQUIRE(bf.get_seed() == seed);
51
+ REQUIRE(bf.is_empty());
52
+
53
+ // should match above
54
+ bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
55
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
56
+ REQUIRE(bf.get_num_hashes() == num_hashes);
57
+ REQUIRE(bf.get_seed() == seed);
58
+ REQUIRE(bf.is_empty());
59
+
60
+ // same for initializing memory in-place
61
+ size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
62
+ uint8_t* bytes = new uint8_t[serialized_size_bytes];
63
+
64
+ bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
65
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
66
+ REQUIRE(bf.get_num_hashes() == num_hashes);
67
+ REQUIRE(bf.get_seed() == seed);
68
+ REQUIRE(bf.is_empty());
69
+
70
+ bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
71
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
72
+ REQUIRE(bf.get_num_hashes() == num_hashes);
73
+ REQUIRE(bf.get_seed() == seed);
74
+ REQUIRE(bf.is_empty());
75
+
76
+ delete [] bytes;
77
+ }
78
+
79
+ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
80
+ uint64_t num_items = 5000;
81
+ double fpp = 0.01;
82
+ uint64_t seed = 4897301548054ULL;
83
+
84
+ auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
85
+ REQUIRE(bf.is_empty());
86
+ REQUIRE(bf.get_bits_used() == 0);
87
+
88
+ for (uint64_t i = 0; i < num_items; ++i) {
89
+ bf.query_and_update(i);
90
+ }
91
+
92
+ REQUIRE(!bf.is_empty());
93
+ // filter is about 50% full at target capacity
94
+ // since seed is fixed we expect an exact value every time
95
+ // but leaving the approximate test in since that's more the "expectation"
96
+ REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
97
+ REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
98
+
99
+ uint32_t num_found = 0;
100
+ for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
101
+ if (bf.query(i)) {
102
+ ++num_found;
103
+ }
104
+ }
105
+ // fpp is average with significant variance -- even at 12% it would fail occasionally
106
+ REQUIRE(num_found == 423);
107
+ //REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
108
+ auto bytes = bf.serialize();
109
+
110
+ // initialize in memory and run the same tests
111
+ // also checking against the results from the first part
112
+ uint8_t* bf_memory = new uint8_t[bytes.size()];
113
+ auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
114
+ REQUIRE(bf2.is_empty());
115
+ REQUIRE(bf2.get_bits_used() == 0);
116
+
117
+ for (uint64_t i = 0; i < num_items; ++i) {
118
+ bf2.query_and_update(i);
119
+ }
120
+
121
+ REQUIRE(!bf2.is_empty());
122
+ REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
123
+
124
+ uint32_t num_found2 = 0;
125
+ for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
126
+ if (bf2.query(i)) {
127
+ ++num_found2;
128
+ }
129
+ }
130
+ REQUIRE(num_found == num_found2); // should exactly match above
131
+ auto bytes2 = bf2.serialize();
132
+
133
+ REQUIRE(bytes.size() == bytes2.size());
134
+ for (size_t i = 0; i < bytes.size(); ++i) {
135
+ REQUIRE(bytes[i] == bytes2[i]);
136
+ }
137
+
138
+ // check that raw memory also matches serialized sketch
139
+ const uint8_t* bf_bytes = bf2.get_wrapped_memory();
140
+ REQUIRE(bf_bytes == bf_memory);
141
+ for (size_t i = 0; i < bytes.size(); ++i) {
142
+ REQUIRE(bf_bytes[i] == bytes[i]);
143
+ }
144
+
145
+ // ensure the filters reset properly
146
+ bf.reset();
147
+ REQUIRE(bf.is_empty());
148
+ REQUIRE(bf.get_bits_used() == 0);
149
+
150
+ bf2.reset();
151
+ REQUIRE(bf2.is_empty());
152
+ REQUIRE(bf2.get_bits_used() == 0);
153
+
154
+ delete [] bf_memory;
155
+ }
156
+
157
+ TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
158
+ uint64_t num_bits = 8192;
159
+ uint16_t num_hashes = 3;
160
+
161
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
162
+
163
+ uint64_t n = 500;
164
+ for (uint64_t i = 0; i < n; ++i) {
165
+ bf.update(i);
166
+ }
167
+ uint64_t num_bits_set = bf.get_bits_used();
168
+ bf.invert();
169
+ REQUIRE(bf.get_bits_used() == num_bits - num_bits_set);
170
+
171
+ // original items should be mostly not-present
172
+ uint32_t num_found = 0;
173
+ for (uint64_t i = 0; i < n; ++i) {
174
+ if (bf.query(i)) {
175
+ ++num_found;
176
+ }
177
+ }
178
+ REQUIRE(num_found < n / 10);
179
+
180
+ // many other items should be "present"
181
+ num_found = 0;
182
+ for (uint64_t i = n; i < num_bits; ++i) {
183
+ if (bf.query(i)) {
184
+ ++num_found;
185
+ }
186
+ }
187
+ REQUIRE(num_found > n);
188
+ }
189
+
190
+ TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") { // set operations must throw when the operand differs from bf1 in size, hash count or seed
191
+ uint64_t num_bits = 32768;
192
+ uint16_t num_hashes = 4;
193
+
194
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes); // reference filter for every comparison below
195
+
196
+ // mismatched num bits
197
+ auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes);
198
+ REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);
199
+
200
+ // mismatched num hashes
201
+ auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes);
202
+ REQUIRE_THROWS_AS(bf1.intersect(bf3), std::invalid_argument); // must use bf3: bf2 differs in size, not in hash count
203
+
204
+ // mismatched seed
205
+ auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
206
+ REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
207
+ }
208
+
209
+ TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
210
+ const uint64_t num_bits = 12288;
211
+ const uint16_t num_hashes = 4;
212
+
213
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
214
+ auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
215
+
216
+ const uint64_t n = 1000;
217
+ const uint32_t max_item = 3 * n / 2 - 1;
218
+ for (uint64_t i = 0; i < n; ++i) {
219
+ bf1.query_and_update(i);
220
+ bf2.update(n / 2 + i);
221
+ }
222
+
223
+ bf1.union_with(bf2);
224
+ for (uint64_t i = 0; i < max_item; ++i) {
225
+ REQUIRE(bf1.query(i));
226
+ }
227
+
228
+ uint32_t num_found = 0;
229
+ for (uint64_t i = max_item; i < num_bits; ++i) {
230
+ if (bf1.query(i)) {
231
+ ++num_found;
232
+ }
233
+ }
234
+ REQUIRE(num_found < num_bits / 10); // not being super strict
235
+ }
236
+
237
+ TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") { // intersecting two overlapping filters keeps the shared items and drops most of the rest
238
+ const uint64_t num_bits = 8192;
239
+ const uint16_t num_hashes = 5;
240
+
241
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
242
+ auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed()); // same seed so the two filters are compatible
243
+
244
+ const uint64_t n = 1024;
245
+ const uint32_t max_item = 3 * n / 2 - 1; // largest item inserted into bf2
246
+ for (uint64_t i = 0; i < n; ++i) {
247
+ bf1.update(i); // bf1 receives [0, n)
248
+ bf2.update(n / 2 + i); // bf2 receives [n/2, 3n/2), overlapping bf1 on [n/2, n)
249
+ }
250
+
251
+ bf1.intersect(bf2);
252
+ // overlap bits should all be set
253
+ for (uint64_t i = n / 2; i < n; ++i) {
254
+ REQUIRE(bf1.query(i));
255
+ }
256
+
257
+ uint32_t num_found = 0; // hits outside the overlap range
258
+ for (uint64_t i = 0; i < n / 2; ++i) {
259
+ if (bf1.query(i)) {
260
+ ++num_found;
261
+ }
262
+ }
263
+ for (uint64_t i = max_item; i < num_bits; ++i) {
264
+ if (bf1.query(i)) {
265
+ ++num_found;
266
+ }
267
+ }
268
+
269
+ REQUIRE(num_found < num_bits / 10); // not being super strict
270
+ }
271
+
272
+ TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
273
+ const uint64_t num_bits = 32769;
274
+ const uint16_t num_hashes = 7;
275
+
276
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
277
+ auto bytes = bf.serialize();
278
+ REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
279
+
280
+ auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
281
+ REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
282
+ REQUIRE(bf.get_seed() == bf_bytes.get_seed());
283
+ REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
284
+ REQUIRE(bf_bytes.is_empty());
285
+
286
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
287
+ bf.serialize(ss);
288
+ auto bf_stream = bloom_filter::deserialize(ss);
289
+ REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
290
+ REQUIRE(bf.get_seed() == bf_stream.get_seed());
291
+ REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
292
+ REQUIRE(bf_stream.is_empty());
293
+
294
+ // read-only wrap should work
295
+ auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
296
+ REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
297
+ REQUIRE(bf.get_seed() == bf_wrap.get_seed());
298
+ REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
299
+ REQUIRE(bf_wrap.is_empty());
300
+
301
+ // writable wrap should not
302
+ REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument);
303
+ }
304
+
305
+ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") { // round-trips a populated filter through bytes, stream, read-only wrap and writable wrap
306
+ const uint64_t num_bits = 32768;
307
+ const uint16_t num_hashes = 5;
308
+
309
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
310
+ const uint64_t n = 1000;
311
+ for (uint64_t i = 0; i < n; ++i) {
312
+ bf.update(0.5 + i); // testing floats
313
+ }
314
+
315
+ // test more items without updating, assuming some false positives
316
+ // so we can check that we get the same number of false positives
317
+ // with the same query items
318
+ uint64_t fp_count = 0;
319
+ for (uint64_t i = n; i < num_bits; ++i) {
320
+ fp_count += bf.query(0.5 + i) ? 1 : 0;
321
+ }
322
+
323
+ auto bytes = bf.serialize();
324
+ REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
325
+
326
+ auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
327
+ REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
328
+ REQUIRE(bf.get_seed() == bf_bytes.get_seed());
329
+ REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
330
+ REQUIRE(!bf_bytes.is_empty());
331
+ REQUIRE(bf_bytes.is_memory_owned()); // check the deserialized copy, mirroring the bf_stream check below
332
+ uint64_t fp_count_bytes = 0;
333
+ for (uint64_t i = 0; i < num_bits; ++i) {
334
+ bool val = bf_bytes.query(0.5 + i);
335
+ if (i < n)
336
+ REQUIRE(val);
337
+ else if (val)
338
+ ++fp_count_bytes;
339
+ }
340
+ REQUIRE(fp_count_bytes == fp_count);
341
+
342
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
343
+ bf.serialize(ss);
344
+ auto bf_stream = bloom_filter::deserialize(ss);
345
+ REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
346
+ REQUIRE(bf.get_seed() == bf_stream.get_seed());
347
+ REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
348
+ REQUIRE(!bf_stream.is_empty());
349
+ REQUIRE(bf_stream.is_memory_owned());
350
+ uint64_t fp_count_stream = 0;
351
+ for (uint64_t i = 0; i < num_bits; ++i) {
352
+ bool val = bf_stream.query(0.5 + i);
353
+ if (i < n)
354
+ REQUIRE(val);
355
+ else if (val)
356
+ ++fp_count_stream;
357
+ }
358
+ REQUIRE(fp_count_stream == fp_count);
359
+
360
+ // read-only wrap
361
+ auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
362
+ REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
363
+ REQUIRE(bf.get_seed() == bf_wrap.get_seed());
364
+ REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
365
+ REQUIRE(!bf_wrap.is_empty());
366
+ REQUIRE(!bf_wrap.is_memory_owned());
367
+ uint64_t fp_count_wrap = 0;
368
+ for (uint64_t i = 0; i < num_bits; ++i) {
369
+ bool val = bf_wrap.query(0.5 + i);
370
+ if (i < n)
371
+ REQUIRE(val);
372
+ else if (val)
373
+ ++fp_count_wrap;
374
+ }
375
+ REQUIRE(fp_count_wrap == fp_count);
376
+ REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error); // every mutating call on the read-only wrap must throw
377
+ REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
378
+ REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);
379
+
380
+ // writable wrap
381
+ auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
382
+ REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
383
+ REQUIRE(bf.get_seed() == bf_writable.get_seed());
384
+ REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
385
+ REQUIRE(!bf_writable.is_empty());
386
+ REQUIRE(!bf_writable.is_memory_owned());
387
+ uint64_t fp_count_writable = 0;
388
+ for (uint64_t i = 0; i < num_bits; ++i) {
389
+ bool val = bf_writable.query(0.5 + i);
390
+ if (i < n)
391
+ REQUIRE(val);
392
+ else if (val)
393
+ ++fp_count_writable;
394
+ }
395
+ REQUIRE(fp_count_writable == fp_count);
396
+
397
+ REQUIRE(!bf_writable.query(-1.0));
398
+ bf_writable.update(-1.0);
399
+ REQUIRE(bf_writable.query(-1.0));
400
+
401
+ // not good memory management to do this, but because we wrapped the same bytes as both
402
+ // read-only and writable, that update should have changed the read-only version, too
403
+ REQUIRE(bf_wrap.query(-1.0));
404
+ }
405
+
406
+ } // namespace datasketches
@@ -89,6 +89,7 @@ public:
89
89
  using vector_t = std::vector<T, Allocator>;
90
90
  using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
91
91
  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
92
+ using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
92
93
 
93
94
  struct centroid_cmp {
94
95
  centroid_cmp() {}
@@ -115,7 +116,7 @@ public:
115
116
  * Merge the given t-Digest into this one
116
117
  * @param other t-Digest to merge
117
118
  */
118
- void merge(tdigest& other);
119
+ void merge(const tdigest& other);
119
120
 
120
121
  /**
121
122
  * Process buffered values and merge centroids if needed
@@ -142,8 +143,17 @@ public:
142
143
  */
143
144
  uint64_t get_total_weight() const;
144
145
 
146
+ /**
147
+ * Returns an instance of the allocator for this t-Digest.
148
+ * @return allocator
149
+ */
150
+ Allocator get_allocator() const;
151
+
145
152
  /**
146
153
  * Compute approximate normalized rank of the given value.
154
+ *
155
+ * <p>If the sketch is empty this throws std::runtime_error.
156
+ *
147
157
  * @param value to be ranked
148
158
  * @return normalized rank (from 0 to 1 inclusive)
149
159
  */
@@ -151,11 +161,49 @@ public:
151
161
 
152
162
  /**
153
163
  * Compute approximate quantile value corresponding to the given normalized rank
164
+ *
165
+ * <p>If the sketch is empty this throws std::runtime_error.
166
+ *
154
167
  * @param rank normalized rank (from 0 to 1 inclusive)
155
168
  * @return quantile value corresponding to the given rank
156
169
  */
157
170
  T get_quantile(double rank) const;
158
171
 
172
+ /**
173
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
174
+ * given a set of split points.
175
+ *
176
+ * <p>If the sketch is empty this throws std::runtime_error.
177
+ *
178
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
179
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
180
+ *
181
+ * @param size the number of split points in the array
182
+ *
183
+ * @return an array of m+1 doubles each of which is an approximation
184
+ * to the fraction of the input stream values (the mass) that fall into one of those intervals.
185
+ */
186
+ vector_double get_PMF(const T* split_points, uint32_t size) const;
187
+
188
+ /**
189
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
190
+ * cumulative analog of the PMF, of the input stream given a set of split points.
191
+ *
192
+ * <p>If the sketch is empty this throws std::runtime_error.
193
+ *
194
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
195
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
196
+ *
197
+ * @param size the number of split points in the array
198
+ *
199
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
200
+ * of the input stream given the split_points. The value at array position j of the returned
201
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
202
+ * array. This can be viewed as array of ranks of the given split points plus one more value
203
+ * that is always 1.
204
+ */
205
+ vector_double get_CDF(const T* split_points, uint32_t size) const;
206
+
159
207
  /**
160
208
  * @return parameter k (compression) that was used to configure this t-Digest
161
209
  */
@@ -245,6 +293,8 @@ private:
245
293
  // for compatibility with format of the reference implementation
246
294
  static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
247
295
  static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
296
+
297
+ static inline void check_split_points(const T* values, uint32_t size);
248
298
  };
249
299
 
250
300
  } /* namespace datasketches */
@@ -20,6 +20,7 @@
20
20
  #ifndef _TDIGEST_IMPL_HPP_
21
21
  #define _TDIGEST_IMPL_HPP_
22
22
 
23
+ #include <algorithm>
23
24
  #include <cmath>
24
25
  #include <sstream>
25
26
 
@@ -43,7 +44,7 @@ void tdigest<T, A>::update(T value) {
43
44
  }
44
45
 
45
46
  template<typename T, typename A>
46
- void tdigest<T, A>::merge(tdigest& other) {
47
+ void tdigest<T, A>::merge(const tdigest& other) {
47
48
  if (other.is_empty()) return;
48
49
  vector_centroid tmp(buffer_.get_allocator());
49
50
  tmp.reserve(buffer_.size() + centroids_.size() + other.buffer_.size() + other.centroids_.size());
@@ -84,6 +85,11 @@ uint64_t tdigest<T, A>::get_total_weight() const {
84
85
  return centroids_weight_ + buffer_.size();
85
86
  }
86
87
 
88
+ template<typename T, typename A> // accessor for the allocator this t-Digest was built with
89
+ A tdigest<T, A>::get_allocator() const {
90
+ return buffer_.get_allocator(); // taken from buffer_ rather than stored separately
91
+ }
92
+
87
93
  template<typename T, typename A>
88
94
  double tdigest<T, A>::get_rank(T value) const {
89
95
  if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
@@ -190,6 +196,25 @@ T tdigest<T, A>::get_quantile(double rank) const {
190
196
  return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
191
197
  }
192
198
 
199
+ template<typename T, typename A> // PMF over the size + 1 bins delimited by split_points, derived from the CDF
200
+ auto tdigest<T, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
201
+ auto buckets = get_CDF(split_points, size); // size + 1 cumulative values; split-point validation happens inside get_CDF
202
+ for (uint32_t i = size; i > 0; --i) { // walk backwards so buckets[i - 1] is still cumulative when it is subtracted
203
+ buckets[i] -= buckets[i - 1];
204
+ }
205
+ return buckets; // buckets[0] is left as the rank of the first split point
206
+ }
207
+
208
+ template<typename T, typename A> // CDF: rank of every split point followed by a final 1
209
+ auto tdigest<T, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
210
+ check_split_points(split_points, size); // throws std::invalid_argument on NaN or non-increasing input
211
+ vector_double ranks(get_allocator()); // result uses this sketch's allocator
212
+ ranks.reserve(size + 1);
213
+ for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i])); // NOTE(review): with size == 0 get_rank is never called, so an empty sketch returns {1} rather than throwing -- confirm against the header doc
214
+ ranks.push_back(1); // last bin always closes the distribution at 1
215
+ return ranks;
216
+ }
217
+
193
218
  template<typename T, typename A>
194
219
  uint16_t tdigest<T, A>::get_k() const {
195
220
  return k_;
@@ -590,6 +615,18 @@ buffer_(std::move(buffer))
590
615
  buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
591
616
  }
592
617
 
618
+ template<typename T, typename A> // validates split points: no NaN, strictly increasing; throws std::invalid_argument otherwise
619
+ void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
620
+ for (uint32_t i = 0; i < size ; i++) {
621
+ if (std::isnan(values[i])) {
622
+ throw std::invalid_argument("Values must not be NaN");
623
+ }
624
+ if ((i < (size - 1)) && !(values[i] < values[i + 1])) { // size - 1 cannot wrap: the body only runs when size >= 1; !(a < b) also fires when values[i + 1] is NaN
625
+ throw std::invalid_argument("Values must be unique and monotonically increasing");
626
+ }
627
+ }
628
+ }
629
+
593
630
  } /* namespace datasketches */
594
631
 
595
632
  #endif // _TDIGEST_IMPL_HPP_
@@ -35,6 +35,9 @@ TEST_CASE("empty", "[tdigest]") {
35
35
  REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
36
36
  REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
37
37
  REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
38
+ const double split_points[1] {0};
39
+ REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
40
+ REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
38
41
  }
39
42
 
40
43
  TEST_CASE("one value", "[tdigest]") {
@@ -56,9 +59,6 @@ TEST_CASE("many values", "[tdigest]") {
56
59
  const size_t n = 10000;
57
60
  tdigest_double td;
58
61
  for (size_t i = 0; i < n; ++i) td.update(i);
59
- // std::cout << td.to_string(true);
60
- // td.compress();
61
- // std::cout << td.to_string(true);
62
62
  REQUIRE_FALSE(td.is_empty());
63
63
  REQUIRE(td.get_total_weight() == n);
64
64
  REQUIRE(td.get_min_value() == 0);
@@ -73,6 +73,15 @@ TEST_CASE("many values", "[tdigest]") {
73
73
  REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
74
74
  REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
75
75
  REQUIRE(td.get_quantile(1) == n - 1);
76
+ const double split_points[1] {n / 2};
77
+ const auto pmf = td.get_PMF(split_points, 1);
78
+ REQUIRE(pmf.size() == 2);
79
+ REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
80
+ REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
81
+ const auto cdf = td.get_CDF(split_points, 1);
82
+ REQUIRE(cdf.size() == 2);
83
+ REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
84
+ REQUIRE(cdf[1] == 1);
76
85
  }
77
86
 
78
87
  TEST_CASE("rank - two values", "[tdigest]") {
@@ -329,7 +329,7 @@ static inline void pack_bits_13(const uint64_t* values, uint8_t* ptr) {
329
329
 
330
330
  *ptr++ = static_cast<uint8_t>(values[3] >> 4);
331
331
 
332
- *ptr = static_cast<uint8_t>(values[3] >> 4);
332
+ *ptr = static_cast<uint8_t>(values[3] << 4);
333
333
  *ptr++ |= static_cast<uint8_t>(values[4] >> 9);
334
334
 
335
335
  *ptr++ = static_cast<uint8_t>(values[4] >> 1);
@@ -4227,7 +4227,7 @@ static inline void unpack_bits_33(uint64_t* values, const uint8_t* ptr) {
4227
4227
  values[6] |= *ptr >> 1;
4228
4228
 
4229
4229
  values[7] = static_cast<uint64_t>(*ptr++ & 1) << 32;
4230
- values[7] |= *ptr++ << 24;
4230
+ values[7] |= static_cast<uint64_t>(*ptr++) << 24;
4231
4231
  values[7] |= *ptr++ << 16;
4232
4232
  values[7] |= *ptr++ << 8;
4233
4233
  values[7] |= *ptr;
@@ -4296,7 +4296,7 @@ static inline void unpack_bits_35(uint64_t* values, const uint8_t* ptr) {
4296
4296
  values[1] |= *ptr++ << 6;
4297
4297
  values[1] |= *ptr >> 2;
4298
4298
 
4299
- values[2] = static_cast<uint64_t>(*ptr++ & 2) << 33;
4299
+ values[2] = static_cast<uint64_t>(*ptr++ & 3) << 33;
4300
4300
  values[2] |= static_cast<uint64_t>(*ptr++) << 25;
4301
4301
  values[2] |= *ptr++ << 17;
4302
4302
  values[2] |= *ptr++ << 9;
@@ -6201,7 +6201,7 @@ static inline void pack_bits_block8(const uint64_t* values, uint8_t* ptr, uint8_
6201
6201
  case 61: pack_bits_61(values, ptr); break;
6202
6202
  case 62: pack_bits_62(values, ptr); break;
6203
6203
  case 63: pack_bits_63(values, ptr); break;
6204
- default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
6204
+ default: throw std::logic_error("wrong number of bits in pack_bits_block8: " + std::to_string(bits));
6205
6205
  }
6206
6206
  }
6207
6207
 
@@ -6270,7 +6270,7 @@ static inline void unpack_bits_block8(uint64_t* values, const uint8_t* ptr, uint
6270
6270
  case 61: unpack_bits_61(values, ptr); break;
6271
6271
  case 62: unpack_bits_62(values, ptr); break;
6272
6272
  case 63: unpack_bits_63(values, ptr); break;
6273
- default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
6273
+ default: throw std::logic_error("wrong number of bits in unpack_bits_block8: " + std::to_string(bits));
6274
6274
  }
6275
6275
  }
6276
6276