datasketches 0.4.3 → 0.4.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (29) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/ext/datasketches/vo_wrapper.cpp +1 -1
  4. data/lib/datasketches/version.rb +1 -1
  5. data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
  6. data/vendor/datasketches-cpp/LICENSE +35 -7
  7. data/vendor/datasketches-cpp/NOTICE +2 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +2 -1
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +1 -0
  10. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +5 -7
  11. data/vendor/datasketches-cpp/common/include/xxhash64.h +202 -0
  12. data/vendor/datasketches-cpp/filters/CMakeLists.txt +43 -0
  13. data/vendor/datasketches-cpp/filters/include/bit_array_ops.hpp +180 -0
  14. data/vendor/datasketches-cpp/filters/include/bloom_filter.hpp +753 -0
  15. data/vendor/datasketches-cpp/filters/include/bloom_filter_builder_impl.hpp +132 -0
  16. data/vendor/datasketches-cpp/filters/include/bloom_filter_impl.hpp +908 -0
  17. data/vendor/datasketches-cpp/filters/test/CMakeLists.txt +60 -0
  18. data/vendor/datasketches-cpp/filters/test/bit_array_ops_test.cpp +107 -0
  19. data/vendor/datasketches-cpp/filters/test/bloom_filter_allocation_test.cpp +75 -0
  20. data/vendor/datasketches-cpp/filters/test/bloom_filter_deserialize_from_java_test.cpp +51 -0
  21. data/vendor/datasketches-cpp/filters/test/bloom_filter_serialize_for_java.cpp +45 -0
  22. data/vendor/datasketches-cpp/filters/test/bloom_filter_test.cpp +406 -0
  23. data/vendor/datasketches-cpp/tdigest/include/tdigest.hpp +51 -1
  24. data/vendor/datasketches-cpp/tdigest/include/tdigest_impl.hpp +38 -1
  25. data/vendor/datasketches-cpp/tdigest/test/tdigest_test.cpp +12 -3
  26. data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +5 -5
  27. data/vendor/datasketches-cpp/theta/test/bit_packing_test.cpp +41 -39
  28. data/vendor/datasketches-cpp/version.cfg.in +1 -1
  29. metadata +17 -9
@@ -0,0 +1,406 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <catch2/catch.hpp>
21
+
22
+ #include "bloom_filter.hpp"
23
+
24
+ #ifdef TEST_BINARY_INPUT_PATH
25
+ static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
26
+ #else
27
+ static std::string testBinaryInputPath = "test/";
28
+ #endif
29
+
30
+ namespace datasketches {
31
+
32
+ TEST_CASE("bloom_filter: invalid constructor args", "[bloom_filter]") {
33
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(0, 4), std::invalid_argument);
34
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(1L << 60, 4), std::invalid_argument);
35
+ REQUIRE_THROWS_AS(bloom_filter::builder::create_by_size(65535, 0), std::invalid_argument);
36
+ }
37
+
38
+ TEST_CASE("bloom_filter: standard constructors", "[bloom_filter]") {
39
+ uint64_t num_items = 4000;
40
+ double fpp = 0.01;
41
+
42
+ uint64_t num_bits = bloom_filter::builder::suggest_num_filter_bits(num_items, fpp);
43
+ uint16_t num_hashes = bloom_filter::builder::suggest_num_hashes(num_items, num_bits);
44
+ uint64_t seed = 89023;
45
+
46
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes, seed);
47
+ uint64_t adjusted_num_bits = (num_bits + 63) & ~0x3F; // round up to the nearest multiple of 64
48
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
49
+ REQUIRE(bf.get_num_hashes() == num_hashes);
50
+ REQUIRE(bf.get_seed() == seed);
51
+ REQUIRE(bf.is_empty());
52
+
53
+ // should match above
54
+ bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
55
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
56
+ REQUIRE(bf.get_num_hashes() == num_hashes);
57
+ REQUIRE(bf.get_seed() == seed);
58
+ REQUIRE(bf.is_empty());
59
+
60
+ // same for initializing memory in-place
61
+ size_t serialized_size_bytes = bloom_filter::get_serialized_size_bytes(num_bits);
62
+ uint8_t* bytes = new uint8_t[serialized_size_bytes];
63
+
64
+ bf = bloom_filter::builder::initialize_by_size(bytes, serialized_size_bytes, num_bits, num_hashes, seed);
65
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
66
+ REQUIRE(bf.get_num_hashes() == num_hashes);
67
+ REQUIRE(bf.get_seed() == seed);
68
+ REQUIRE(bf.is_empty());
69
+
70
+ bf = bloom_filter::builder::initialize_by_accuracy(bytes, serialized_size_bytes, num_items, fpp, seed);
71
+ REQUIRE(bf.get_capacity() == adjusted_num_bits);
72
+ REQUIRE(bf.get_num_hashes() == num_hashes);
73
+ REQUIRE(bf.get_seed() == seed);
74
+ REQUIRE(bf.is_empty());
75
+
76
+ delete [] bytes;
77
+ }
78
+
79
+ TEST_CASE("bloom_filter: basic operations", "[bloom_filter]") {
80
+ uint64_t num_items = 5000;
81
+ double fpp = 0.01;
82
+ uint64_t seed = 4897301548054ULL;
83
+
84
+ auto bf = bloom_filter::builder::create_by_accuracy(num_items, fpp, seed);
85
+ REQUIRE(bf.is_empty());
86
+ REQUIRE(bf.get_bits_used() == 0);
87
+
88
+ for (uint64_t i = 0; i < num_items; ++i) {
89
+ bf.query_and_update(i);
90
+ }
91
+
92
+ REQUIRE(!bf.is_empty());
93
+ // filter is about 50% full at target capacity
94
+ // since seed is fixed we expect an exact value every time
95
+ // but leaving the approximate test in since that's more the "expectation"
96
+ REQUIRE(bf.get_bits_used() == 24793); // exact value is not important but should be consistent
97
+ REQUIRE(bf.get_bits_used() == Approx(0.5 * bf.get_capacity()).epsilon(0.05)); // just over 3.3% in practice
98
+
99
+ uint32_t num_found = 0;
100
+ for (uint64_t i = num_items; i < bf.get_capacity(); ++i) {
101
+ if (bf.query(i)) {
102
+ ++num_found;
103
+ }
104
+ }
105
+ // fpp is average with significant variance -- even at 12% it would fail occasionally
106
+ REQUIRE(num_found == 423);
107
+ //REQUIRE(num_found == Approx((bf.get_capacity() - num_items) * fpp).epsilon(0.12));
108
+ auto bytes = bf.serialize();
109
+
110
+ // initialize in memory and run the same tests
111
+ // also checking against the results from the first part
112
+ uint8_t* bf_memory = new uint8_t[bytes.size()];
113
+ auto bf2 = bloom_filter::builder::initialize_by_accuracy(bf_memory, bytes.size(), num_items, fpp, bf.get_seed());
114
+ REQUIRE(bf2.is_empty());
115
+ REQUIRE(bf2.get_bits_used() == 0);
116
+
117
+ for (uint64_t i = 0; i < num_items; ++i) {
118
+ bf2.query_and_update(i);
119
+ }
120
+
121
+ REQUIRE(!bf2.is_empty());
122
+ REQUIRE(bf2.get_bits_used() == bf.get_bits_used()); // should exactly match above
123
+
124
+ uint32_t num_found2 = 0;
125
+ for (uint64_t i = num_items; i < bf2.get_capacity(); ++i) {
126
+ if (bf2.query(i)) {
127
+ ++num_found2;
128
+ }
129
+ }
130
+ REQUIRE(num_found == num_found2); // should exactly match above
131
+ auto bytes2 = bf2.serialize();
132
+
133
+ REQUIRE(bytes.size() == bytes2.size());
134
+ for (size_t i = 0; i < bytes.size(); ++i) {
135
+ REQUIRE(bytes[i] == bytes2[i]);
136
+ }
137
+
138
+ // check that raw memory also matches serialized sketch
139
+ const uint8_t* bf_bytes = bf2.get_wrapped_memory();
140
+ REQUIRE(bf_bytes == bf_memory);
141
+ for (size_t i = 0; i < bytes.size(); ++i) {
142
+ REQUIRE(bf_bytes[i] == bytes[i]);
143
+ }
144
+
145
+ // ensure the filters reset properly
146
+ bf.reset();
147
+ REQUIRE(bf.is_empty());
148
+ REQUIRE(bf.get_bits_used() == 0);
149
+
150
+ bf2.reset();
151
+ REQUIRE(bf2.is_empty());
152
+ REQUIRE(bf2.get_bits_used() == 0);
153
+
154
+ delete [] bf_memory;
155
+ }
156
+
157
+ TEST_CASE("bloom_filter: inversion", "[bloom_filter]") {
158
+ uint64_t num_bits = 8192;
159
+ uint16_t num_hashes = 3;
160
+
161
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
162
+
163
+ uint64_t n = 500;
164
+ for (uint64_t i = 0; i < n; ++i) {
165
+ bf.update(i);
166
+ }
167
+ uint64_t num_bits_set = bf.get_bits_used();
168
+ bf.invert();
169
+ REQUIRE(bf.get_bits_used() == num_bits - num_bits_set);
170
+
171
+ // original items should be mostly not-present
172
+ uint32_t num_found = 0;
173
+ for (uint64_t i = 0; i < n; ++i) {
174
+ if (bf.query(i)) {
175
+ ++num_found;
176
+ }
177
+ }
178
+ REQUIRE(num_found < n / 10);
179
+
180
+ // many other items should be "present"
181
+ num_found = 0;
182
+ for (uint64_t i = n; i < num_bits; ++i) {
183
+ if (bf.query(i)) {
184
+ ++num_found;
185
+ }
186
+ }
187
+ REQUIRE(num_found > n);
188
+ }
189
+
190
+ TEST_CASE("bloom_filter: incompatible set operations", "[bloom_filter]") {
191
+ uint64_t num_bits = 32768;
192
+ uint16_t num_hashes = 4;
193
+
194
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
195
+
196
+ // mismatched num bits: bf2 is twice the size of bf1, so a union must be rejected
197
+ auto bf2 = bloom_filter::builder::create_by_size(2 * num_bits, num_hashes);
198
+ REQUIRE_THROWS_AS(bf1.union_with(bf2), std::invalid_argument);
199
+
200
+ // mismatched num hashes: bf3 has the same size as bf1 but twice as many hash functions
201
+ auto bf3 = bloom_filter::builder::create_by_size(num_bits, 2 * num_hashes);
202
+ REQUIRE_THROWS_AS(bf1.intersect(bf3), std::invalid_argument);
203
+
204
+ // mismatched seed: bf4 matches bf1 in size and hash count but uses a different seed
205
+ auto bf4 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed() + 1);
206
+ REQUIRE_THROWS_AS(bf1.union_with(bf4), std::invalid_argument);
207
+ }
208
+
209
+ TEST_CASE("bloom_filter: basic union", "[bloom_filter]") {
210
+ const uint64_t num_bits = 12288;
211
+ const uint16_t num_hashes = 4;
212
+
213
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hashes);
214
+ auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hashes, bf1.get_seed());
215
+
216
+ const uint64_t n = 1000;
217
+ const uint32_t max_item = 3 * n / 2 - 1;
218
+ for (uint64_t i = 0; i < n; ++i) {
219
+ bf1.query_and_update(i);
220
+ bf2.update(n / 2 + i);
221
+ }
222
+
223
+ bf1.union_with(bf2);
224
+ for (uint64_t i = 0; i < max_item; ++i) {
225
+ REQUIRE(bf1.query(i));
226
+ }
227
+
228
+ uint32_t num_found = 0;
229
+ for (uint64_t i = max_item; i < num_bits; ++i) {
230
+ if (bf1.query(i)) {
231
+ ++num_found;
232
+ }
233
+ }
234
+ REQUIRE(num_found < num_bits / 10); // not being super strict
235
+ }
236
+
237
+ TEST_CASE("bloom_filter: basic intersection", "[bloom_filter]") {
238
+ const uint64_t num_bits = 8192;
239
+ const uint16_t num_hahes = 5;
240
+
241
+ auto bf1 = bloom_filter::builder::create_by_size(num_bits, num_hahes);
242
+ auto bf2 = bloom_filter::builder::create_by_size(num_bits, num_hahes, bf1.get_seed());
243
+
244
+ const uint64_t n = 1024;
245
+ const uint32_t max_item = 3 * n / 2 - 1;
246
+ for (uint64_t i = 0; i < n; ++i) {
247
+ bf1.update(i);
248
+ bf2.update(n / 2 + i);
249
+ }
250
+
251
+ bf1.intersect(bf2);
252
+ // overlap bits should all be set
253
+ for (uint64_t i = n / 2; i < n; ++i) {
254
+ REQUIRE(bf1.query(i));
255
+ }
256
+
257
+ uint32_t num_found = 0;
258
+ for (uint64_t i = 0; i < n / 2; ++i) {
259
+ if (bf1.query(i)) {
260
+ ++num_found;
261
+ }
262
+ }
263
+ for (uint64_t i = max_item; i < num_bits; ++i) {
264
+ if (bf1.query(i)) {
265
+ ++num_found;
266
+ }
267
+ }
268
+
269
+ REQUIRE(num_found < num_bits / 10); // not being super strict
270
+ }
271
+
272
+ TEST_CASE("bloom_filter: empty serialization", "[bloom_filter]") {
273
+ const uint64_t num_bits = 32769;
274
+ const uint16_t num_hashes = 7;
275
+
276
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
277
+ auto bytes = bf.serialize();
278
+ REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
279
+
280
+ auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
281
+ REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
282
+ REQUIRE(bf.get_seed() == bf_bytes.get_seed());
283
+ REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
284
+ REQUIRE(bf_bytes.is_empty());
285
+
286
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
287
+ bf.serialize(ss);
288
+ auto bf_stream = bloom_filter::deserialize(ss);
289
+ REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
290
+ REQUIRE(bf.get_seed() == bf_stream.get_seed());
291
+ REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
292
+ REQUIRE(bf_stream.is_empty());
293
+
294
+ // read-only wrap should work
295
+ auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
296
+ REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
297
+ REQUIRE(bf.get_seed() == bf_wrap.get_seed());
298
+ REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
299
+ REQUIRE(bf_wrap.is_empty());
300
+
301
+ // writable wrap should not
302
+ REQUIRE_THROWS_AS(bloom_filter::writable_wrap(bytes.data(), bytes.size()), std::invalid_argument);
303
+ }
304
+
305
+ TEST_CASE("bloom_filter: non-empty serialization", "[bloom_filter]") {
306
+ const uint64_t num_bits = 32768;
307
+ const uint16_t num_hashes = 5;
308
+
309
+ auto bf = bloom_filter::builder::create_by_size(num_bits, num_hashes);
310
+ const uint64_t n = 1000;
311
+ for (uint64_t i = 0; i < n; ++i) {
312
+ bf.update(0.5 + i); // testing floats
313
+ }
314
+
315
+ // test more items without updating, assuming some false positives
316
+ // so we can check that we get the same number of false positives
317
+ // with the same query items
318
+ uint64_t fp_count = 0;
319
+ for (uint64_t i = n; i < num_bits; ++i) {
320
+ fp_count += bf.query(0.5 + i) ? 1 : 0;
321
+ }
322
+
323
+ auto bytes = bf.serialize();
324
+ REQUIRE(bytes.size() == bf.get_serialized_size_bytes());
325
+
326
+ auto bf_bytes = bloom_filter::deserialize(bytes.data(), bytes.size());
327
+ REQUIRE(bf.get_capacity() == bf_bytes.get_capacity());
328
+ REQUIRE(bf.get_seed() == bf_bytes.get_seed());
329
+ REQUIRE(bf.get_num_hashes() == bf_bytes.get_num_hashes());
330
+ REQUIRE(!bf_bytes.is_empty());
331
+ REQUIRE(bf.is_memory_owned());
332
+ uint64_t fp_count_bytes = 0;
333
+ for (uint64_t i = 0; i < num_bits; ++i) {
334
+ bool val = bf_bytes.query(0.5 + i);
335
+ if (i < n)
336
+ REQUIRE(val);
337
+ else if (val)
338
+ ++fp_count_bytes;
339
+ }
340
+ REQUIRE(fp_count_bytes == fp_count);
341
+
342
+ std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
343
+ bf.serialize(ss);
344
+ auto bf_stream = bloom_filter::deserialize(ss);
345
+ REQUIRE(bf.get_capacity() == bf_stream.get_capacity());
346
+ REQUIRE(bf.get_seed() == bf_stream.get_seed());
347
+ REQUIRE(bf.get_num_hashes() == bf_stream.get_num_hashes());
348
+ REQUIRE(!bf_stream.is_empty());
349
+ REQUIRE(bf_stream.is_memory_owned());
350
+ uint64_t fp_count_stream = 0;
351
+ for (uint64_t i = 0; i < num_bits; ++i) {
352
+ bool val = bf_stream.query(0.5 + i);
353
+ if (i < n)
354
+ REQUIRE(val);
355
+ else if (val)
356
+ ++fp_count_stream;
357
+ }
358
+ REQUIRE(fp_count_stream == fp_count);
359
+
360
+ // read-only wrap
361
+ auto bf_wrap = bloom_filter::wrap(bytes.data(), bytes.size());
362
+ REQUIRE(bf.get_capacity() == bf_wrap.get_capacity());
363
+ REQUIRE(bf.get_seed() == bf_wrap.get_seed());
364
+ REQUIRE(bf.get_num_hashes() == bf_wrap.get_num_hashes());
365
+ REQUIRE(!bf_wrap.is_empty());
366
+ REQUIRE(!bf_wrap.is_memory_owned());
367
+ uint64_t fp_count_wrap = 0;
368
+ for (uint64_t i = 0; i < num_bits; ++i) {
369
+ bool val = bf_wrap.query(0.5 + i);
370
+ if (i < n)
371
+ REQUIRE(val);
372
+ else if (val)
373
+ ++fp_count_wrap;
374
+ }
375
+ REQUIRE(fp_count_wrap == fp_count);
376
+ REQUIRE_THROWS_AS(bf_wrap.update(-1.0), std::logic_error);
377
+ REQUIRE_THROWS_AS(bf_wrap.query_and_update(-2.0), std::logic_error);
378
+ REQUIRE_THROWS_AS(bf_wrap.reset(), std::logic_error);
379
+
380
+ // writable wrap
381
+ auto bf_writable = bloom_filter::writable_wrap(bytes.data(), bytes.size());
382
+ REQUIRE(bf.get_capacity() == bf_writable.get_capacity());
383
+ REQUIRE(bf.get_seed() == bf_writable.get_seed());
384
+ REQUIRE(bf.get_num_hashes() == bf_writable.get_num_hashes());
385
+ REQUIRE(!bf_writable.is_empty());
386
+ REQUIRE(!bf_writable.is_memory_owned());
387
+ uint64_t fp_count_writable = 0;
388
+ for (uint64_t i = 0; i < num_bits; ++i) {
389
+ bool val = bf_writable.query(0.5 + i);
390
+ if (i < n)
391
+ REQUIRE(val);
392
+ else if (val)
393
+ ++fp_count_writable;
394
+ }
395
+ REQUIRE(fp_count_writable == fp_count);
396
+
397
+ REQUIRE(!bf_writable.query(-1.0));
398
+ bf_writable.update(-1.0);
399
+ REQUIRE(bf_writable.query(-1.0));
400
+
401
+ // not good memory management to do this, but because we wrapped the same bytes as both
402
+ // read-only and writable, that update should have changed the read-only version, too
403
+ REQUIRE(bf_wrap.query(-1.0));
404
+ }
405
+
406
+ } // namespace datasketches
@@ -89,6 +89,7 @@ public:
89
89
  using vector_t = std::vector<T, Allocator>;
90
90
  using vector_centroid = std::vector<centroid, typename std::allocator_traits<Allocator>::template rebind_alloc<centroid>>;
91
91
  using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
92
+ using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
92
93
 
93
94
  struct centroid_cmp {
94
95
  centroid_cmp() {}
@@ -115,7 +116,7 @@ public:
115
116
  * Merge the given t-Digest into this one
116
117
  * @param other t-Digest to merge
117
118
  */
118
- void merge(tdigest& other);
119
+ void merge(const tdigest& other);
119
120
 
120
121
  /**
121
122
  * Process buffered values and merge centroids if needed
@@ -142,8 +143,17 @@ public:
142
143
  */
143
144
  uint64_t get_total_weight() const;
144
145
 
146
+ /**
147
+ * Returns an instance of the allocator for this t-Digest.
148
+ * @return allocator
149
+ */
150
+ Allocator get_allocator() const;
151
+
145
152
  /**
146
153
  * Compute approximate normalized rank of the given value.
154
+ *
155
+ * <p>If the sketch is empty this throws std::runtime_error.
156
+ *
147
157
  * @param value to be ranked
148
158
  * @return normalized rank (from 0 to 1 inclusive)
149
159
  */
@@ -151,11 +161,49 @@ public:
151
161
 
152
162
  /**
153
163
  * Compute approximate quantile value corresponding to the given normalized rank
164
+ *
165
+ * <p>If the sketch is empty this throws std::runtime_error.
166
+ *
154
167
  * @param rank normalized rank (from 0 to 1 inclusive)
155
168
  * @return quantile value corresponding to the given rank
156
169
  */
157
170
  T get_quantile(double rank) const;
158
171
 
172
+ /**
173
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
174
+ * given a set of split points.
175
+ *
176
+ * <p>If the sketch is empty this throws std::runtime_error.
177
+ *
178
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
179
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
180
+ *
181
+ * @param size the number of split points in the array
182
+ *
183
+ * @return an array of m+1 doubles each of which is an approximation
184
+ * to the fraction of the input stream values (the mass) that fall into one of those intervals.
185
+ */
186
+ vector_double get_PMF(const T* split_points, uint32_t size) const;
187
+
188
+ /**
189
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
190
+ * cumulative analog of the PMF, of the input stream given a set of split points.
191
+ *
192
+ * <p>If the sketch is empty this throws std::runtime_error.
193
+ *
194
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
195
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
196
+ *
197
+ * @param size the number of split points in the array
198
+ *
199
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
200
+ * of the input stream given the split_points. The value at array position j of the returned
201
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
202
+ * array. This can be viewed as an array of ranks of the given split points plus one more value
203
+ * that is always 1.
204
+ */
205
+ vector_double get_CDF(const T* split_points, uint32_t size) const;
206
+
159
207
  /**
160
208
  * @return parameter k (compression) that was used to configure this t-Digest
161
209
  */
@@ -245,6 +293,8 @@ private:
245
293
  // for compatibility with format of the reference implementation
246
294
  static tdigest deserialize_compat(std::istream& is, const Allocator& allocator = Allocator());
247
295
  static tdigest deserialize_compat(const void* bytes, size_t size, const Allocator& allocator = Allocator());
296
+
297
+ static inline void check_split_points(const T* values, uint32_t size);
248
298
  };
249
299
 
250
300
  } /* namespace datasketches */
@@ -20,6 +20,7 @@
20
20
  #ifndef _TDIGEST_IMPL_HPP_
21
21
  #define _TDIGEST_IMPL_HPP_
22
22
 
23
+ #include <algorithm>
23
24
  #include <cmath>
24
25
  #include <sstream>
25
26
 
@@ -43,7 +44,7 @@ void tdigest<T, A>::update(T value) {
43
44
  }
44
45
 
45
46
  template<typename T, typename A>
46
- void tdigest<T, A>::merge(tdigest& other) {
47
+ void tdigest<T, A>::merge(const tdigest& other) {
47
48
  if (other.is_empty()) return;
48
49
  vector_centroid tmp(buffer_.get_allocator());
49
50
  tmp.reserve(buffer_.size() + centroids_.size() + other.buffer_.size() + other.centroids_.size());
@@ -84,6 +85,11 @@ uint64_t tdigest<T, A>::get_total_weight() const {
84
85
  return centroids_weight_ + buffer_.size();
85
86
  }
86
87
 
88
+ template<typename T, typename A>
89
+ A tdigest<T, A>::get_allocator() const {
90
+ return buffer_.get_allocator();
91
+ }
92
+
87
93
  template<typename T, typename A>
88
94
  double tdigest<T, A>::get_rank(T value) const {
89
95
  if (is_empty()) throw std::runtime_error("operation is undefined for an empty sketch");
@@ -190,6 +196,25 @@ T tdigest<T, A>::get_quantile(double rank) const {
190
196
  return weighted_average(centroids_.back().get_weight(), w1, max_, w2);
191
197
  }
192
198
 
199
+ template<typename T, typename A>
200
+ auto tdigest<T, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
201
+ auto buckets = get_CDF(split_points, size);
202
+ for (uint32_t i = size; i > 0; --i) {
203
+ buckets[i] -= buckets[i - 1];
204
+ }
205
+ return buckets;
206
+ }
207
+
208
+ template<typename T, typename A>
209
+ auto tdigest<T, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
210
+ check_split_points(split_points, size);
211
+ vector_double ranks(get_allocator());
212
+ ranks.reserve(size + 1);
213
+ for (uint32_t i = 0; i < size; ++i) ranks.push_back(get_rank(split_points[i]));
214
+ ranks.push_back(1);
215
+ return ranks;
216
+ }
217
+
193
218
  template<typename T, typename A>
194
219
  uint16_t tdigest<T, A>::get_k() const {
195
220
  return k_;
@@ -590,6 +615,18 @@ buffer_(std::move(buffer))
590
615
  buffer_.reserve(centroids_capacity_ * BUFFER_MULTIPLIER);
591
616
  }
592
617
 
618
+ template<typename T, typename A>
619
+ void tdigest<T, A>::check_split_points(const T* values, uint32_t size) {
620
+ for (uint32_t i = 0; i < size ; i++) {
621
+ if (std::isnan(values[i])) {
622
+ throw std::invalid_argument("Values must not be NaN");
623
+ }
624
+ if ((i < (size - 1)) && !(values[i] < values[i + 1])) {
625
+ throw std::invalid_argument("Values must be unique and monotonically increasing");
626
+ }
627
+ }
628
+ }
629
+
593
630
  } /* namespace datasketches */
594
631
 
595
632
  #endif // _TDIGEST_IMPL_HPP_
@@ -35,6 +35,9 @@ TEST_CASE("empty", "[tdigest]") {
35
35
  REQUIRE_THROWS_AS(td.get_max_value(), std::runtime_error);
36
36
  REQUIRE_THROWS_AS(td.get_rank(0), std::runtime_error);
37
37
  REQUIRE_THROWS_AS(td.get_quantile(0.5), std::runtime_error);
38
+ const double split_points[1] {0};
39
+ REQUIRE_THROWS_AS(td.get_PMF(split_points, 1), std::runtime_error);
40
+ REQUIRE_THROWS_AS(td.get_CDF(split_points, 1), std::runtime_error);
38
41
  }
39
42
 
40
43
  TEST_CASE("one value", "[tdigest]") {
@@ -56,9 +59,6 @@ TEST_CASE("many values", "[tdigest]") {
56
59
  const size_t n = 10000;
57
60
  tdigest_double td;
58
61
  for (size_t i = 0; i < n; ++i) td.update(i);
59
- // std::cout << td.to_string(true);
60
- // td.compress();
61
- // std::cout << td.to_string(true);
62
62
  REQUIRE_FALSE(td.is_empty());
63
63
  REQUIRE(td.get_total_weight() == n);
64
64
  REQUIRE(td.get_min_value() == 0);
@@ -73,6 +73,15 @@ TEST_CASE("many values", "[tdigest]") {
73
73
  REQUIRE(td.get_quantile(0.9) == Approx(n * 0.9).epsilon(0.01));
74
74
  REQUIRE(td.get_quantile(0.95) == Approx(n * 0.95).epsilon(0.01));
75
75
  REQUIRE(td.get_quantile(1) == n - 1);
76
+ const double split_points[1] {n / 2};
77
+ const auto pmf = td.get_PMF(split_points, 1);
78
+ REQUIRE(pmf.size() == 2);
79
+ REQUIRE(pmf[0] == Approx(0.5).margin(0.0001));
80
+ REQUIRE(pmf[1] == Approx(0.5).margin(0.0001));
81
+ const auto cdf = td.get_CDF(split_points, 1);
82
+ REQUIRE(cdf.size() == 2);
83
+ REQUIRE(cdf[0] == Approx(0.5).margin(0.0001));
84
+ REQUIRE(cdf[1] == 1);
76
85
  }
77
86
 
78
87
  TEST_CASE("rank - two values", "[tdigest]") {
@@ -329,7 +329,7 @@ static inline void pack_bits_13(const uint64_t* values, uint8_t* ptr) {
329
329
 
330
330
  *ptr++ = static_cast<uint8_t>(values[3] >> 4);
331
331
 
332
- *ptr = static_cast<uint8_t>(values[3] >> 4);
332
+ *ptr = static_cast<uint8_t>(values[3] << 4);
333
333
  *ptr++ |= static_cast<uint8_t>(values[4] >> 9);
334
334
 
335
335
  *ptr++ = static_cast<uint8_t>(values[4] >> 1);
@@ -4227,7 +4227,7 @@ static inline void unpack_bits_33(uint64_t* values, const uint8_t* ptr) {
4227
4227
  values[6] |= *ptr >> 1;
4228
4228
 
4229
4229
  values[7] = static_cast<uint64_t>(*ptr++ & 1) << 32;
4230
- values[7] |= *ptr++ << 24;
4230
+ values[7] |= static_cast<uint64_t>(*ptr++) << 24;
4231
4231
  values[7] |= *ptr++ << 16;
4232
4232
  values[7] |= *ptr++ << 8;
4233
4233
  values[7] |= *ptr;
@@ -4296,7 +4296,7 @@ static inline void unpack_bits_35(uint64_t* values, const uint8_t* ptr) {
4296
4296
  values[1] |= *ptr++ << 6;
4297
4297
  values[1] |= *ptr >> 2;
4298
4298
 
4299
- values[2] = static_cast<uint64_t>(*ptr++ & 2) << 33;
4299
+ values[2] = static_cast<uint64_t>(*ptr++ & 3) << 33;
4300
4300
  values[2] |= static_cast<uint64_t>(*ptr++) << 25;
4301
4301
  values[2] |= *ptr++ << 17;
4302
4302
  values[2] |= *ptr++ << 9;
@@ -6201,7 +6201,7 @@ static inline void pack_bits_block8(const uint64_t* values, uint8_t* ptr, uint8_
6201
6201
  case 61: pack_bits_61(values, ptr); break;
6202
6202
  case 62: pack_bits_62(values, ptr); break;
6203
6203
  case 63: pack_bits_63(values, ptr); break;
6204
- default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
6204
+ default: throw std::logic_error("wrong number of bits in pack_bits_block8: " + std::to_string(bits));
6205
6205
  }
6206
6206
  }
6207
6207
 
@@ -6270,7 +6270,7 @@ static inline void unpack_bits_block8(uint64_t* values, const uint8_t* ptr, uint
6270
6270
  case 61: unpack_bits_61(values, ptr); break;
6271
6271
  case 62: unpack_bits_62(values, ptr); break;
6272
6272
  case 63: unpack_bits_63(values, ptr); break;
6273
- default: throw std::logic_error("wrong number of bits " + std::to_string(bits));
6273
+ default: throw std::logic_error("wrong number of bits in unpack_bits_block8: " + std::to_string(bits));
6274
6274
  }
6275
6275
  }
6276
6276