datasketches 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (106) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -23,7 +23,9 @@
23
23
  #include <iostream>
24
24
  #include <iomanip>
25
25
  #include <sstream>
26
+ #include <stdexcept>
26
27
 
28
+ #include "conditional_forward.hpp"
27
29
  #include "memory_operations.hpp"
28
30
  #include "kll_helper.hpp"
29
31
 
@@ -146,19 +148,12 @@ kll_sketch<T, C, S, A>::~kll_sketch() {
146
148
  }
147
149
 
148
150
  template<typename T, typename C, typename S, typename A>
149
- void kll_sketch<T, C, S, A>::update(const T& value) {
151
+ template<typename FwdT>
152
+ void kll_sketch<T, C, S, A>::update(FwdT&& value) {
150
153
  if (!check_update_value(value)) { return; }
151
154
  update_min_max(value);
152
155
  const uint32_t index = internal_update();
153
- new (&items_[index]) T(value);
154
- }
155
-
156
- template<typename T, typename C, typename S, typename A>
157
- void kll_sketch<T, C, S, A>::update(T&& value) {
158
- if (!check_update_value(value)) { return; }
159
- update_min_max(value);
160
- const uint32_t index = internal_update();
161
- new (&items_[index]) T(std::move(value));
156
+ new (&items_[index]) T(std::forward<FwdT>(value));
162
157
  }
163
158
 
164
159
  template<typename T, typename C, typename S, typename A>
@@ -181,22 +176,23 @@ uint32_t kll_sketch<T, C, S, A>::internal_update() {
181
176
  }
182
177
 
183
178
  template<typename T, typename C, typename S, typename A>
184
- void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
179
+ template<typename FwdSk>
180
+ void kll_sketch<T, C, S, A>::merge(FwdSk&& other) {
185
181
  if (other.is_empty()) return;
186
182
  if (m_ != other.m_) {
187
183
  throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
188
184
  }
189
185
  if (is_empty()) {
190
- min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
191
- max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
186
+ min_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.min_value_));
187
+ max_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.max_value_));
192
188
  } else {
193
- if (C()(*other.min_value_, *min_value_)) *min_value_ = *other.min_value_;
194
- if (C()(*max_value_, *other.max_value_)) *max_value_ = *other.max_value_;
189
+ if (C()(*other.min_value_, *min_value_)) *min_value_ = conditional_forward<FwdSk>(*other.min_value_);
190
+ if (C()(*max_value_, *other.max_value_)) *max_value_ = conditional_forward<FwdSk>(*other.max_value_);
195
191
  }
196
192
  const uint64_t final_n = n_ + other.n_;
197
193
  for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
198
194
  const uint32_t index = internal_update();
199
- new (&items_[index]) T(other.items_[i]);
195
+ new (&items_[index]) T(conditional_forward<FwdSk>(other.items_[i]));
200
196
  }
201
197
  if (other.num_levels_ >= 2) merge_higher_levels(other, final_n);
202
198
  n_ = final_n;
@@ -204,30 +200,6 @@ void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
204
200
  assert_correct_total_weight();
205
201
  }
206
202
 
207
- template<typename T, typename C, typename S, typename A>
208
- void kll_sketch<T, C, S, A>::merge(kll_sketch&& other) {
209
- if (other.is_empty()) return;
210
- if (m_ != other.m_) {
211
- throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
212
- }
213
- if (is_empty()) {
214
- min_value_ = new (allocator_.allocate(1)) T(std::move(*other.min_value_));
215
- max_value_ = new (allocator_.allocate(1)) T(std::move(*other.max_value_));
216
- } else {
217
- if (C()(*other.min_value_, *min_value_)) *min_value_ = std::move(*other.min_value_);
218
- if (C()(*max_value_, *other.max_value_)) *max_value_ = std::move(*other.max_value_);
219
- }
220
- const uint64_t final_n = n_ + other.n_;
221
- for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
222
- const uint32_t index = internal_update();
223
- new (&items_[index]) T(std::move(other.items_[i]));
224
- }
225
- if (other.num_levels_ >= 2) merge_higher_levels(std::forward<kll_sketch>(other), final_n);
226
- n_ = final_n;
227
- if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
228
- assert_correct_total_weight();
229
- }
230
-
231
203
  template<typename T, typename C, typename S, typename A>
232
204
  bool kll_sketch<T, C, S, A>::is_empty() const {
233
205
  return n_ == 0;
@@ -266,43 +238,49 @@ T kll_sketch<T, C, S, A>::get_max_value() const {
266
238
  }
267
239
 
268
240
  template<typename T, typename C, typename S, typename A>
269
- T kll_sketch<T, C, S, A>::get_quantile(double fraction) const {
241
+ C kll_sketch<T, C, S, A>::get_comparator() const {
242
+ return C();
243
+ }
244
+
245
+ template<typename T, typename C, typename S, typename A>
246
+ template<bool inclusive>
247
+ auto kll_sketch<T, C, S, A>::get_quantile(double rank) const -> quantile_return_type {
270
248
  if (is_empty()) return get_invalid_value();
271
- if (fraction == 0.0) return *min_value_;
272
- if (fraction == 1.0) return *max_value_;
273
- if ((fraction < 0.0) || (fraction > 1.0)) {
249
+ if (rank == 0.0) return *min_value_;
250
+ if (rank == 1.0) return *max_value_;
251
+ if ((rank < 0.0) || (rank > 1.0)) {
274
252
  throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
275
253
  }
276
- // has side effect of sorting level zero if needed
277
- auto quantile_calculator(const_cast<kll_sketch*>(this)->get_quantile_calculator());
278
- return quantile_calculator->get_quantile(fraction);
254
+ // may have a side effect of sorting level zero if needed
255
+ return get_sorted_view<inclusive>(true).get_quantile(rank);
279
256
  }
280
257
 
281
258
  template<typename T, typename C, typename S, typename A>
282
- std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions, uint32_t size) const {
259
+ template<bool inclusive>
260
+ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* ranks, uint32_t size) const {
283
261
  std::vector<T, A> quantiles(allocator_);
284
262
  if (is_empty()) return quantiles;
285
- std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
286
263
  quantiles.reserve(size);
264
+
265
+ // may have a side effect of sorting level zero if needed
266
+ auto view = get_sorted_view<inclusive>(true);
267
+
287
268
  for (uint32_t i = 0; i < size; i++) {
288
- const double fraction = fractions[i];
289
- if ((fraction < 0.0) || (fraction > 1.0)) {
269
+ const double rank = ranks[i];
270
+ if ((rank < 0.0) || (rank > 1.0)) {
290
271
  throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
291
272
  }
292
- if (fraction == 0.0) quantiles.push_back(*min_value_);
293
- else if (fraction == 1.0) quantiles.push_back(*max_value_);
273
+ else if (rank == 0.0) quantiles.push_back(*min_value_);
274
+ else if (rank == 1.0) quantiles.push_back(*max_value_);
294
275
  else {
295
- if (!quantile_calculator) {
296
- // has side effect of sorting level zero if needed
297
- quantile_calculator = const_cast<kll_sketch*>(this)->get_quantile_calculator();
298
- }
299
- quantiles.push_back(quantile_calculator->get_quantile(fraction));
276
+ quantiles.push_back(view.get_quantile(rank));
300
277
  }
301
278
  }
302
279
  return quantiles;
303
280
  }
304
281
 
305
282
  template<typename T, typename C, typename S, typename A>
283
+ template<bool inclusive>
306
284
  std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
307
285
  if (is_empty()) return std::vector<T, A>(allocator_);
308
286
  if (num == 0) {
@@ -316,10 +294,11 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
316
294
  if (num > 1) {
317
295
  fractions[num - 1] = 1.0;
318
296
  }
319
- return get_quantiles(fractions.data(), num);
297
+ return get_quantiles<inclusive>(fractions.data(), num);
320
298
  }
321
299
 
322
300
  template<typename T, typename C, typename S, typename A>
301
+ template<bool inclusive>
323
302
  double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
324
303
  if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
325
304
  uint8_t level = 0;
@@ -329,7 +308,7 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
329
308
  const auto from_index(levels_[level]);
330
309
  const auto to_index(levels_[level + 1]); // exclusive
331
310
  for (uint32_t i = from_index; i < to_index; i++) {
332
- if (C()(items_[i], value)) {
311
+ if (inclusive ? !C()(value, items_[i]) : C()(items_[i], value)) {
333
312
  total += weight;
334
313
  } else if ((level > 0) || is_level_zero_sorted_) {
335
314
  break; // levels above 0 are sorted, no point comparing further
@@ -342,13 +321,15 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
342
321
  }
343
322
 
344
323
  template<typename T, typename C, typename S, typename A>
324
+ template<bool inclusive>
345
325
  vector_d<A> kll_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const {
346
- return get_PMF_or_CDF(split_points, size, false);
326
+ return get_PMF_or_CDF<inclusive>(split_points, size, false);
347
327
  }
348
328
 
349
329
  template<typename T, typename C, typename S, typename A>
330
+ template<bool inclusive>
350
331
  vector_d<A> kll_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const {
351
- return get_PMF_or_CDF(split_points, size, true);
332
+ return get_PMF_or_CDF<inclusive>(split_points, size, true);
352
333
  }
353
334
 
354
335
  template<typename T, typename C, typename S, typename A>
@@ -358,8 +339,8 @@ double kll_sketch<T, C, S, A>::get_normalized_rank_error(bool pmf) const {
358
339
 
359
340
  // implementation for fixed-size arithmetic types (integral and floating point)
360
341
  template<typename T, typename C, typename S, typename A>
361
- template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
362
- size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
342
+ template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
343
+ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe&) const {
363
344
  if (is_empty()) { return EMPTY_SIZE_BYTES; }
364
345
  if (num_levels_ == 1 && get_num_retained() == 1) {
365
346
  return DATA_START_SINGLE_ITEM + sizeof(TT);
@@ -370,17 +351,17 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
370
351
 
371
352
  // implementation for all other types
372
353
  template<typename T, typename C, typename S, typename A>
373
- template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
374
- size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
354
+ template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
355
+ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
375
356
  if (is_empty()) { return EMPTY_SIZE_BYTES; }
376
357
  if (num_levels_ == 1 && get_num_retained() == 1) {
377
- return DATA_START_SINGLE_ITEM + S().size_of_item(items_[levels_[0]]);
358
+ return DATA_START_SINGLE_ITEM + sd.size_of_item(items_[levels_[0]]);
378
359
  }
379
360
  // the last integer in the levels_ array is not serialized because it can be derived
380
361
  size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
381
- size += S().size_of_item(*min_value_);
382
- size += S().size_of_item(*max_value_);
383
- for (auto it: *this) size += S().size_of_item(it.first);
362
+ size += sd.size_of_item(*min_value_);
363
+ size += sd.size_of_item(*max_value_);
364
+ for (auto it: *this) size += sd.size_of_item(it.first);
384
365
  return size;
385
366
  }
386
367
 
@@ -405,7 +386,8 @@ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_
405
386
  }
406
387
 
407
388
  template<typename T, typename C, typename S, typename A>
408
- void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
389
+ template<typename SerDe>
390
+ void kll_sketch<T, C, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
409
391
  const bool is_single_item = n_ == 1;
410
392
  const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
411
393
  write(os, preamble_ints);
@@ -430,16 +412,17 @@ void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
430
412
  write(os, num_levels_);
431
413
  write(os, unused);
432
414
  write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
433
- S().serialize(os, min_value_, 1);
434
- S().serialize(os, max_value_, 1);
415
+ sd.serialize(os, min_value_, 1);
416
+ sd.serialize(os, max_value_, 1);
435
417
  }
436
- S().serialize(os, &items_[levels_[0]], get_num_retained());
418
+ sd.serialize(os, &items_[levels_[0]], get_num_retained());
437
419
  }
438
420
 
439
421
  template<typename T, typename C, typename S, typename A>
440
- vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const {
422
+ template<typename SerDe>
423
+ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
441
424
  const bool is_single_item = n_ == 1;
442
- const size_t size = header_size_bytes + get_serialized_size_bytes();
425
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
443
426
  vector_u8<A> bytes(size, 0, allocator_);
444
427
  uint8_t* ptr = bytes.data() + header_size_bytes;
445
428
  const uint8_t* end_ptr = ptr + size;
@@ -465,11 +448,11 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
465
448
  ptr += copy_to_mem(num_levels_, ptr);
466
449
  ptr += sizeof(uint8_t); // unused
467
450
  ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
468
- ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
469
- ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
451
+ ptr += sd.serialize(ptr, end_ptr - ptr, min_value_, 1);
452
+ ptr += sd.serialize(ptr, end_ptr - ptr, max_value_, 1);
470
453
  }
471
454
  const size_t bytes_remaining = end_ptr - ptr;
472
- ptr += S().serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
455
+ ptr += sd.serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
473
456
  }
474
457
  const size_t delta = ptr - bytes.data();
475
458
  if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
@@ -478,6 +461,12 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
478
461
 
479
462
  template<typename T, typename C, typename S, typename A>
480
463
  kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
464
+ return deserialize(is, S(), allocator);
465
+ }
466
+
467
+ template<typename T, typename C, typename S, typename A>
468
+ template<typename SerDe>
469
+ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
481
470
  const auto preamble_ints = read<uint8_t>(is);
482
471
  const auto serial_version = read<uint8_t>(is);
483
472
  const auto family_id = read<uint8_t>(is);
@@ -525,17 +514,17 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
525
514
  std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
526
515
  std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
527
516
  if (!is_single_item) {
528
- S().deserialize(is, min_value_buffer.get(), 1);
517
+ sd.deserialize(is, min_value_buffer.get(), 1);
529
518
  // serde call did not throw, repackage with destructor
530
519
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
531
- S().deserialize(is, max_value_buffer.get(), 1);
520
+ sd.deserialize(is, max_value_buffer.get(), 1);
532
521
  // serde call did not throw, repackage with destructor
533
522
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
534
523
  }
535
524
  auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
536
525
  std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
537
526
  const auto num_items = levels[num_levels] - levels[0];
538
- S().deserialize(is, &items_buffer.get()[levels[0]], num_items);
527
+ sd.deserialize(is, &items_buffer.get()[levels[0]], num_items);
539
528
  // serde call did not throw, repackage with destructors
540
529
  std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
541
530
  const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
@@ -555,6 +544,12 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
555
544
 
556
545
  template<typename T, typename C, typename S, typename A>
557
546
  kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
547
+ return deserialize(bytes, size, S(), allocator);
548
+ }
549
+
550
+ template<typename T, typename C, typename S, typename A>
551
+ template<typename SerDe>
552
+ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
558
553
  ensure_minimum_memory(size, 8);
559
554
  const char* ptr = static_cast<const char*>(bytes);
560
555
  uint8_t preamble_ints;
@@ -611,17 +606,17 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
611
606
  std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
612
607
  std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
613
608
  if (!is_single_item) {
614
- ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
609
+ ptr += sd.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
615
610
  // serde call did not throw, repackage with destructor
616
611
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
617
- ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
612
+ ptr += sd.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
618
613
  // serde call did not throw, repackage with destructor
619
614
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
620
615
  }
621
616
  auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
622
617
  std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
623
618
  const auto num_items = levels[num_levels] - levels[0];
624
- ptr += S().deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
619
+ ptr += sd.deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
625
620
  // serde call did not throw, repackage with destructors
626
621
  std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
627
622
  const size_t delta = ptr - static_cast<const char*>(bytes);
@@ -786,18 +781,23 @@ void kll_sketch<T, C, S, A>::sort_level_zero() {
786
781
  }
787
782
 
788
783
  template<typename T, typename C, typename S, typename A>
789
- std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> kll_sketch<T, C, S, A>::get_quantile_calculator() {
790
- sort_level_zero();
791
- using AllocCalc = typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>>;
792
- AllocCalc alloc(allocator_);
793
- std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
794
- new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(*this),
795
- [&alloc](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); alloc.deallocate(ptr, 1); }
796
- );
797
- return quantile_calculator;
784
+ template<bool inclusive>
785
+ quantile_sketch_sorted_view<T, C, A> kll_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const {
786
+ const_cast<kll_sketch*>(this)->sort_level_zero(); // allow this side effect
787
+ quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);
788
+ uint8_t level = 0;
789
+ while (level < num_levels_) {
790
+ const auto from = items_ + levels_[level];
791
+ const auto to = items_ + levels_[level + 1]; // exclusive
792
+ view.add(from, to, 1 << level);
793
+ ++level;
794
+ }
795
+ if (cumulative) view.template convert_to_cummulative<inclusive>();
796
+ return view;
798
797
  }
799
798
 
800
799
  template<typename T, typename C, typename S, typename A>
800
+ template<bool inclusive>
801
801
  vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
802
802
  if (is_empty()) return vector_d<A>(allocator_);
803
803
  kll_helper::validate_values<T, C>(split_points, size);
@@ -808,9 +808,9 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
808
808
  const auto from_index = levels_[level];
809
809
  const auto to_index = levels_[level + 1]; // exclusive
810
810
  if ((level == 0) && !is_level_zero_sorted_) {
811
- increment_buckets_unsorted_level(from_index, to_index, weight, split_points, size, buckets.data());
811
+ increment_buckets_unsorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
812
812
  } else {
813
- increment_buckets_sorted_level(from_index, to_index, weight, split_points, size, buckets.data());
813
+ increment_buckets_sorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
814
814
  }
815
815
  level++;
816
816
  weight *= 2;
@@ -831,13 +831,14 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
831
831
  }
832
832
 
833
833
  template<typename T, typename C, typename S, typename A>
834
+ template<bool inclusive>
834
835
  void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
835
836
  const T* split_points, uint32_t size, double* buckets) const
836
837
  {
837
838
  for (uint32_t i = from_index; i < to_index; i++) {
838
839
  uint32_t j;
839
840
  for (j = 0; j < size; j++) {
840
- if (C()(items_[i], split_points[j])) {
841
+ if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
841
842
  break;
842
843
  }
843
844
  }
@@ -846,13 +847,14 @@ void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_inde
846
847
  }
847
848
 
848
849
  template<typename T, typename C, typename S, typename A>
850
+ template<bool inclusive>
849
851
  void kll_sketch<T, C, S, A>::increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
850
852
  const T* split_points, uint32_t size, double* buckets) const
851
853
  {
852
854
  uint32_t i = from_index;
853
855
  uint32_t j = 0;
854
856
  while ((i < to_index) && (j < size)) {
855
- if (C()(items_[i], split_points[j])) {
857
+ if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
856
858
  buckets[j] += weight; // this sample goes into this bucket
857
859
  i++; // move on to next sample and see whether it also goes into this bucket
858
860
  } else {
@@ -910,34 +912,9 @@ void kll_sketch<T, C, S, A>::merge_higher_levels(O&& other, uint64_t final_n) {
910
912
  }
911
913
 
912
914
  // this leaves items_ uninitialized (all objects moved out and destroyed)
913
- // this version copies objects from the incoming sketch
914
- template<typename T, typename C, typename S, typename A>
915
- void kll_sketch<T, C, S, A>::populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
916
- worklevels[0] = 0;
917
-
918
- // the level zero data from "other" was already inserted into "this"
919
- kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
920
- worklevels[1] = safe_level_size(0);
921
-
922
- for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
923
- const uint32_t self_pop = safe_level_size(lvl);
924
- const uint32_t other_pop = other.safe_level_size(lvl);
925
- worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
926
-
927
- if ((self_pop > 0) && (other_pop == 0)) {
928
- kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
929
- } else if ((self_pop == 0) && (other_pop > 0)) {
930
- kll_helper::copy_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl]);
931
- } else if ((self_pop > 0) && (other_pop > 0)) {
932
- kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
933
- }
934
- }
935
- }
936
-
937
- // this leaves items_ uninitialized (all objects moved out and destroyed)
938
- // this version moves objects from the incoming sketch
939
915
  template<typename T, typename C, typename S, typename A>
940
- void kll_sketch<T, C, S, A>::populate_work_arrays(kll_sketch&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
916
+ template<typename FwdSk>
917
+ void kll_sketch<T, C, S, A>::populate_work_arrays(FwdSk&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
941
918
  worklevels[0] = 0;
942
919
 
943
920
  // the level zero data from "other" was already inserted into "this"
@@ -952,7 +929,9 @@ void kll_sketch<T, C, S, A>::populate_work_arrays(kll_sketch&& other, T* workbuf
952
929
  if ((self_pop > 0) && (other_pop == 0)) {
953
930
  kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
954
931
  } else if ((self_pop == 0) && (other_pop > 0)) {
955
- kll_helper::move_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl], false);
932
+ for (auto i = other.levels_[lvl], j = worklevels[lvl]; i < other.levels_[lvl] + other_pop; ++i, ++j) {
933
+ new (&workbuf[j]) T(conditional_forward<FwdSk>(other.items_[i]));
934
+ }
956
935
  } else if ((self_pop > 0) && (other_pop > 0)) {
957
936
  kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
958
937
  }
@@ -1039,7 +1018,6 @@ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
1039
1018
  os << " Sorted : " << (is_level_zero_sorted_ ? "true" : "false") << std::endl;
1040
1019
  os << " Capacity items : " << items_size_ << std::endl;
1041
1020
  os << " Retained items : " << get_num_retained() << std::endl;
1042
- os << " Storage bytes : " << get_serialized_size_bytes() << std::endl;
1043
1021
  if (!is_empty()) {
1044
1022
  os << " Min value : " << *min_value_ << std::endl;
1045
1023
  os << " Max value : " << *max_value_ << std::endl;