datasketches 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -23,7 +23,9 @@
23
23
  #include <iostream>
24
24
  #include <iomanip>
25
25
  #include <sstream>
26
+ #include <stdexcept>
26
27
 
28
+ #include "conditional_forward.hpp"
27
29
  #include "memory_operations.hpp"
28
30
  #include "kll_helper.hpp"
29
31
 
@@ -146,19 +148,12 @@ kll_sketch<T, C, S, A>::~kll_sketch() {
146
148
  }
147
149
 
148
150
  template<typename T, typename C, typename S, typename A>
149
- void kll_sketch<T, C, S, A>::update(const T& value) {
151
+ template<typename FwdT>
152
+ void kll_sketch<T, C, S, A>::update(FwdT&& value) {
150
153
  if (!check_update_value(value)) { return; }
151
154
  update_min_max(value);
152
155
  const uint32_t index = internal_update();
153
- new (&items_[index]) T(value);
154
- }
155
-
156
- template<typename T, typename C, typename S, typename A>
157
- void kll_sketch<T, C, S, A>::update(T&& value) {
158
- if (!check_update_value(value)) { return; }
159
- update_min_max(value);
160
- const uint32_t index = internal_update();
161
- new (&items_[index]) T(std::move(value));
156
+ new (&items_[index]) T(std::forward<FwdT>(value));
162
157
  }
163
158
 
164
159
  template<typename T, typename C, typename S, typename A>
@@ -181,22 +176,23 @@ uint32_t kll_sketch<T, C, S, A>::internal_update() {
181
176
  }
182
177
 
183
178
  template<typename T, typename C, typename S, typename A>
184
- void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
179
+ template<typename FwdSk>
180
+ void kll_sketch<T, C, S, A>::merge(FwdSk&& other) {
185
181
  if (other.is_empty()) return;
186
182
  if (m_ != other.m_) {
187
183
  throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
188
184
  }
189
185
  if (is_empty()) {
190
- min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
191
- max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
186
+ min_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.min_value_));
187
+ max_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.max_value_));
192
188
  } else {
193
- if (C()(*other.min_value_, *min_value_)) *min_value_ = *other.min_value_;
194
- if (C()(*max_value_, *other.max_value_)) *max_value_ = *other.max_value_;
189
+ if (C()(*other.min_value_, *min_value_)) *min_value_ = conditional_forward<FwdSk>(*other.min_value_);
190
+ if (C()(*max_value_, *other.max_value_)) *max_value_ = conditional_forward<FwdSk>(*other.max_value_);
195
191
  }
196
192
  const uint64_t final_n = n_ + other.n_;
197
193
  for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
198
194
  const uint32_t index = internal_update();
199
- new (&items_[index]) T(other.items_[i]);
195
+ new (&items_[index]) T(conditional_forward<FwdSk>(other.items_[i]));
200
196
  }
201
197
  if (other.num_levels_ >= 2) merge_higher_levels(other, final_n);
202
198
  n_ = final_n;
@@ -204,30 +200,6 @@ void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
204
200
  assert_correct_total_weight();
205
201
  }
206
202
 
207
- template<typename T, typename C, typename S, typename A>
208
- void kll_sketch<T, C, S, A>::merge(kll_sketch&& other) {
209
- if (other.is_empty()) return;
210
- if (m_ != other.m_) {
211
- throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
212
- }
213
- if (is_empty()) {
214
- min_value_ = new (allocator_.allocate(1)) T(std::move(*other.min_value_));
215
- max_value_ = new (allocator_.allocate(1)) T(std::move(*other.max_value_));
216
- } else {
217
- if (C()(*other.min_value_, *min_value_)) *min_value_ = std::move(*other.min_value_);
218
- if (C()(*max_value_, *other.max_value_)) *max_value_ = std::move(*other.max_value_);
219
- }
220
- const uint64_t final_n = n_ + other.n_;
221
- for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
222
- const uint32_t index = internal_update();
223
- new (&items_[index]) T(std::move(other.items_[i]));
224
- }
225
- if (other.num_levels_ >= 2) merge_higher_levels(std::forward<kll_sketch>(other), final_n);
226
- n_ = final_n;
227
- if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
228
- assert_correct_total_weight();
229
- }
230
-
231
203
  template<typename T, typename C, typename S, typename A>
232
204
  bool kll_sketch<T, C, S, A>::is_empty() const {
233
205
  return n_ == 0;
@@ -266,43 +238,49 @@ T kll_sketch<T, C, S, A>::get_max_value() const {
266
238
  }
267
239
 
268
240
  template<typename T, typename C, typename S, typename A>
269
- T kll_sketch<T, C, S, A>::get_quantile(double fraction) const {
241
+ C kll_sketch<T, C, S, A>::get_comparator() const {
242
+ return C();
243
+ }
244
+
245
+ template<typename T, typename C, typename S, typename A>
246
+ template<bool inclusive>
247
+ auto kll_sketch<T, C, S, A>::get_quantile(double rank) const -> quantile_return_type {
270
248
  if (is_empty()) return get_invalid_value();
271
- if (fraction == 0.0) return *min_value_;
272
- if (fraction == 1.0) return *max_value_;
273
- if ((fraction < 0.0) || (fraction > 1.0)) {
249
+ if (rank == 0.0) return *min_value_;
250
+ if (rank == 1.0) return *max_value_;
251
+ if ((rank < 0.0) || (rank > 1.0)) {
274
252
  throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
275
253
  }
276
- // has side effect of sorting level zero if needed
277
- auto quantile_calculator(const_cast<kll_sketch*>(this)->get_quantile_calculator());
278
- return quantile_calculator->get_quantile(fraction);
254
+ // may have a side effect of sorting level zero if needed
255
+ return get_sorted_view<inclusive>(true).get_quantile(rank);
279
256
  }
280
257
 
281
258
  template<typename T, typename C, typename S, typename A>
282
- std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions, uint32_t size) const {
259
+ template<bool inclusive>
260
+ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* ranks, uint32_t size) const {
283
261
  std::vector<T, A> quantiles(allocator_);
284
262
  if (is_empty()) return quantiles;
285
- std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
286
263
  quantiles.reserve(size);
264
+
265
+ // may have a side effect of sorting level zero if needed
266
+ auto view = get_sorted_view<inclusive>(true);
267
+
287
268
  for (uint32_t i = 0; i < size; i++) {
288
- const double fraction = fractions[i];
289
- if ((fraction < 0.0) || (fraction > 1.0)) {
269
+ const double rank = ranks[i];
270
+ if ((rank < 0.0) || (rank > 1.0)) {
290
271
  throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
291
272
  }
292
- if (fraction == 0.0) quantiles.push_back(*min_value_);
293
- else if (fraction == 1.0) quantiles.push_back(*max_value_);
273
+ else if (rank == 0.0) quantiles.push_back(*min_value_);
274
+ else if (rank == 1.0) quantiles.push_back(*max_value_);
294
275
  else {
295
- if (!quantile_calculator) {
296
- // has side effect of sorting level zero if needed
297
- quantile_calculator = const_cast<kll_sketch*>(this)->get_quantile_calculator();
298
- }
299
- quantiles.push_back(quantile_calculator->get_quantile(fraction));
276
+ quantiles.push_back(view.get_quantile(rank));
300
277
  }
301
278
  }
302
279
  return quantiles;
303
280
  }
304
281
 
305
282
  template<typename T, typename C, typename S, typename A>
283
+ template<bool inclusive>
306
284
  std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
307
285
  if (is_empty()) return std::vector<T, A>(allocator_);
308
286
  if (num == 0) {
@@ -316,10 +294,11 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
316
294
  if (num > 1) {
317
295
  fractions[num - 1] = 1.0;
318
296
  }
319
- return get_quantiles(fractions.data(), num);
297
+ return get_quantiles<inclusive>(fractions.data(), num);
320
298
  }
321
299
 
322
300
  template<typename T, typename C, typename S, typename A>
301
+ template<bool inclusive>
323
302
  double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
324
303
  if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
325
304
  uint8_t level = 0;
@@ -329,7 +308,7 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
329
308
  const auto from_index(levels_[level]);
330
309
  const auto to_index(levels_[level + 1]); // exclusive
331
310
  for (uint32_t i = from_index; i < to_index; i++) {
332
- if (C()(items_[i], value)) {
311
+ if (inclusive ? !C()(value, items_[i]) : C()(items_[i], value)) {
333
312
  total += weight;
334
313
  } else if ((level > 0) || is_level_zero_sorted_) {
335
314
  break; // levels above 0 are sorted, no point comparing further
@@ -342,13 +321,15 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
342
321
  }
343
322
 
344
323
  template<typename T, typename C, typename S, typename A>
324
+ template<bool inclusive>
345
325
  vector_d<A> kll_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const {
346
- return get_PMF_or_CDF(split_points, size, false);
326
+ return get_PMF_or_CDF<inclusive>(split_points, size, false);
347
327
  }
348
328
 
349
329
  template<typename T, typename C, typename S, typename A>
330
+ template<bool inclusive>
350
331
  vector_d<A> kll_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const {
351
- return get_PMF_or_CDF(split_points, size, true);
332
+ return get_PMF_or_CDF<inclusive>(split_points, size, true);
352
333
  }
353
334
 
354
335
  template<typename T, typename C, typename S, typename A>
@@ -358,8 +339,8 @@ double kll_sketch<T, C, S, A>::get_normalized_rank_error(bool pmf) const {
358
339
 
359
340
  // implementation for fixed-size arithmetic types (integral and floating point)
360
341
  template<typename T, typename C, typename S, typename A>
361
- template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
362
- size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
342
+ template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
343
+ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe&) const {
363
344
  if (is_empty()) { return EMPTY_SIZE_BYTES; }
364
345
  if (num_levels_ == 1 && get_num_retained() == 1) {
365
346
  return DATA_START_SINGLE_ITEM + sizeof(TT);
@@ -370,17 +351,17 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
370
351
 
371
352
  // implementation for all other types
372
353
  template<typename T, typename C, typename S, typename A>
373
- template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
374
- size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
354
+ template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
355
+ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
375
356
  if (is_empty()) { return EMPTY_SIZE_BYTES; }
376
357
  if (num_levels_ == 1 && get_num_retained() == 1) {
377
- return DATA_START_SINGLE_ITEM + S().size_of_item(items_[levels_[0]]);
358
+ return DATA_START_SINGLE_ITEM + sd.size_of_item(items_[levels_[0]]);
378
359
  }
379
360
  // the last integer in the levels_ array is not serialized because it can be derived
380
361
  size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
381
- size += S().size_of_item(*min_value_);
382
- size += S().size_of_item(*max_value_);
383
- for (auto it: *this) size += S().size_of_item(it.first);
362
+ size += sd.size_of_item(*min_value_);
363
+ size += sd.size_of_item(*max_value_);
364
+ for (auto it: *this) size += sd.size_of_item(it.first);
384
365
  return size;
385
366
  }
386
367
 
@@ -405,7 +386,8 @@ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_
405
386
  }
406
387
 
407
388
  template<typename T, typename C, typename S, typename A>
408
- void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
389
+ template<typename SerDe>
390
+ void kll_sketch<T, C, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
409
391
  const bool is_single_item = n_ == 1;
410
392
  const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
411
393
  write(os, preamble_ints);
@@ -430,16 +412,17 @@ void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
430
412
  write(os, num_levels_);
431
413
  write(os, unused);
432
414
  write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
433
- S().serialize(os, min_value_, 1);
434
- S().serialize(os, max_value_, 1);
415
+ sd.serialize(os, min_value_, 1);
416
+ sd.serialize(os, max_value_, 1);
435
417
  }
436
- S().serialize(os, &items_[levels_[0]], get_num_retained());
418
+ sd.serialize(os, &items_[levels_[0]], get_num_retained());
437
419
  }
438
420
 
439
421
  template<typename T, typename C, typename S, typename A>
440
- vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const {
422
+ template<typename SerDe>
423
+ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
441
424
  const bool is_single_item = n_ == 1;
442
- const size_t size = header_size_bytes + get_serialized_size_bytes();
425
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
443
426
  vector_u8<A> bytes(size, 0, allocator_);
444
427
  uint8_t* ptr = bytes.data() + header_size_bytes;
445
428
  const uint8_t* end_ptr = ptr + size;
@@ -465,11 +448,11 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
465
448
  ptr += copy_to_mem(num_levels_, ptr);
466
449
  ptr += sizeof(uint8_t); // unused
467
450
  ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
468
- ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
469
- ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
451
+ ptr += sd.serialize(ptr, end_ptr - ptr, min_value_, 1);
452
+ ptr += sd.serialize(ptr, end_ptr - ptr, max_value_, 1);
470
453
  }
471
454
  const size_t bytes_remaining = end_ptr - ptr;
472
- ptr += S().serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
455
+ ptr += sd.serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
473
456
  }
474
457
  const size_t delta = ptr - bytes.data();
475
458
  if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
@@ -478,6 +461,12 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
478
461
 
479
462
  template<typename T, typename C, typename S, typename A>
480
463
  kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
464
+ return deserialize(is, S(), allocator);
465
+ }
466
+
467
+ template<typename T, typename C, typename S, typename A>
468
+ template<typename SerDe>
469
+ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
481
470
  const auto preamble_ints = read<uint8_t>(is);
482
471
  const auto serial_version = read<uint8_t>(is);
483
472
  const auto family_id = read<uint8_t>(is);
@@ -525,17 +514,17 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
525
514
  std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
526
515
  std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
527
516
  if (!is_single_item) {
528
- S().deserialize(is, min_value_buffer.get(), 1);
517
+ sd.deserialize(is, min_value_buffer.get(), 1);
529
518
  // serde call did not throw, repackage with destrtuctor
530
519
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
531
- S().deserialize(is, max_value_buffer.get(), 1);
520
+ sd.deserialize(is, max_value_buffer.get(), 1);
532
521
  // serde call did not throw, repackage with destrtuctor
533
522
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
534
523
  }
535
524
  auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
536
525
  std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
537
526
  const auto num_items = levels[num_levels] - levels[0];
538
- S().deserialize(is, &items_buffer.get()[levels[0]], num_items);
527
+ sd.deserialize(is, &items_buffer.get()[levels[0]], num_items);
539
528
  // serde call did not throw, repackage with destrtuctors
540
529
  std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
541
530
  const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
@@ -555,6 +544,12 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
555
544
 
556
545
  template<typename T, typename C, typename S, typename A>
557
546
  kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
547
+ return deserialize(bytes, size, S(), allocator);
548
+ }
549
+
550
+ template<typename T, typename C, typename S, typename A>
551
+ template<typename SerDe>
552
+ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
558
553
  ensure_minimum_memory(size, 8);
559
554
  const char* ptr = static_cast<const char*>(bytes);
560
555
  uint8_t preamble_ints;
@@ -611,17 +606,17 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
611
606
  std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
612
607
  std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
613
608
  if (!is_single_item) {
614
- ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
609
+ ptr += sd.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
615
610
  // serde call did not throw, repackage with destrtuctor
616
611
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
617
- ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
612
+ ptr += sd.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
618
613
  // serde call did not throw, repackage with destrtuctor
619
614
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
620
615
  }
621
616
  auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
622
617
  std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
623
618
  const auto num_items = levels[num_levels] - levels[0];
624
- ptr += S().deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
619
+ ptr += sd.deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
625
620
  // serde call did not throw, repackage with destrtuctors
626
621
  std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
627
622
  const size_t delta = ptr - static_cast<const char*>(bytes);
@@ -786,18 +781,23 @@ void kll_sketch<T, C, S, A>::sort_level_zero() {
786
781
  }
787
782
 
788
783
  template<typename T, typename C, typename S, typename A>
789
- std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> kll_sketch<T, C, S, A>::get_quantile_calculator() {
790
- sort_level_zero();
791
- using AllocCalc = typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>>;
792
- AllocCalc alloc(allocator_);
793
- std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
794
- new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(*this),
795
- [&alloc](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); alloc.deallocate(ptr, 1); }
796
- );
797
- return quantile_calculator;
784
+ template<bool inclusive>
785
+ quantile_sketch_sorted_view<T, C, A> kll_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const {
786
+ const_cast<kll_sketch*>(this)->sort_level_zero(); // allow this side effect
787
+ quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);
788
+ uint8_t level = 0;
789
+ while (level < num_levels_) {
790
+ const auto from = items_ + levels_[level];
791
+ const auto to = items_ + levels_[level + 1]; // exclusive
792
+ view.add(from, to, 1 << level);
793
+ ++level;
794
+ }
795
+ if (cumulative) view.template convert_to_cummulative<inclusive>();
796
+ return view;
798
797
  }
799
798
 
800
799
  template<typename T, typename C, typename S, typename A>
800
+ template<bool inclusive>
801
801
  vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
802
802
  if (is_empty()) return vector_d<A>(allocator_);
803
803
  kll_helper::validate_values<T, C>(split_points, size);
@@ -808,9 +808,9 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
808
808
  const auto from_index = levels_[level];
809
809
  const auto to_index = levels_[level + 1]; // exclusive
810
810
  if ((level == 0) && !is_level_zero_sorted_) {
811
- increment_buckets_unsorted_level(from_index, to_index, weight, split_points, size, buckets.data());
811
+ increment_buckets_unsorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
812
812
  } else {
813
- increment_buckets_sorted_level(from_index, to_index, weight, split_points, size, buckets.data());
813
+ increment_buckets_sorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
814
814
  }
815
815
  level++;
816
816
  weight *= 2;
@@ -831,13 +831,14 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
831
831
  }
832
832
 
833
833
  template<typename T, typename C, typename S, typename A>
834
+ template<bool inclusive>
834
835
  void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
835
836
  const T* split_points, uint32_t size, double* buckets) const
836
837
  {
837
838
  for (uint32_t i = from_index; i < to_index; i++) {
838
839
  uint32_t j;
839
840
  for (j = 0; j < size; j++) {
840
- if (C()(items_[i], split_points[j])) {
841
+ if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
841
842
  break;
842
843
  }
843
844
  }
@@ -846,13 +847,14 @@ void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_inde
846
847
  }
847
848
 
848
849
  template<typename T, typename C, typename S, typename A>
850
+ template<bool inclusive>
849
851
  void kll_sketch<T, C, S, A>::increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
850
852
  const T* split_points, uint32_t size, double* buckets) const
851
853
  {
852
854
  uint32_t i = from_index;
853
855
  uint32_t j = 0;
854
856
  while ((i < to_index) && (j < size)) {
855
- if (C()(items_[i], split_points[j])) {
857
+ if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
856
858
  buckets[j] += weight; // this sample goes into this bucket
857
859
  i++; // move on to next sample and see whether it also goes into this bucket
858
860
  } else {
@@ -910,34 +912,9 @@ void kll_sketch<T, C, S, A>::merge_higher_levels(O&& other, uint64_t final_n) {
910
912
  }
911
913
 
912
914
  // this leaves items_ uninitialized (all objects moved out and destroyed)
913
- // this version copies objects from the incoming sketch
914
- template<typename T, typename C, typename S, typename A>
915
- void kll_sketch<T, C, S, A>::populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
916
- worklevels[0] = 0;
917
-
918
- // the level zero data from "other" was already inserted into "this"
919
- kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
920
- worklevels[1] = safe_level_size(0);
921
-
922
- for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
923
- const uint32_t self_pop = safe_level_size(lvl);
924
- const uint32_t other_pop = other.safe_level_size(lvl);
925
- worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
926
-
927
- if ((self_pop > 0) && (other_pop == 0)) {
928
- kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
929
- } else if ((self_pop == 0) && (other_pop > 0)) {
930
- kll_helper::copy_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl]);
931
- } else if ((self_pop > 0) && (other_pop > 0)) {
932
- kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
933
- }
934
- }
935
- }
936
-
937
- // this leaves items_ uninitialized (all objects moved out and destroyed)
938
- // this version moves objects from the incoming sketch
939
915
  template<typename T, typename C, typename S, typename A>
940
- void kll_sketch<T, C, S, A>::populate_work_arrays(kll_sketch&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
916
+ template<typename FwdSk>
917
+ void kll_sketch<T, C, S, A>::populate_work_arrays(FwdSk&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
941
918
  worklevels[0] = 0;
942
919
 
943
920
  // the level zero data from "other" was already inserted into "this"
@@ -952,7 +929,9 @@ void kll_sketch<T, C, S, A>::populate_work_arrays(kll_sketch&& other, T* workbuf
952
929
  if ((self_pop > 0) && (other_pop == 0)) {
953
930
  kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
954
931
  } else if ((self_pop == 0) && (other_pop > 0)) {
955
- kll_helper::move_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl], false);
932
+ for (auto i = other.levels_[lvl], j = worklevels[lvl]; i < other.levels_[lvl] + other_pop; ++i, ++j) {
933
+ new (&workbuf[j]) T(conditional_forward<FwdSk>(other.items_[i]));
934
+ }
956
935
  } else if ((self_pop > 0) && (other_pop > 0)) {
957
936
  kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
958
937
  }
@@ -1039,7 +1018,6 @@ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
1039
1018
  os << " Sorted : " << (is_level_zero_sorted_ ? "true" : "false") << std::endl;
1040
1019
  os << " Capacity items : " << items_size_ << std::endl;
1041
1020
  os << " Retained items : " << get_num_retained() << std::endl;
1042
- os << " Storage bytes : " << get_serialized_size_bytes() << std::endl;
1043
1021
  if (!is_empty()) {
1044
1022
  os << " Min value : " << *min_value_ << std::endl;
1045
1023
  os << " Max value : " << *max_value_ << std::endl;