datasketches 0.2.3 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/README.md +8 -8
  4. data/ext/datasketches/kll_wrapper.cpp +7 -3
  5. data/ext/datasketches/theta_wrapper.cpp +20 -4
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +25 -5
  8. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  9. data/vendor/datasketches-cpp/NOTICE +6 -5
  10. data/vendor/datasketches-cpp/README.md +76 -9
  11. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  12. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  13. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  14. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  15. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  16. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  17. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  18. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  19. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  20. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +3 -1
  22. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  24. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  25. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  26. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  28. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  29. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  30. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +29 -11
  31. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  32. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  34. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  35. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  36. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  37. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  38. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  39. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  40. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  42. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  43. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  44. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  45. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  46. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  49. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  50. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +5 -2
  51. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +108 -41
  52. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +150 -132
  53. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +165 -31
  54. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  55. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  56. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  57. data/vendor/datasketches-cpp/python/README.md +13 -9
  58. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  59. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  60. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  61. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  62. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  63. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  64. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  65. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  66. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  67. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  68. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +656 -0
  69. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1373 -0
  70. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  71. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  72. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  73. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  74. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  75. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  76. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  77. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  78. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  79. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  80. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  81. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +975 -0
  82. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  83. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  84. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +6 -0
  85. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +30 -2
  86. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +73 -23
  87. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +95 -63
  88. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +74 -3
  89. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +44 -33
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  99. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  103. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  105. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  106. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  107. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  108. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  109. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  110. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  111. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +34 -9
  112. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  113. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  114. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  115. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  116. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  117. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  118. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  119. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  120. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  121. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  122. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  123. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  124. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  125. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  126. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  127. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  128. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  129. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  130. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  131. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  132. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  133. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  134. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  135. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  136. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  137. metadata +33 -12
  138. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  139. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  140. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  141. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  142. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -64,8 +64,8 @@ compactors_(other.compactors_),
64
64
  min_value_(nullptr),
65
65
  max_value_(nullptr)
66
66
  {
67
- if (other.min_value_ != nullptr) min_value_ = new (A().allocate(1)) T(*other.min_value_);
68
- if (other.max_value_ != nullptr) max_value_ = new (A().allocate(1)) T(*other.max_value_);
67
+ if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
68
+ if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
69
69
  }
70
70
 
71
71
  template<typename T, typename C, typename S, typename A>
@@ -113,6 +113,33 @@ req_sketch<T, C, S, A>& req_sketch<T, C, S, A>::operator=(req_sketch&& other) {
113
113
  return *this;
114
114
  }
115
115
 
116
+ template<typename T, typename C, typename S, typename A>
117
+ template<typename TT, typename CC, typename SS, typename AA>
118
+ req_sketch<T, C, S, A>::req_sketch(const req_sketch<TT, CC, SS, AA>& other, const A& allocator):
119
+ allocator_(allocator),
120
+ k_(other.k_),
121
+ hra_(other.hra_),
122
+ max_nom_size_(other.max_nom_size_),
123
+ num_retained_(other.num_retained_),
124
+ n_(other.n_),
125
+ compactors_(allocator),
126
+ min_value_(nullptr),
127
+ max_value_(nullptr)
128
+ {
129
+ static_assert(
130
+ std::is_constructible<T, TT>::value,
131
+ "Type converting constructor requires new type to be constructible from existing type"
132
+ );
133
+ compactors_.reserve(other.compactors_.size());
134
+ for (const auto& compactor: other.compactors_) {
135
+ compactors_.push_back(req_compactor<T, C, A>(compactor, allocator_));
136
+ }
137
+ if (!other.is_empty()) {
138
+ min_value_ = new (allocator_.allocate(1)) T(other.get_min_value());
139
+ max_value_ = new (allocator_.allocate(1)) T(other.get_max_value());
140
+ }
141
+ }
142
+
116
143
  template<typename T, typename C, typename S, typename A>
117
144
  uint16_t req_sketch<T, C, S, A>::get_k() const {
118
145
  return k_;
@@ -196,6 +223,11 @@ const T& req_sketch<T, C, S, A>::get_max_value() const {
196
223
  return *max_value_;
197
224
  }
198
225
 
226
+ template<typename T, typename C, typename S, typename A>
227
+ C req_sketch<T, C, S, A>::get_comparator() const {
228
+ return C();
229
+ }
230
+
199
231
  template<typename T, typename C, typename S, typename A>
200
232
  template<bool inclusive>
201
233
  double req_sketch<T, C, S, A>::get_rank(const T& item) const {
@@ -210,6 +242,7 @@ template<typename T, typename C, typename S, typename A>
210
242
  template<bool inclusive>
211
243
  auto req_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
212
244
  auto buckets = get_CDF<inclusive>(split_points, size);
245
+ if (is_empty()) return buckets;
213
246
  for (uint32_t i = size; i > 0; --i) {
214
247
  buckets[i] -= buckets[i - 1];
215
248
  }
@@ -230,14 +263,15 @@ auto req_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const
230
263
 
231
264
  template<typename T, typename C, typename S, typename A>
232
265
  template<bool inclusive>
233
- const T& req_sketch<T, C, S, A>::get_quantile(double rank) const {
266
+ auto req_sketch<T, C, S, A>::get_quantile(double rank) const -> quantile_return_type {
234
267
  if (is_empty()) return get_invalid_value();
235
268
  if (rank == 0.0) return *min_value_;
236
269
  if (rank == 1.0) return *max_value_;
237
270
  if ((rank < 0.0) || (rank > 1.0)) {
238
271
  throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
239
272
  }
240
- return *(get_quantile_calculator<inclusive>()->get_quantile(rank));
273
+ // possible side-effect of sorting level zero
274
+ return get_sorted_view<inclusive>(true).get_quantile(rank);
241
275
  }
242
276
 
243
277
  template<typename T, typename C, typename S, typename A>
@@ -245,8 +279,11 @@ template<bool inclusive>
245
279
  std::vector<T, A> req_sketch<T, C, S, A>::get_quantiles(const double* ranks, uint32_t size) const {
246
280
  std::vector<T, A> quantiles(allocator_);
247
281
  if (is_empty()) return quantiles;
248
- QuantileCalculatorPtr quantile_calculator(nullptr, calculator_deleter(allocator_));
249
282
  quantiles.reserve(size);
283
+
284
+ // possible side-effect of sorting level zero
285
+ auto view = get_sorted_view<inclusive>(true);
286
+
250
287
  for (uint32_t i = 0; i < size; ++i) {
251
288
  const double rank = ranks[i];
252
289
  if ((rank < 0.0) || (rank > 1.0)) {
@@ -255,47 +292,26 @@ std::vector<T, A> req_sketch<T, C, S, A>::get_quantiles(const double* ranks, uin
255
292
  if (rank == 0.0) quantiles.push_back(*min_value_);
256
293
  else if (rank == 1.0) quantiles.push_back(*max_value_);
257
294
  else {
258
- if (!quantile_calculator) {
259
- // has side effect of sorting level zero if needed
260
- quantile_calculator = const_cast<req_sketch*>(this)->get_quantile_calculator<inclusive>();
261
- }
262
- quantiles.push_back(*(quantile_calculator->get_quantile(rank)));
295
+ quantiles.push_back(view.get_quantile(rank));
263
296
  }
264
297
  }
265
298
  return quantiles;
266
299
  }
267
300
 
268
- template<typename T, typename C, typename S, typename A>
269
- class req_sketch<T, C, S, A>::calculator_deleter {
270
- public:
271
- calculator_deleter(const AllocCalc& allocator): allocator_(allocator) {}
272
- void operator() (QuantileCalculator* ptr) {
273
- if (ptr != nullptr) {
274
- ptr->~QuantileCalculator();
275
- allocator_.deallocate(ptr, 1);
276
- }
277
- }
278
- private:
279
- AllocCalc allocator_;
280
- };
281
-
282
301
  template<typename T, typename C, typename S, typename A>
283
302
  template<bool inclusive>
284
- auto req_sketch<T, C, S, A>::get_quantile_calculator() const -> QuantileCalculatorPtr {
303
+ quantile_sketch_sorted_view<T, C, A> req_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const {
285
304
  if (!compactors_[0].is_sorted()) {
286
305
  const_cast<Compactor&>(compactors_[0]).sort(); // allow this side effect
287
306
  }
288
- AllocCalc ac(allocator_);
289
- QuantileCalculatorPtr quantile_calculator(
290
- new (ac.allocate(1)) req_quantile_calculator<T, C, A>(n_, ac),
291
- calculator_deleter(ac)
292
- );
307
+ quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);
293
308
 
294
309
  for (auto& compactor: compactors_) {
295
- quantile_calculator->add(compactor.begin(), compactor.end(), compactor.get_lg_weight());
310
+ view.add(compactor.begin(), compactor.end(), 1 << compactor.get_lg_weight());
296
311
  }
297
- quantile_calculator->template convert_to_cummulative<inclusive>();
298
- return quantile_calculator;
312
+
313
+ if (cumulative) view.template convert_to_cummulative<inclusive>();
314
+ return view;
299
315
  }
300
316
 
301
317
  template<typename T, typename C, typename S, typename A>
@@ -348,8 +364,8 @@ double req_sketch<T, C, S, A>::relative_rse_factor() {
348
364
 
349
365
  // implementation for fixed-size arithmetic types (integral and floating point)
350
366
  template<typename T, typename C, typename S, typename A>
351
- template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
352
- size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
367
+ template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
368
+ size_t req_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
353
369
  size_t size = PREAMBLE_SIZE_BYTES;
354
370
  if (is_empty()) return size;
355
371
  if (is_estimation_mode()) {
@@ -358,32 +374,33 @@ size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
358
374
  if (n_ == 1) {
359
375
  size += sizeof(TT);
360
376
  } else {
361
- for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(S());
377
+ for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(sd);
362
378
  }
363
379
  return size;
364
380
  }
365
381
 
366
382
  // implementation for all other types
367
383
  template<typename T, typename C, typename S, typename A>
368
- template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
369
- size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
384
+ template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
385
+ size_t req_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
370
386
  size_t size = PREAMBLE_SIZE_BYTES;
371
387
  if (is_empty()) return size;
372
388
  if (is_estimation_mode()) {
373
389
  size += sizeof(n_);
374
- size += S().size_of_item(*min_value_);
375
- size += S().size_of_item(*max_value_);
390
+ size += sd.size_of_item(*min_value_);
391
+ size += sd.size_of_item(*max_value_);
376
392
  }
377
393
  if (n_ == 1) {
378
- size += S().size_of_item(*compactors_[0].begin());
394
+ size += sd.size_of_item(*compactors_[0].begin());
379
395
  } else {
380
- for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(S());
396
+ for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(sd);
381
397
  }
382
398
  return size;
383
399
  }
384
400
 
385
401
  template<typename T, typename C, typename S, typename A>
386
- void req_sketch<T, C, S, A>::serialize(std::ostream& os) const {
402
+ template<typename SerDe>
403
+ void req_sketch<T, C, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
387
404
  const uint8_t preamble_ints = is_estimation_mode() ? 4 : 2;
388
405
  write(os, preamble_ints);
389
406
  const uint8_t serial_version = SERIAL_VERSION;
@@ -406,19 +423,20 @@ void req_sketch<T, C, S, A>::serialize(std::ostream& os) const {
406
423
  if (is_empty()) return;
407
424
  if (is_estimation_mode()) {
408
425
  write(os, n_);
409
- S().serialize(os, min_value_, 1);
410
- S().serialize(os, max_value_, 1);
426
+ sd.serialize(os, min_value_, 1);
427
+ sd.serialize(os, max_value_, 1);
411
428
  }
412
429
  if (raw_items) {
413
- S().serialize(os, compactors_[0].begin(), num_raw_items);
430
+ sd.serialize(os, compactors_[0].begin(), num_raw_items);
414
431
  } else {
415
- for (const auto& compactor: compactors_) compactor.serialize(os, S());
432
+ for (const auto& compactor: compactors_) compactor.serialize(os, sd);
416
433
  }
417
434
  }
418
435
 
419
436
  template<typename T, typename C, typename S, typename A>
420
- auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
421
- const size_t size = header_size_bytes + get_serialized_size_bytes();
437
+ template<typename SerDe>
438
+ auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
439
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
422
440
  vector_bytes bytes(size, 0, allocator_);
423
441
  uint8_t* ptr = bytes.data() + header_size_bytes;
424
442
  const uint8_t* end_ptr = ptr + size;
@@ -445,13 +463,13 @@ auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vect
445
463
  if (!is_empty()) {
446
464
  if (is_estimation_mode()) {
447
465
  ptr += copy_to_mem(n_, ptr);
448
- ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
449
- ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
466
+ ptr += sd.serialize(ptr, end_ptr - ptr, min_value_, 1);
467
+ ptr += sd.serialize(ptr, end_ptr - ptr, max_value_, 1);
450
468
  }
451
469
  if (raw_items) {
452
- ptr += S().serialize(ptr, end_ptr - ptr, compactors_[0].begin(), num_raw_items);
470
+ ptr += sd.serialize(ptr, end_ptr - ptr, compactors_[0].begin(), num_raw_items);
453
471
  } else {
454
- for (const auto& compactor: compactors_) ptr += compactor.serialize(ptr, end_ptr - ptr, S());
472
+ for (const auto& compactor: compactors_) ptr += compactor.serialize(ptr, end_ptr - ptr, sd);
455
473
  }
456
474
  }
457
475
  return bytes;
@@ -459,6 +477,12 @@ auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vect
459
477
 
460
478
  template<typename T, typename C, typename S, typename A>
461
479
  req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
480
+ return deserialize(is, S(), allocator);
481
+ }
482
+
483
+ template<typename T, typename C, typename S, typename A>
484
+ template<typename SerDe>
485
+ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
462
486
  const auto preamble_ints = read<uint8_t>(is);
463
487
  const auto serial_version = read<uint8_t>(is);
464
488
  const auto family_id = read<uint8_t>(is);
@@ -490,19 +514,19 @@ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, con
490
514
  uint64_t n = 1;
491
515
  if (num_levels > 1) {
492
516
  n = read<uint64_t>(is);
493
- S().deserialize(is, min_value_buffer.get(), 1);
517
+ sd.deserialize(is, min_value_buffer.get(), 1);
494
518
  // serde call did not throw, repackage with destrtuctor
495
519
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
496
- S().deserialize(is, max_value_buffer.get(), 1);
520
+ sd.deserialize(is, max_value_buffer.get(), 1);
497
521
  // serde call did not throw, repackage with destrtuctor
498
522
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
499
523
  }
500
524
 
501
525
  if (raw_items) {
502
- compactors.push_back(Compactor::deserialize(is, S(), allocator, is_level_0_sorted, k, num_raw_items, hra));
526
+ compactors.push_back(Compactor::deserialize(is, sd, allocator, is_level_0_sorted, k, num_raw_items, hra));
503
527
  } else {
504
528
  for (size_t i = 0; i < num_levels; ++i) {
505
- compactors.push_back(Compactor::deserialize(is, S(), allocator, i == 0 ? is_level_0_sorted : true, hra));
529
+ compactors.push_back(Compactor::deserialize(is, sd, allocator, i == 0 ? is_level_0_sorted : true, hra));
506
530
  }
507
531
  }
508
532
  if (num_levels == 1) {
@@ -529,6 +553,12 @@ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, con
529
553
 
530
554
  template<typename T, typename C, typename S, typename A>
531
555
  req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
556
+ return deserialize(bytes, size, S(), allocator);
557
+ }
558
+
559
+ template<typename T, typename C, typename S, typename A>
560
+ template<typename SerDe>
561
+ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
532
562
  ensure_minimum_memory(size, 8);
533
563
  const char* ptr = static_cast<const char*>(bytes);
534
564
  const char* end_ptr = static_cast<const char*>(bytes) + size;
@@ -571,21 +601,21 @@ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, si
571
601
  if (num_levels > 1) {
572
602
  ensure_minimum_memory(end_ptr - ptr, sizeof(n));
573
603
  ptr += copy_from_mem(ptr, n);
574
- ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
604
+ ptr += sd.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
575
605
  // serde call did not throw, repackage with destrtuctor
576
606
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
577
- ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
607
+ ptr += sd.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
578
608
  // serde call did not throw, repackage with destrtuctor
579
609
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
580
610
  }
581
611
 
582
612
  if (raw_items) {
583
- auto pair = Compactor::deserialize(ptr, end_ptr - ptr, S(), allocator, is_level_0_sorted, k, num_raw_items, hra);
613
+ auto pair = Compactor::deserialize(ptr, end_ptr - ptr, sd, allocator, is_level_0_sorted, k, num_raw_items, hra);
584
614
  compactors.push_back(std::move(pair.first));
585
615
  ptr += pair.second;
586
616
  } else {
587
617
  for (size_t i = 0; i < num_levels; ++i) {
588
- auto pair = Compactor::deserialize(ptr, end_ptr - ptr, S(), allocator, i == 0 ? is_level_0_sorted : true, hra);
618
+ auto pair = Compactor::deserialize(ptr, end_ptr - ptr, sd, allocator, i == 0 ? is_level_0_sorted : true, hra);
589
619
  compactors.push_back(std::move(pair.first));
590
620
  ptr += pair.second;
591
621
  }
@@ -653,7 +683,9 @@ void req_sketch<T, C, S, A>::compress() {
653
683
 
654
684
  template<typename T, typename C, typename S, typename A>
655
685
  string<A> req_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
656
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
686
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
687
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
688
+ std::ostringstream os;
657
689
  os << "### REQ sketch summary:" << std::endl;
658
690
  os << " K : " << k_ << std::endl;
659
691
  os << " High Rank Acc : " << (hra_ ? "true" : "false") << std::endl;
@@ -693,7 +725,7 @@ string<A> req_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
693
725
  }
694
726
  os << "### End sketch data" << std::endl;
695
727
  }
696
- return os.str();
728
+ return string<A>(os.str().c_str(), allocator_);
697
729
  }
698
730
 
699
731
  template<typename T, typename C, typename S, typename A>
@@ -24,6 +24,7 @@
24
24
  #include <fstream>
25
25
  #include <sstream>
26
26
  #include <limits>
27
+ #include <stdexcept>
27
28
 
28
29
  namespace datasketches {
29
30
 
@@ -34,7 +35,7 @@ const std::string input_path = "test/";
34
35
  #endif
35
36
 
36
37
  TEST_CASE("req sketch: empty", "[req_sketch]") {
37
- std::cout << "sizeof(req_float_sketch)=" << sizeof(req_sketch<float>) << "\n";
38
+ //std::cout << "sizeof(req_float_sketch)=" << sizeof(req_sketch<float>) << "\n";
38
39
  req_sketch<float> sketch(12);
39
40
  REQUIRE(sketch.get_k() == 12);
40
41
  REQUIRE(sketch.is_HRA());
@@ -51,6 +52,10 @@ TEST_CASE("req sketch: empty", "[req_sketch]") {
51
52
  REQUIRE(std::isnan(sketch.get_quantile(1)));
52
53
  const double ranks[3] {0, 0.5, 1};
53
54
  REQUIRE(sketch.get_quantiles(ranks, 3).size() == 0);
55
+
56
+ const float split_points[1] {0};
57
+ REQUIRE(sketch.get_CDF(split_points, 1).empty());
58
+ REQUIRE(sketch.get_PMF(split_points, 1).empty());
54
59
  }
55
60
 
56
61
  TEST_CASE("req sketch: single value, lra", "[req_sketch]") {
@@ -240,7 +245,7 @@ TEST_CASE("req sketch: byte serialize-deserialize single item", "[req_sketch]")
240
245
  auto bytes = sketch.serialize();
241
246
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
242
247
  auto sketch2 = req_sketch<float>::deserialize(bytes.data(), bytes.size());
243
- std::cout << sketch2.to_string(true);
248
+ //std::cout << sketch2.to_string(true);
244
249
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
245
250
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
246
251
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
@@ -277,7 +282,7 @@ TEST_CASE("req sketch: byte serialize-deserialize exact mode", "[req_sketch]") {
277
282
  auto bytes = sketch.serialize();
278
283
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
279
284
  auto sketch2 = req_sketch<float>::deserialize(bytes.data(), bytes.size());
280
- std::cout << sketch2.to_string(true);
285
+ //std::cout << sketch2.to_string(true);
281
286
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
282
287
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
283
288
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
@@ -480,6 +485,72 @@ TEST_CASE("req sketch: merge incompatible HRA and LRA", "[req_sketch]") {
480
485
  REQUIRE_THROWS_AS(sketch1.merge(sketch2), std::invalid_argument);
481
486
  }
482
487
 
488
+ TEST_CASE("req sketch: type conversion - empty", "[req_sketch]") {
489
+ req_sketch<double> req_double(12);
490
+ req_sketch<float> req_float(req_double);
491
+ REQUIRE(req_float.is_empty());
492
+ REQUIRE(req_float.get_k() == req_double.get_k());
493
+ REQUIRE(req_float.get_n() == 0);
494
+ REQUIRE(req_float.get_num_retained() == 0);
495
+ }
496
+
497
+ TEST_CASE("req sketch: type conversion - several levels", "[req_sketch]") {
498
+ req_sketch<double> req_double(12);
499
+ for (int i = 0; i < 1000; ++i) req_double.update(static_cast<double>(i));
500
+ req_sketch<float> req_float(req_double);
501
+ REQUIRE(!req_float.is_empty());
502
+ REQUIRE(req_float.get_k() == req_double.get_k());
503
+ REQUIRE(req_float.get_n() == req_double.get_n());
504
+ REQUIRE(req_float.get_num_retained() == req_double.get_num_retained());
505
+
506
+ auto sv_float = req_float.get_sorted_view(false);
507
+ auto sv_double = req_double.get_sorted_view(false);
508
+ auto sv_float_it = sv_float.begin();
509
+ auto sv_double_it = sv_double.begin();
510
+ while (sv_float_it != sv_float.end()) {
511
+ REQUIRE(sv_double_it != sv_double.end());
512
+ auto float_pair = *sv_float_it;
513
+ auto double_pair = *sv_double_it;
514
+ REQUIRE(float_pair.first == Approx(double_pair.first).margin(0.01));
515
+ REQUIRE(float_pair.second == double_pair.second);
516
+ ++sv_float_it;
517
+ ++sv_double_it;
518
+ }
519
+ REQUIRE(sv_double_it == sv_double.end());
520
+ }
521
+
522
+ class A {
523
+ int val;
524
+ public:
525
+ A(int val): val(val) {}
526
+ int get_val() const { return val; }
527
+ };
528
+
529
+ struct less_A {
530
+ bool operator()(const A& a1, const A& a2) const { return a1.get_val() < a2.get_val(); }
531
+ };
532
+
533
+ class B {
534
+ int val;
535
+ public:
536
+ explicit B(const A& a): val(a.get_val()) {}
537
+ int get_val() const { return val; }
538
+ };
539
+
540
+ struct less_B {
541
+ bool operator()(const B& b1, const B& b2) const { return b1.get_val() < b2.get_val(); }
542
+ };
543
+
544
+ TEST_CASE("req sketch: type conversion - custom types") {
545
+ req_sketch<A, less_A> sa(4);
546
+ sa.update(1);
547
+ sa.update(2);
548
+ sa.update(3);
549
+
550
+ req_sketch<B, less_B> sb(sa);
551
+ REQUIRE(sb.get_n() == 3);
552
+ }
553
+
483
554
  //TEST_CASE("for manual comparison with Java") {
484
555
  // req_sketch<float> sketch(12, false);
485
556
  // for (size_t i = 0; i < 100000; ++i) sketch.update(i);
@@ -32,17 +32,13 @@ target_include_directories(sampling
32
32
  target_link_libraries(sampling INTERFACE common)
33
33
  target_compile_features(sampling INTERFACE cxx_std_11)
34
34
 
35
- set(sampling_HEADERS "include/var_opt_sketch.hpp;include/var_opt_sketch_impl.hpp")
36
-
37
35
  install(TARGETS sampling
38
36
  EXPORT ${PROJECT_NAME}
39
37
  )
40
38
 
41
- install(FILES ${sampling_HEADERS}
39
+ install(FILES
40
+ include/var_opt_sketch.hpp
41
+ include/var_opt_sketch_impl.hpp
42
+ include/var_opt_union.hpp
43
+ include/var_opt_union_impl.hpp
42
44
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
43
-
44
- target_sources(sampling
45
- INTERFACE
46
- ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch.hpp
47
- ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch_impl.hpp
48
- )
@@ -58,7 +58,11 @@ namespace var_opt_constants {
58
58
  const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
59
59
  }
60
60
 
61
- template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
61
+ template<
62
+ typename T,
63
+ typename S = serde<T>, // deprecated, to be removed in the next major version
64
+ typename A = std::allocator<T>
65
+ >
62
66
  class var_opt_sketch {
63
67
 
64
68
  public:
@@ -135,18 +139,20 @@ class var_opt_sketch {
135
139
  /**
136
140
  * Computes size needed to serialize the current state of the sketch.
137
141
  * This version is for fixed-size arithmetic types (integral and floating point).
142
+ * @param instance of a SerDe
138
143
  * @return size in bytes needed to serialize this sketch
139
144
  */
140
- template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
141
- inline size_t get_serialized_size_bytes() const;
145
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
146
+ inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
142
147
 
143
148
  /**
144
149
  * Computes size needed to serialize the current state of the sketch.
145
150
  * This version is for all other types and can be expensive since every item needs to be looked at.
151
+ * @param instance of a SerDe
146
152
  * @return size in bytes needed to serialize this sketch
147
153
  */
148
- template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
149
- inline size_t get_serialized_size_bytes() const;
154
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
155
+ inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
150
156
 
151
157
  // This is a convenience alias for users
152
158
  // The type returned by the following serialize method
@@ -158,30 +164,61 @@ class var_opt_sketch {
158
164
  * It is a blank space of a given size.
159
165
  * This header is used in Datasketches PostgreSQL extension.
160
166
  * @param header_size_bytes space to reserve in front of the sketch
167
+ * @param instance of a SerDe
161
168
  */
162
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
169
+ template<typename SerDe = S>
170
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
163
171
 
164
172
  /**
165
173
  * This method serializes the sketch into a given stream in a binary form
166
174
  * @param os output stream
175
+ * @param instance of a SerDe
167
176
  */
168
- void serialize(std::ostream& os) const;
177
+ template<typename SerDe = S>
178
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
169
179
 
170
180
  /**
171
181
  * This method deserializes a sketch from a given stream.
172
182
  * @param is input stream
183
+ * @param instance of an Allocator
173
184
  * @return an instance of a sketch
185
+ *
186
+ * Deprecated, to be removed in the next major version
174
187
  */
175
188
  static var_opt_sketch deserialize(std::istream& is, const A& allocator = A());
176
189
 
190
+ /**
191
+ * This method deserializes a sketch from a given stream.
192
+ * @param is input stream
193
+ * @param instance of a SerDe
194
+ * @param instance of an Allocator
195
+ * @return an instance of a sketch
196
+ */
197
+ template<typename SerDe = S>
198
+ static var_opt_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
199
+
177
200
  /**
178
201
  * This method deserializes a sketch from a given array of bytes.
179
202
  * @param bytes pointer to the array of bytes
180
203
  * @param size the size of the array
204
+ * @param instance of an Allocator
181
205
  * @return an instance of a sketch
206
+ *
207
+ * Deprecated, to be removed in the next major version
182
208
  */
183
209
  static var_opt_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());
184
210
 
211
+ /**
212
+ * This method deserializes a sketch from a given array of bytes.
213
+ * @param bytes pointer to the array of bytes
214
+ * @param size the size of the array
215
+ * @param instance of a SerDe
216
+ * @param instance of an Allocator
217
+ * @return an instance of a sketch
218
+ */
219
+ template<typename SerDe = S>
220
+ static var_opt_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
221
+
185
222
  /**
186
223
  * Prints a summary of the sketch.
187
224
  * @return the summary as a string