datasketches 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -42,6 +42,4 @@ install(FILES
42
42
  include/req_sketch_impl.hpp
43
43
  include/req_compactor.hpp
44
44
  include/req_compactor_impl.hpp
45
- include/req_quantile_calculator.hpp
46
- include/req_quantile_calculator_impl.hpp
47
45
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
@@ -21,17 +21,12 @@
21
21
  #define REQ_COMMON_HPP_
22
22
 
23
23
  #include <random>
24
- #include <chrono>
25
24
 
26
25
  #include "serde.hpp"
27
26
  #include "common_defs.hpp"
28
27
 
29
28
  namespace datasketches {
30
29
 
31
- // TODO: have a common random bit with KLL
32
- static std::independent_bits_engine<std::mt19937, 1, unsigned>
33
- req_random_bit(static_cast<unsigned>(std::chrono::system_clock::now().time_since_epoch().count()));
34
-
35
30
  namespace req_constants {
36
31
  static const uint16_t MIN_K = 4;
37
32
  static const uint8_t INIT_NUM_SECTIONS = 3;
@@ -26,6 +26,7 @@
26
26
 
27
27
  #include "count_zeros.hpp"
28
28
  #include "conditional_forward.hpp"
29
+ #include "common_defs.hpp"
29
30
 
30
31
  #include <iomanip>
31
32
 
@@ -245,7 +246,7 @@ std::pair<uint32_t, uint32_t> req_compactor<T, C, A>::compact(req_compactor& nex
245
246
  if (compaction_range.second - compaction_range.first < 2) throw std::logic_error("compaction range error");
246
247
 
247
248
  if ((state_ & 1) == 1) { coin_ = !coin_; } // for odd flip coin;
248
- else { coin_ = req_random_bit(); } // random coin flip
249
+ else { coin_ = random_bit(); } // random coin flip
249
250
 
250
251
  const auto num = (compaction_range.second - compaction_range.first) / 2;
251
252
  next.ensure_space(num);
@@ -451,7 +452,7 @@ req_compactor<T, C, A>::req_compactor(bool hra, uint8_t lg_weight, bool sorted,
451
452
  allocator_(allocator),
452
453
  lg_weight_(lg_weight),
453
454
  hra_(hra),
454
- coin_(req_random_bit()),
455
+ coin_(random_bit()),
455
456
  sorted_(sorted),
456
457
  section_size_raw_(section_size_raw),
457
458
  section_size_(nearest_even(section_size_raw)),
@@ -22,22 +22,25 @@
22
22
 
23
23
  #include "req_common.hpp"
24
24
  #include "req_compactor.hpp"
25
- #include "req_quantile_calculator.hpp"
25
+ #include "quantile_sketch_sorted_view.hpp"
26
+
27
+ #include <stdexcept>
26
28
 
27
29
  namespace datasketches {
28
30
 
29
31
  template<
30
32
  typename T,
31
- typename Comparator = std::less<T>,
32
- typename SerDe = serde<T>,
33
+ typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
34
+ typename S = serde<T>, // deprecated, to be removed in the next major version
33
35
  typename Allocator = std::allocator<T>
34
36
  >
35
37
  class req_sketch {
36
38
  public:
39
+ using value_type = T;
40
+ using comparator = Comparator;
37
41
  using Compactor = req_compactor<T, Comparator, Allocator>;
38
42
  using AllocCompactor = typename std::allocator_traits<Allocator>::template rebind_alloc<Compactor>;
39
- using AllocDouble = typename std::allocator_traits<Allocator>::template rebind_alloc<double>;
40
- using vector_double = std::vector<double, AllocDouble>;
43
+ using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
41
44
 
42
45
  /**
43
46
  * Constructor
@@ -113,6 +116,12 @@ public:
113
116
  */
114
117
  const T& get_max_value() const;
115
118
 
119
+ /**
120
+ * Returns an instance of the comparator for this sketch.
121
+ * @return comparator
122
+ */
123
+ Comparator get_comparator() const;
124
+
116
125
  /**
117
126
  * Returns an approximation to the normalized (fractional) rank of the given item from 0 to 1 inclusive.
118
127
  * With the template parameter inclusive=true the weight of the given item is included into the rank.
@@ -123,7 +132,6 @@ public:
123
132
  * @param item to be ranked
124
133
  * @return an approximate rank of the given item
125
134
  */
126
-
127
135
  template<bool inclusive = false>
128
136
  double get_rank(const T& item) const;
129
137
 
@@ -135,9 +143,10 @@ public:
135
143
  *
136
144
  * @param split_points an array of <i>m</i> unique, monotonically increasing values
137
145
  * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
138
- * The definition of an "interval" is inclusive of the left split point (or minimum value) and
139
- * exclusive of the right split point, with the exception that the last interval will include
140
- * the maximum value.
146
+ * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
147
+ * split point, with the exception that the last interval will include the maximum value.
148
+ * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
149
+ * split point.
141
150
  * It is not necessary to include either the min or max values in these split points.
142
151
  *
143
152
  * @return an array of m+1 doubles each of which is an approximation
@@ -178,8 +187,9 @@ public:
178
187
  * @param rank the given normalized rank
179
188
  * @return approximate quantile given the normalized rank
180
189
  */
190
+ using quantile_return_type = typename quantile_sketch_sorted_view<T, Comparator, Allocator>::quantile_return_type;
181
191
  template<bool inclusive = false>
182
- const T& get_quantile(double rank) const;
192
+ quantile_return_type get_quantile(double rank) const;
183
193
 
184
194
  /**
185
195
  * Returns an array of quantiles that correspond to the given array of normalized ranks.
@@ -221,24 +231,28 @@ public:
221
231
  /**
222
232
  * Computes size needed to serialize the current state of the sketch.
223
233
  * This version is for fixed-size arithmetic types (integral and floating point).
234
+ * @param sd instance of a SerDe
224
235
  * @return size in bytes needed to serialize this sketch
225
236
  */
226
- template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
227
- size_t get_serialized_size_bytes() const;
237
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
238
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
228
239
 
229
240
  /**
230
241
  * Computes size needed to serialize the current state of the sketch.
231
242
  * This version is for all other types and can be expensive since every item needs to be looked at.
243
+ * @param sd instance of a SerDe
232
244
  * @return size in bytes needed to serialize this sketch
233
245
  */
234
- template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
235
- size_t get_serialized_size_bytes() const;
246
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
247
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
236
248
 
237
249
  /**
238
250
  * This method serializes the sketch into a given stream in a binary form
239
251
  * @param os output stream
252
+ * @param sd instance of a SerDe
240
253
  */
241
- void serialize(std::ostream& os) const;
254
+ template<typename SerDe = S>
255
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
242
256
 
243
257
  // This is a convenience alias for users
244
258
  // The type returned by the following serialize method
@@ -250,24 +264,53 @@ public:
250
264
  * It is a blank space of a given size.
251
265
  * This header is used in Datasketches PostgreSQL extension.
252
266
  * @param header_size_bytes space to reserve in front of the sketch
267
+ * @param sd instance of a SerDe
253
268
  */
254
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
269
+ template<typename SerDe = S>
270
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
255
271
 
256
272
  /**
257
273
  * This method deserializes a sketch from a given stream.
258
274
  * @param is input stream
275
+ * @param allocator instance of an Allocator
259
276
  * @return an instance of a sketch
277
+ *
278
+ * Deprecated, to be removed in the next major version
260
279
  */
261
280
  static req_sketch deserialize(std::istream& is, const Allocator& allocator = Allocator());
262
281
 
282
+ /**
283
+ * This method deserializes a sketch from a given stream.
284
+ * @param is input stream
285
+ * @param sd instance of a SerDe
286
+ * @param allocator instance of an Allocator
287
+ * @return an instance of a sketch
288
+ */
289
+ template<typename SerDe = S>
290
+ static req_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
291
+
263
292
  /**
264
293
  * This method deserializes a sketch from a given array of bytes.
265
294
  * @param bytes pointer to the array of bytes
266
295
  * @param size the size of the array
296
+ * @param allocator instance of an Allocator
267
297
  * @return an instance of a sketch
298
+ *
299
+ * Deprecated, to be removed in the next major version
268
300
  */
269
301
  static req_sketch deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
270
302
 
303
+ /**
304
+ * This method deserializes a sketch from a given array of bytes.
305
+ * @param bytes pointer to the array of bytes
306
+ * @param size the size of the array
307
+ * @param sd instance of a SerDe
308
+ * @param allocator instance of an Allocator
309
+ * @return an instance of a sketch
310
+ */
311
+ template<typename SerDe = S>
312
+ static req_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
313
+
271
314
  /**
272
315
  * Prints a summary of the sketch.
273
316
  * @param print_levels if true include information about levels
@@ -279,6 +322,9 @@ public:
279
322
  const_iterator begin() const;
280
323
  const_iterator end() const;
281
324
 
325
+ template<bool inclusive = false>
326
+ quantile_sketch_sorted_view<T, Comparator, Allocator> get_sorted_view(bool cumulative) const;
327
+
282
328
  private:
283
329
  Allocator allocator_;
284
330
  uint16_t k_;
@@ -310,13 +356,6 @@ private:
310
356
  static double get_rank_ub(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra);
311
357
  static bool is_exact_rank(uint16_t k, uint8_t num_levels, double rank, uint64_t n, bool hra);
312
358
 
313
- using QuantileCalculator = req_quantile_calculator<T, Comparator, Allocator>;
314
- using AllocCalc = typename std::allocator_traits<Allocator>::template rebind_alloc<QuantileCalculator>;
315
- class calculator_deleter;
316
- using QuantileCalculatorPtr = typename std::unique_ptr<QuantileCalculator, calculator_deleter>;
317
- template<bool inclusive>
318
- QuantileCalculatorPtr get_quantile_calculator() const;
319
-
320
359
  // for deserialization
321
360
  class item_deleter;
322
361
  req_sketch(uint16_t k, bool hra, uint64_t n, std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value, std::vector<Compactor, AllocCompactor>&& compactors);
@@ -196,6 +196,11 @@ const T& req_sketch<T, C, S, A>::get_max_value() const {
196
196
  return *max_value_;
197
197
  }
198
198
 
199
+ template<typename T, typename C, typename S, typename A>
200
+ C req_sketch<T, C, S, A>::get_comparator() const {
201
+ return C();
202
+ }
203
+
199
204
  template<typename T, typename C, typename S, typename A>
200
205
  template<bool inclusive>
201
206
  double req_sketch<T, C, S, A>::get_rank(const T& item) const {
@@ -210,6 +215,7 @@ template<typename T, typename C, typename S, typename A>
210
215
  template<bool inclusive>
211
216
  auto req_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
212
217
  auto buckets = get_CDF<inclusive>(split_points, size);
218
+ if (is_empty()) return buckets;
213
219
  for (uint32_t i = size; i > 0; --i) {
214
220
  buckets[i] -= buckets[i - 1];
215
221
  }
@@ -230,14 +236,15 @@ auto req_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const
230
236
 
231
237
  template<typename T, typename C, typename S, typename A>
232
238
  template<bool inclusive>
233
- const T& req_sketch<T, C, S, A>::get_quantile(double rank) const {
239
+ auto req_sketch<T, C, S, A>::get_quantile(double rank) const -> quantile_return_type {
234
240
  if (is_empty()) return get_invalid_value();
235
241
  if (rank == 0.0) return *min_value_;
236
242
  if (rank == 1.0) return *max_value_;
237
243
  if ((rank < 0.0) || (rank > 1.0)) {
238
244
  throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
239
245
  }
240
- return *(get_quantile_calculator<inclusive>()->get_quantile(rank));
246
+ // possible side-effect of sorting level zero
247
+ return get_sorted_view<inclusive>(true).get_quantile(rank);
241
248
  }
242
249
 
243
250
  template<typename T, typename C, typename S, typename A>
@@ -245,8 +252,11 @@ template<bool inclusive>
245
252
  std::vector<T, A> req_sketch<T, C, S, A>::get_quantiles(const double* ranks, uint32_t size) const {
246
253
  std::vector<T, A> quantiles(allocator_);
247
254
  if (is_empty()) return quantiles;
248
- QuantileCalculatorPtr quantile_calculator(nullptr, calculator_deleter(allocator_));
249
255
  quantiles.reserve(size);
256
+
257
+ // possible side-effect of sorting level zero
258
+ auto view = get_sorted_view<inclusive>(true);
259
+
250
260
  for (uint32_t i = 0; i < size; ++i) {
251
261
  const double rank = ranks[i];
252
262
  if ((rank < 0.0) || (rank > 1.0)) {
@@ -255,47 +265,26 @@ std::vector<T, A> req_sketch<T, C, S, A>::get_quantiles(const double* ranks, uin
255
265
  if (rank == 0.0) quantiles.push_back(*min_value_);
256
266
  else if (rank == 1.0) quantiles.push_back(*max_value_);
257
267
  else {
258
- if (!quantile_calculator) {
259
- // has side effect of sorting level zero if needed
260
- quantile_calculator = const_cast<req_sketch*>(this)->get_quantile_calculator<inclusive>();
261
- }
262
- quantiles.push_back(*(quantile_calculator->get_quantile(rank)));
268
+ quantiles.push_back(view.get_quantile(rank));
263
269
  }
264
270
  }
265
271
  return quantiles;
266
272
  }
267
273
 
268
- template<typename T, typename C, typename S, typename A>
269
- class req_sketch<T, C, S, A>::calculator_deleter {
270
- public:
271
- calculator_deleter(const AllocCalc& allocator): allocator_(allocator) {}
272
- void operator() (QuantileCalculator* ptr) {
273
- if (ptr != nullptr) {
274
- ptr->~QuantileCalculator();
275
- allocator_.deallocate(ptr, 1);
276
- }
277
- }
278
- private:
279
- AllocCalc allocator_;
280
- };
281
-
282
274
  template<typename T, typename C, typename S, typename A>
283
275
  template<bool inclusive>
284
- auto req_sketch<T, C, S, A>::get_quantile_calculator() const -> QuantileCalculatorPtr {
276
+ quantile_sketch_sorted_view<T, C, A> req_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const {
285
277
  if (!compactors_[0].is_sorted()) {
286
278
  const_cast<Compactor&>(compactors_[0]).sort(); // allow this side effect
287
279
  }
288
- AllocCalc ac(allocator_);
289
- QuantileCalculatorPtr quantile_calculator(
290
- new (ac.allocate(1)) req_quantile_calculator<T, C, A>(n_, ac),
291
- calculator_deleter(ac)
292
- );
280
+ quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);
293
281
 
294
282
  for (auto& compactor: compactors_) {
295
- quantile_calculator->add(compactor.begin(), compactor.end(), compactor.get_lg_weight());
283
+ view.add(compactor.begin(), compactor.end(), 1 << compactor.get_lg_weight());
296
284
  }
297
- quantile_calculator->template convert_to_cummulative<inclusive>();
298
- return quantile_calculator;
285
+
286
+ if (cumulative) view.template convert_to_cummulative<inclusive>();
287
+ return view;
299
288
  }
300
289
 
301
290
  template<typename T, typename C, typename S, typename A>
@@ -348,8 +337,8 @@ double req_sketch<T, C, S, A>::relative_rse_factor() {
348
337
 
349
338
  // implementation for fixed-size arithmetic types (integral and floating point)
350
339
  template<typename T, typename C, typename S, typename A>
351
- template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
352
- size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
340
+ template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
341
+ size_t req_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
353
342
  size_t size = PREAMBLE_SIZE_BYTES;
354
343
  if (is_empty()) return size;
355
344
  if (is_estimation_mode()) {
@@ -358,32 +347,33 @@ size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
358
347
  if (n_ == 1) {
359
348
  size += sizeof(TT);
360
349
  } else {
361
- for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(S());
350
+ for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(sd);
362
351
  }
363
352
  return size;
364
353
  }
365
354
 
366
355
  // implementation for all other types
367
356
  template<typename T, typename C, typename S, typename A>
368
- template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
369
- size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
357
+ template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
358
+ size_t req_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
370
359
  size_t size = PREAMBLE_SIZE_BYTES;
371
360
  if (is_empty()) return size;
372
361
  if (is_estimation_mode()) {
373
362
  size += sizeof(n_);
374
- size += S().size_of_item(*min_value_);
375
- size += S().size_of_item(*max_value_);
363
+ size += sd.size_of_item(*min_value_);
364
+ size += sd.size_of_item(*max_value_);
376
365
  }
377
366
  if (n_ == 1) {
378
- size += S().size_of_item(*compactors_[0].begin());
367
+ size += sd.size_of_item(*compactors_[0].begin());
379
368
  } else {
380
- for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(S());
369
+ for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(sd);
381
370
  }
382
371
  return size;
383
372
  }
384
373
 
385
374
  template<typename T, typename C, typename S, typename A>
386
- void req_sketch<T, C, S, A>::serialize(std::ostream& os) const {
375
+ template<typename SerDe>
376
+ void req_sketch<T, C, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
387
377
  const uint8_t preamble_ints = is_estimation_mode() ? 4 : 2;
388
378
  write(os, preamble_ints);
389
379
  const uint8_t serial_version = SERIAL_VERSION;
@@ -406,19 +396,20 @@ void req_sketch<T, C, S, A>::serialize(std::ostream& os) const {
406
396
  if (is_empty()) return;
407
397
  if (is_estimation_mode()) {
408
398
  write(os, n_);
409
- S().serialize(os, min_value_, 1);
410
- S().serialize(os, max_value_, 1);
399
+ sd.serialize(os, min_value_, 1);
400
+ sd.serialize(os, max_value_, 1);
411
401
  }
412
402
  if (raw_items) {
413
- S().serialize(os, compactors_[0].begin(), num_raw_items);
403
+ sd.serialize(os, compactors_[0].begin(), num_raw_items);
414
404
  } else {
415
- for (const auto& compactor: compactors_) compactor.serialize(os, S());
405
+ for (const auto& compactor: compactors_) compactor.serialize(os, sd);
416
406
  }
417
407
  }
418
408
 
419
409
  template<typename T, typename C, typename S, typename A>
420
- auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
421
- const size_t size = header_size_bytes + get_serialized_size_bytes();
410
+ template<typename SerDe>
411
+ auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
412
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
422
413
  vector_bytes bytes(size, 0, allocator_);
423
414
  uint8_t* ptr = bytes.data() + header_size_bytes;
424
415
  const uint8_t* end_ptr = ptr + size;
@@ -445,13 +436,13 @@ auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vect
445
436
  if (!is_empty()) {
446
437
  if (is_estimation_mode()) {
447
438
  ptr += copy_to_mem(n_, ptr);
448
- ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
449
- ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
439
+ ptr += sd.serialize(ptr, end_ptr - ptr, min_value_, 1);
440
+ ptr += sd.serialize(ptr, end_ptr - ptr, max_value_, 1);
450
441
  }
451
442
  if (raw_items) {
452
- ptr += S().serialize(ptr, end_ptr - ptr, compactors_[0].begin(), num_raw_items);
443
+ ptr += sd.serialize(ptr, end_ptr - ptr, compactors_[0].begin(), num_raw_items);
453
444
  } else {
454
- for (const auto& compactor: compactors_) ptr += compactor.serialize(ptr, end_ptr - ptr, S());
445
+ for (const auto& compactor: compactors_) ptr += compactor.serialize(ptr, end_ptr - ptr, sd);
455
446
  }
456
447
  }
457
448
  return bytes;
@@ -459,6 +450,12 @@ auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vect
459
450
 
460
451
  template<typename T, typename C, typename S, typename A>
461
452
  req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
453
+ return deserialize(is, S(), allocator);
454
+ }
455
+
456
+ template<typename T, typename C, typename S, typename A>
457
+ template<typename SerDe>
458
+ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
462
459
  const auto preamble_ints = read<uint8_t>(is);
463
460
  const auto serial_version = read<uint8_t>(is);
464
461
  const auto family_id = read<uint8_t>(is);
@@ -490,19 +487,19 @@ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, con
490
487
  uint64_t n = 1;
491
488
  if (num_levels > 1) {
492
489
  n = read<uint64_t>(is);
493
- S().deserialize(is, min_value_buffer.get(), 1);
490
+ sd.deserialize(is, min_value_buffer.get(), 1);
494
491
  // serde call did not throw, repackage with destructor
495
492
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
496
- S().deserialize(is, max_value_buffer.get(), 1);
493
+ sd.deserialize(is, max_value_buffer.get(), 1);
497
494
  // serde call did not throw, repackage with destructor
498
495
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
499
496
  }
500
497
 
501
498
  if (raw_items) {
502
- compactors.push_back(Compactor::deserialize(is, S(), allocator, is_level_0_sorted, k, num_raw_items, hra));
499
+ compactors.push_back(Compactor::deserialize(is, sd, allocator, is_level_0_sorted, k, num_raw_items, hra));
503
500
  } else {
504
501
  for (size_t i = 0; i < num_levels; ++i) {
505
- compactors.push_back(Compactor::deserialize(is, S(), allocator, i == 0 ? is_level_0_sorted : true, hra));
502
+ compactors.push_back(Compactor::deserialize(is, sd, allocator, i == 0 ? is_level_0_sorted : true, hra));
506
503
  }
507
504
  }
508
505
  if (num_levels == 1) {
@@ -529,6 +526,12 @@ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, con
529
526
 
530
527
  template<typename T, typename C, typename S, typename A>
531
528
  req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
529
+ return deserialize(bytes, size, S(), allocator);
530
+ }
531
+
532
+ template<typename T, typename C, typename S, typename A>
533
+ template<typename SerDe>
534
+ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
532
535
  ensure_minimum_memory(size, 8);
533
536
  const char* ptr = static_cast<const char*>(bytes);
534
537
  const char* end_ptr = static_cast<const char*>(bytes) + size;
@@ -571,21 +574,21 @@ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, si
571
574
  if (num_levels > 1) {
572
575
  ensure_minimum_memory(end_ptr - ptr, sizeof(n));
573
576
  ptr += copy_from_mem(ptr, n);
574
- ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
577
+ ptr += sd.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
575
578
  // serde call did not throw, repackage with destructor
576
579
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
577
- ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
580
+ ptr += sd.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
578
581
  // serde call did not throw, repackage with destructor
579
582
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
580
583
  }
581
584
 
582
585
  if (raw_items) {
583
- auto pair = Compactor::deserialize(ptr, end_ptr - ptr, S(), allocator, is_level_0_sorted, k, num_raw_items, hra);
586
+ auto pair = Compactor::deserialize(ptr, end_ptr - ptr, sd, allocator, is_level_0_sorted, k, num_raw_items, hra);
584
587
  compactors.push_back(std::move(pair.first));
585
588
  ptr += pair.second;
586
589
  } else {
587
590
  for (size_t i = 0; i < num_levels; ++i) {
588
- auto pair = Compactor::deserialize(ptr, end_ptr - ptr, S(), allocator, i == 0 ? is_level_0_sorted : true, hra);
591
+ auto pair = Compactor::deserialize(ptr, end_ptr - ptr, sd, allocator, i == 0 ? is_level_0_sorted : true, hra);
589
592
  compactors.push_back(std::move(pair.first));
590
593
  ptr += pair.second;
591
594
  }
@@ -24,6 +24,7 @@
24
24
  #include <fstream>
25
25
  #include <sstream>
26
26
  #include <limits>
27
+ #include <stdexcept>
27
28
 
28
29
  namespace datasketches {
29
30
 
@@ -51,6 +52,10 @@ TEST_CASE("req sketch: empty", "[req_sketch]") {
51
52
  REQUIRE(std::isnan(sketch.get_quantile(1)));
52
53
  const double ranks[3] {0, 0.5, 1};
53
54
  REQUIRE(sketch.get_quantiles(ranks, 3).size() == 0);
55
+
56
+ const float split_points[1] {0};
57
+ REQUIRE(sketch.get_CDF(split_points, 1).empty());
58
+ REQUIRE(sketch.get_PMF(split_points, 1).empty());
54
59
  }
55
60
 
56
61
  TEST_CASE("req sketch: single value, lra", "[req_sketch]") {
@@ -58,7 +58,11 @@ namespace var_opt_constants {
58
58
  const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
59
59
  }
60
60
 
61
- template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
61
+ template<
62
+ typename T,
63
+ typename S = serde<T>, // deprecated, to be removed in the next major version
64
+ typename A = std::allocator<T>
65
+ >
62
66
  class var_opt_sketch {
63
67
 
64
68
  public:
@@ -135,18 +139,20 @@ class var_opt_sketch {
135
139
  /**
136
140
  * Computes size needed to serialize the current state of the sketch.
137
141
  * This version is for fixed-size arithmetic types (integral and floating point).
142
+ * @param sd instance of a SerDe
138
143
  * @return size in bytes needed to serialize this sketch
139
144
  */
140
- template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
141
- inline size_t get_serialized_size_bytes() const;
145
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
146
+ inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
142
147
 
143
148
  /**
144
149
  * Computes size needed to serialize the current state of the sketch.
145
150
  * This version is for all other types and can be expensive since every item needs to be looked at.
151
+ * @param sd instance of a SerDe
146
152
  * @return size in bytes needed to serialize this sketch
147
153
  */
148
- template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
149
- inline size_t get_serialized_size_bytes() const;
154
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
155
+ inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
150
156
 
151
157
  // This is a convenience alias for users
152
158
  // The type returned by the following serialize method
@@ -158,30 +164,61 @@ class var_opt_sketch {
158
164
  * It is a blank space of a given size.
159
165
  * This header is used in Datasketches PostgreSQL extension.
160
166
  * @param header_size_bytes space to reserve in front of the sketch
167
+ * @param sd instance of a SerDe
161
168
  */
162
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
169
+ template<typename SerDe = S>
170
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
163
171
 
164
172
  /**
165
173
  * This method serializes the sketch into a given stream in a binary form
166
174
  * @param os output stream
175
+ * @param sd instance of a SerDe
167
176
  */
168
- void serialize(std::ostream& os) const;
177
+ template<typename SerDe = S>
178
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
169
179
 
170
180
  /**
171
181
  * This method deserializes a sketch from a given stream.
172
182
  * @param is input stream
183
+ * @param allocator instance of an Allocator
173
184
  * @return an instance of a sketch
185
+ *
186
+ * Deprecated, to be removed in the next major version
174
187
  */
175
188
  static var_opt_sketch deserialize(std::istream& is, const A& allocator = A());
176
189
 
190
+ /**
191
+ * This method deserializes a sketch from a given stream.
192
+ * @param is input stream
193
+ * @param sd instance of a SerDe
194
+ * @param allocator instance of an Allocator
195
+ * @return an instance of a sketch
196
+ */
197
+ template<typename SerDe = S>
198
+ static var_opt_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
199
+
177
200
  /**
178
201
  * This method deserializes a sketch from a given array of bytes.
179
202
  * @param bytes pointer to the array of bytes
180
203
  * @param size the size of the array
204
+ * @param allocator instance of an Allocator
181
205
  * @return an instance of a sketch
206
+ *
207
+ * Deprecated, to be removed in the next major version
182
208
  */
183
209
  static var_opt_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());
184
210
 
211
+ /**
212
+ * This method deserializes a sketch from a given array of bytes.
213
+ * @param bytes pointer to the array of bytes
214
+ * @param size the size of the array
215
+ * @param sd instance of a SerDe
216
+ * @param allocator instance of an Allocator
217
+ * @return an instance of a sketch
218
+ */
219
+ template<typename SerDe = S>
220
+ static var_opt_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
221
+
185
222
  /**
186
223
  * Prints a summary of the sketch.
187
224
  * @return the summary as a string