datasketches 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (106) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -42,6 +42,4 @@ install(FILES
42
42
  include/req_sketch_impl.hpp
43
43
  include/req_compactor.hpp
44
44
  include/req_compactor_impl.hpp
45
- include/req_quantile_calculator.hpp
46
- include/req_quantile_calculator_impl.hpp
47
45
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
@@ -21,17 +21,12 @@
21
21
  #define REQ_COMMON_HPP_
22
22
 
23
23
  #include <random>
24
- #include <chrono>
25
24
 
26
25
  #include "serde.hpp"
27
26
  #include "common_defs.hpp"
28
27
 
29
28
  namespace datasketches {
30
29
 
31
- // TODO: have a common random bit with KLL
32
- static std::independent_bits_engine<std::mt19937, 1, unsigned>
33
- req_random_bit(static_cast<unsigned>(std::chrono::system_clock::now().time_since_epoch().count()));
34
-
35
30
  namespace req_constants {
36
31
  static const uint16_t MIN_K = 4;
37
32
  static const uint8_t INIT_NUM_SECTIONS = 3;
@@ -26,6 +26,7 @@
26
26
 
27
27
  #include "count_zeros.hpp"
28
28
  #include "conditional_forward.hpp"
29
+ #include "common_defs.hpp"
29
30
 
30
31
  #include <iomanip>
31
32
 
@@ -245,7 +246,7 @@ std::pair<uint32_t, uint32_t> req_compactor<T, C, A>::compact(req_compactor& nex
245
246
  if (compaction_range.second - compaction_range.first < 2) throw std::logic_error("compaction range error");
246
247
 
247
248
  if ((state_ & 1) == 1) { coin_ = !coin_; } // for odd flip coin;
248
- else { coin_ = req_random_bit(); } // random coin flip
249
+ else { coin_ = random_bit(); } // random coin flip
249
250
 
250
251
  const auto num = (compaction_range.second - compaction_range.first) / 2;
251
252
  next.ensure_space(num);
@@ -451,7 +452,7 @@ req_compactor<T, C, A>::req_compactor(bool hra, uint8_t lg_weight, bool sorted,
451
452
  allocator_(allocator),
452
453
  lg_weight_(lg_weight),
453
454
  hra_(hra),
454
- coin_(req_random_bit()),
455
+ coin_(random_bit()),
455
456
  sorted_(sorted),
456
457
  section_size_raw_(section_size_raw),
457
458
  section_size_(nearest_even(section_size_raw)),
@@ -22,22 +22,25 @@
22
22
 
23
23
  #include "req_common.hpp"
24
24
  #include "req_compactor.hpp"
25
- #include "req_quantile_calculator.hpp"
25
+ #include "quantile_sketch_sorted_view.hpp"
26
+
27
+ #include <stdexcept>
26
28
 
27
29
  namespace datasketches {
28
30
 
29
31
  template<
30
32
  typename T,
31
- typename Comparator = std::less<T>,
32
- typename SerDe = serde<T>,
33
+ typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
34
+ typename S = serde<T>, // deprecated, to be removed in the next major version
33
35
  typename Allocator = std::allocator<T>
34
36
  >
35
37
  class req_sketch {
36
38
  public:
39
+ using value_type = T;
40
+ using comparator = Comparator;
37
41
  using Compactor = req_compactor<T, Comparator, Allocator>;
38
42
  using AllocCompactor = typename std::allocator_traits<Allocator>::template rebind_alloc<Compactor>;
39
- using AllocDouble = typename std::allocator_traits<Allocator>::template rebind_alloc<double>;
40
- using vector_double = std::vector<double, AllocDouble>;
43
+ using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
41
44
 
42
45
  /**
43
46
  * Constructor
@@ -113,6 +116,12 @@ public:
113
116
  */
114
117
  const T& get_max_value() const;
115
118
 
119
+ /**
120
+ * Returns an instance of the comparator for this sketch.
121
+ * @return comparator
122
+ */
123
+ Comparator get_comparator() const;
124
+
116
125
  /**
117
126
  * Returns an approximation to the normalized (fractional) rank of the given item from 0 to 1 inclusive.
118
127
  * With the template parameter inclusive=true the weight of the given item is included into the rank.
@@ -123,7 +132,6 @@ public:
123
132
  * @param item to be ranked
124
133
  * @return an approximate rank of the given item
125
134
  */
126
-
127
135
  template<bool inclusive = false>
128
136
  double get_rank(const T& item) const;
129
137
 
@@ -135,9 +143,10 @@ public:
135
143
  *
136
144
  * @param split_points an array of <i>m</i> unique, monotonically increasing values
137
145
  * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
138
- * The definition of an "interval" is inclusive of the left split point (or minimum value) and
139
- * exclusive of the right split point, with the exception that the last interval will include
140
- * the maximum value.
146
+ * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
147
+ * split point, with the exception that the last interval will include the maximum value.
148
+ * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
149
+ * split point.
141
150
  * It is not necessary to include either the min or max values in these split points.
142
151
  *
143
152
  * @return an array of m+1 doubles each of which is an approximation
@@ -178,8 +187,9 @@ public:
178
187
  * @param rank the given normalized rank
179
188
  * @return approximate quantile given the normalized rank
180
189
  */
190
+ using quantile_return_type = typename quantile_sketch_sorted_view<T, Comparator, Allocator>::quantile_return_type;
181
191
  template<bool inclusive = false>
182
- const T& get_quantile(double rank) const;
192
+ quantile_return_type get_quantile(double rank) const;
183
193
 
184
194
  /**
185
195
  * Returns an array of quantiles that correspond to the given array of normalized ranks.
@@ -221,24 +231,28 @@ public:
221
231
  /**
222
232
  * Computes size needed to serialize the current state of the sketch.
223
233
  * This version is for fixed-size arithmetic types (integral and floating point).
234
+ * @param sd instance of a SerDe
224
235
  * @return size in bytes needed to serialize this sketch
225
236
  */
226
- template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
227
- size_t get_serialized_size_bytes() const;
237
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
238
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
228
239
 
229
240
  /**
230
241
  * Computes size needed to serialize the current state of the sketch.
231
242
  * This version is for all other types and can be expensive since every item needs to be looked at.
243
+ * @param sd instance of a SerDe
232
244
  * @return size in bytes needed to serialize this sketch
233
245
  */
234
- template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
235
- size_t get_serialized_size_bytes() const;
246
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
247
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
236
248
 
237
249
  /**
238
250
  * This method serializes the sketch into a given stream in a binary form
239
251
  * @param os output stream
252
+ * @param sd instance of a SerDe
240
253
  */
241
- void serialize(std::ostream& os) const;
254
+ template<typename SerDe = S>
255
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
242
256
 
243
257
  // This is a convenience alias for users
244
258
  // The type returned by the following serialize method
@@ -250,24 +264,53 @@ public:
250
264
  * It is a blank space of a given size.
251
265
  * This header is used in Datasketches PostgreSQL extension.
252
266
  * @param header_size_bytes space to reserve in front of the sketch
267
+ * @param sd instance of a SerDe
253
268
  */
254
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
269
+ template<typename SerDe = S>
270
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
255
271
 
256
272
  /**
257
273
  * This method deserializes a sketch from a given stream.
258
274
  * @param is input stream
275
+ * @param allocator instance of an Allocator
259
276
  * @return an instance of a sketch
277
+ *
278
+ * Deprecated, to be removed in the next major version
260
279
  */
261
280
  static req_sketch deserialize(std::istream& is, const Allocator& allocator = Allocator());
262
281
 
282
+ /**
283
+ * This method deserializes a sketch from a given stream.
284
+ * @param is input stream
285
+ * @param sd instance of a SerDe
286
+ * @param allocator instance of an Allocator
287
+ * @return an instance of a sketch
288
+ */
289
+ template<typename SerDe = S>
290
+ static req_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
291
+
263
292
  /**
264
293
  * This method deserializes a sketch from a given array of bytes.
265
294
  * @param bytes pointer to the array of bytes
266
295
  * @param size the size of the array
296
+ * @param allocator instance of an Allocator
267
297
  * @return an instance of a sketch
298
+ *
299
+ * Deprecated, to be removed in the next major version
268
300
  */
269
301
  static req_sketch deserialize(const void* bytes, size_t size, const Allocator& allocator = Allocator());
270
302
 
303
+ /**
304
+ * This method deserializes a sketch from a given array of bytes.
305
+ * @param bytes pointer to the array of bytes
306
+ * @param size the size of the array
307
+ * @param sd instance of a SerDe
308
+ * @param allocator instance of an Allocator
309
+ * @return an instance of a sketch
310
+ */
311
+ template<typename SerDe = S>
312
+ static req_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const Allocator& allocator = Allocator());
313
+
271
314
  /**
272
315
  * Prints a summary of the sketch.
273
316
  * @param print_levels if true include information about levels
@@ -279,6 +322,9 @@ public:
279
322
  const_iterator begin() const;
280
323
  const_iterator end() const;
281
324
 
325
+ template<bool inclusive = false>
326
+ quantile_sketch_sorted_view<T, Comparator, Allocator> get_sorted_view(bool cumulative) const;
327
+
282
328
  private:
283
329
  Allocator allocator_;
284
330
  uint16_t k_;
@@ -310,13 +356,6 @@ private:
310
356
  static double get_rank_ub(uint16_t k, uint8_t num_levels, double rank, uint8_t num_std_dev, uint64_t n, bool hra);
311
357
  static bool is_exact_rank(uint16_t k, uint8_t num_levels, double rank, uint64_t n, bool hra);
312
358
 
313
- using QuantileCalculator = req_quantile_calculator<T, Comparator, Allocator>;
314
- using AllocCalc = typename std::allocator_traits<Allocator>::template rebind_alloc<QuantileCalculator>;
315
- class calculator_deleter;
316
- using QuantileCalculatorPtr = typename std::unique_ptr<QuantileCalculator, calculator_deleter>;
317
- template<bool inclusive>
318
- QuantileCalculatorPtr get_quantile_calculator() const;
319
-
320
359
  // for deserialization
321
360
  class item_deleter;
322
361
  req_sketch(uint16_t k, bool hra, uint64_t n, std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value, std::vector<Compactor, AllocCompactor>&& compactors);
@@ -196,6 +196,11 @@ const T& req_sketch<T, C, S, A>::get_max_value() const {
196
196
  return *max_value_;
197
197
  }
198
198
 
199
+ template<typename T, typename C, typename S, typename A>
200
+ C req_sketch<T, C, S, A>::get_comparator() const {
201
+ return C();
202
+ }
203
+
199
204
  template<typename T, typename C, typename S, typename A>
200
205
  template<bool inclusive>
201
206
  double req_sketch<T, C, S, A>::get_rank(const T& item) const {
@@ -210,6 +215,7 @@ template<typename T, typename C, typename S, typename A>
210
215
  template<bool inclusive>
211
216
  auto req_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
212
217
  auto buckets = get_CDF<inclusive>(split_points, size);
218
+ if (is_empty()) return buckets;
213
219
  for (uint32_t i = size; i > 0; --i) {
214
220
  buckets[i] -= buckets[i - 1];
215
221
  }
@@ -230,14 +236,15 @@ auto req_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const
230
236
 
231
237
  template<typename T, typename C, typename S, typename A>
232
238
  template<bool inclusive>
233
- const T& req_sketch<T, C, S, A>::get_quantile(double rank) const {
239
+ auto req_sketch<T, C, S, A>::get_quantile(double rank) const -> quantile_return_type {
234
240
  if (is_empty()) return get_invalid_value();
235
241
  if (rank == 0.0) return *min_value_;
236
242
  if (rank == 1.0) return *max_value_;
237
243
  if ((rank < 0.0) || (rank > 1.0)) {
238
244
  throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
239
245
  }
240
- return *(get_quantile_calculator<inclusive>()->get_quantile(rank));
246
+ // possible side-effect of sorting level zero
247
+ return get_sorted_view<inclusive>(true).get_quantile(rank);
241
248
  }
242
249
 
243
250
  template<typename T, typename C, typename S, typename A>
@@ -245,8 +252,11 @@ template<bool inclusive>
245
252
  std::vector<T, A> req_sketch<T, C, S, A>::get_quantiles(const double* ranks, uint32_t size) const {
246
253
  std::vector<T, A> quantiles(allocator_);
247
254
  if (is_empty()) return quantiles;
248
- QuantileCalculatorPtr quantile_calculator(nullptr, calculator_deleter(allocator_));
249
255
  quantiles.reserve(size);
256
+
257
+ // possible side-effect of sorting level zero
258
+ auto view = get_sorted_view<inclusive>(true);
259
+
250
260
  for (uint32_t i = 0; i < size; ++i) {
251
261
  const double rank = ranks[i];
252
262
  if ((rank < 0.0) || (rank > 1.0)) {
@@ -255,47 +265,26 @@ std::vector<T, A> req_sketch<T, C, S, A>::get_quantiles(const double* ranks, uin
255
265
  if (rank == 0.0) quantiles.push_back(*min_value_);
256
266
  else if (rank == 1.0) quantiles.push_back(*max_value_);
257
267
  else {
258
- if (!quantile_calculator) {
259
- // has side effect of sorting level zero if needed
260
- quantile_calculator = const_cast<req_sketch*>(this)->get_quantile_calculator<inclusive>();
261
- }
262
- quantiles.push_back(*(quantile_calculator->get_quantile(rank)));
268
+ quantiles.push_back(view.get_quantile(rank));
263
269
  }
264
270
  }
265
271
  return quantiles;
266
272
  }
267
273
 
268
- template<typename T, typename C, typename S, typename A>
269
- class req_sketch<T, C, S, A>::calculator_deleter {
270
- public:
271
- calculator_deleter(const AllocCalc& allocator): allocator_(allocator) {}
272
- void operator() (QuantileCalculator* ptr) {
273
- if (ptr != nullptr) {
274
- ptr->~QuantileCalculator();
275
- allocator_.deallocate(ptr, 1);
276
- }
277
- }
278
- private:
279
- AllocCalc allocator_;
280
- };
281
-
282
274
  template<typename T, typename C, typename S, typename A>
283
275
  template<bool inclusive>
284
- auto req_sketch<T, C, S, A>::get_quantile_calculator() const -> QuantileCalculatorPtr {
276
+ quantile_sketch_sorted_view<T, C, A> req_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const {
285
277
  if (!compactors_[0].is_sorted()) {
286
278
  const_cast<Compactor&>(compactors_[0]).sort(); // allow this side effect
287
279
  }
288
- AllocCalc ac(allocator_);
289
- QuantileCalculatorPtr quantile_calculator(
290
- new (ac.allocate(1)) req_quantile_calculator<T, C, A>(n_, ac),
291
- calculator_deleter(ac)
292
- );
280
+ quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);
293
281
 
294
282
  for (auto& compactor: compactors_) {
295
- quantile_calculator->add(compactor.begin(), compactor.end(), compactor.get_lg_weight());
283
+ view.add(compactor.begin(), compactor.end(), 1 << compactor.get_lg_weight());
296
284
  }
297
- quantile_calculator->template convert_to_cummulative<inclusive>();
298
- return quantile_calculator;
285
+
286
+ if (cumulative) view.template convert_to_cummulative<inclusive>();
287
+ return view;
299
288
  }
300
289
 
301
290
  template<typename T, typename C, typename S, typename A>
@@ -348,8 +337,8 @@ double req_sketch<T, C, S, A>::relative_rse_factor() {
348
337
 
349
338
  // implementation for fixed-size arithmetic types (integral and floating point)
350
339
  template<typename T, typename C, typename S, typename A>
351
- template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
352
- size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
340
+ template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
341
+ size_t req_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
353
342
  size_t size = PREAMBLE_SIZE_BYTES;
354
343
  if (is_empty()) return size;
355
344
  if (is_estimation_mode()) {
@@ -358,32 +347,33 @@ size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
358
347
  if (n_ == 1) {
359
348
  size += sizeof(TT);
360
349
  } else {
361
- for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(S());
350
+ for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(sd);
362
351
  }
363
352
  return size;
364
353
  }
365
354
 
366
355
  // implementation for all other types
367
356
  template<typename T, typename C, typename S, typename A>
368
- template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
369
- size_t req_sketch<T, C, S, A>::get_serialized_size_bytes() const {
357
+ template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
358
+ size_t req_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
370
359
  size_t size = PREAMBLE_SIZE_BYTES;
371
360
  if (is_empty()) return size;
372
361
  if (is_estimation_mode()) {
373
362
  size += sizeof(n_);
374
- size += S().size_of_item(*min_value_);
375
- size += S().size_of_item(*max_value_);
363
+ size += sd.size_of_item(*min_value_);
364
+ size += sd.size_of_item(*max_value_);
376
365
  }
377
366
  if (n_ == 1) {
378
- size += S().size_of_item(*compactors_[0].begin());
367
+ size += sd.size_of_item(*compactors_[0].begin());
379
368
  } else {
380
- for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(S());
369
+ for (const auto& compactor: compactors_) size += compactor.get_serialized_size_bytes(sd);
381
370
  }
382
371
  return size;
383
372
  }
384
373
 
385
374
  template<typename T, typename C, typename S, typename A>
386
- void req_sketch<T, C, S, A>::serialize(std::ostream& os) const {
375
+ template<typename SerDe>
376
+ void req_sketch<T, C, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
387
377
  const uint8_t preamble_ints = is_estimation_mode() ? 4 : 2;
388
378
  write(os, preamble_ints);
389
379
  const uint8_t serial_version = SERIAL_VERSION;
@@ -406,19 +396,20 @@ void req_sketch<T, C, S, A>::serialize(std::ostream& os) const {
406
396
  if (is_empty()) return;
407
397
  if (is_estimation_mode()) {
408
398
  write(os, n_);
409
- S().serialize(os, min_value_, 1);
410
- S().serialize(os, max_value_, 1);
399
+ sd.serialize(os, min_value_, 1);
400
+ sd.serialize(os, max_value_, 1);
411
401
  }
412
402
  if (raw_items) {
413
- S().serialize(os, compactors_[0].begin(), num_raw_items);
403
+ sd.serialize(os, compactors_[0].begin(), num_raw_items);
414
404
  } else {
415
- for (const auto& compactor: compactors_) compactor.serialize(os, S());
405
+ for (const auto& compactor: compactors_) compactor.serialize(os, sd);
416
406
  }
417
407
  }
418
408
 
419
409
  template<typename T, typename C, typename S, typename A>
420
- auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
421
- const size_t size = header_size_bytes + get_serialized_size_bytes();
410
+ template<typename SerDe>
411
+ auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const -> vector_bytes {
412
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
422
413
  vector_bytes bytes(size, 0, allocator_);
423
414
  uint8_t* ptr = bytes.data() + header_size_bytes;
424
415
  const uint8_t* end_ptr = ptr + size;
@@ -445,13 +436,13 @@ auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vect
445
436
  if (!is_empty()) {
446
437
  if (is_estimation_mode()) {
447
438
  ptr += copy_to_mem(n_, ptr);
448
- ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
449
- ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
439
+ ptr += sd.serialize(ptr, end_ptr - ptr, min_value_, 1);
440
+ ptr += sd.serialize(ptr, end_ptr - ptr, max_value_, 1);
450
441
  }
451
442
  if (raw_items) {
452
- ptr += S().serialize(ptr, end_ptr - ptr, compactors_[0].begin(), num_raw_items);
443
+ ptr += sd.serialize(ptr, end_ptr - ptr, compactors_[0].begin(), num_raw_items);
453
444
  } else {
454
- for (const auto& compactor: compactors_) ptr += compactor.serialize(ptr, end_ptr - ptr, S());
445
+ for (const auto& compactor: compactors_) ptr += compactor.serialize(ptr, end_ptr - ptr, sd);
455
446
  }
456
447
  }
457
448
  return bytes;
@@ -459,6 +450,12 @@ auto req_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const -> vect
459
450
 
460
451
  template<typename T, typename C, typename S, typename A>
461
452
  req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
453
+ return deserialize(is, S(), allocator);
454
+ }
455
+
456
+ template<typename T, typename C, typename S, typename A>
457
+ template<typename SerDe>
458
+ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
462
459
  const auto preamble_ints = read<uint8_t>(is);
463
460
  const auto serial_version = read<uint8_t>(is);
464
461
  const auto family_id = read<uint8_t>(is);
@@ -490,19 +487,19 @@ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, con
490
487
  uint64_t n = 1;
491
488
  if (num_levels > 1) {
492
489
  n = read<uint64_t>(is);
493
- S().deserialize(is, min_value_buffer.get(), 1);
490
+ sd.deserialize(is, min_value_buffer.get(), 1);
494
491
  // serde call did not throw, repackage with destructor
495
492
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
496
- S().deserialize(is, max_value_buffer.get(), 1);
493
+ sd.deserialize(is, max_value_buffer.get(), 1);
497
494
  // serde call did not throw, repackage with destructor
498
495
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
499
496
  }
500
497
 
501
498
  if (raw_items) {
502
- compactors.push_back(Compactor::deserialize(is, S(), allocator, is_level_0_sorted, k, num_raw_items, hra));
499
+ compactors.push_back(Compactor::deserialize(is, sd, allocator, is_level_0_sorted, k, num_raw_items, hra));
503
500
  } else {
504
501
  for (size_t i = 0; i < num_levels; ++i) {
505
- compactors.push_back(Compactor::deserialize(is, S(), allocator, i == 0 ? is_level_0_sorted : true, hra));
502
+ compactors.push_back(Compactor::deserialize(is, sd, allocator, i == 0 ? is_level_0_sorted : true, hra));
506
503
  }
507
504
  }
508
505
  if (num_levels == 1) {
@@ -529,6 +526,12 @@ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(std::istream& is, con
529
526
 
530
527
  template<typename T, typename C, typename S, typename A>
531
528
  req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
529
+ return deserialize(bytes, size, S(), allocator);
530
+ }
531
+
532
+ template<typename T, typename C, typename S, typename A>
533
+ template<typename SerDe>
534
+ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
532
535
  ensure_minimum_memory(size, 8);
533
536
  const char* ptr = static_cast<const char*>(bytes);
534
537
  const char* end_ptr = static_cast<const char*>(bytes) + size;
@@ -571,21 +574,21 @@ req_sketch<T, C, S, A> req_sketch<T, C, S, A>::deserialize(const void* bytes, si
571
574
  if (num_levels > 1) {
572
575
  ensure_minimum_memory(end_ptr - ptr, sizeof(n));
573
576
  ptr += copy_from_mem(ptr, n);
574
- ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
577
+ ptr += sd.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
575
578
  // serde call did not throw, repackage with destructor
576
579
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
577
- ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
580
+ ptr += sd.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
578
581
  // serde call did not throw, repackage with destructor
579
582
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
580
583
  }
581
584
 
582
585
  if (raw_items) {
583
- auto pair = Compactor::deserialize(ptr, end_ptr - ptr, S(), allocator, is_level_0_sorted, k, num_raw_items, hra);
586
+ auto pair = Compactor::deserialize(ptr, end_ptr - ptr, sd, allocator, is_level_0_sorted, k, num_raw_items, hra);
584
587
  compactors.push_back(std::move(pair.first));
585
588
  ptr += pair.second;
586
589
  } else {
587
590
  for (size_t i = 0; i < num_levels; ++i) {
588
- auto pair = Compactor::deserialize(ptr, end_ptr - ptr, S(), allocator, i == 0 ? is_level_0_sorted : true, hra);
591
+ auto pair = Compactor::deserialize(ptr, end_ptr - ptr, sd, allocator, i == 0 ? is_level_0_sorted : true, hra);
589
592
  compactors.push_back(std::move(pair.first));
590
593
  ptr += pair.second;
591
594
  }
@@ -24,6 +24,7 @@
24
24
  #include <fstream>
25
25
  #include <sstream>
26
26
  #include <limits>
27
+ #include <stdexcept>
27
28
 
28
29
  namespace datasketches {
29
30
 
@@ -51,6 +52,10 @@ TEST_CASE("req sketch: empty", "[req_sketch]") {
51
52
  REQUIRE(std::isnan(sketch.get_quantile(1)));
52
53
  const double ranks[3] {0, 0.5, 1};
53
54
  REQUIRE(sketch.get_quantiles(ranks, 3).size() == 0);
55
+
56
+ const float split_points[1] {0};
57
+ REQUIRE(sketch.get_CDF(split_points, 1).empty());
58
+ REQUIRE(sketch.get_PMF(split_points, 1).empty());
54
59
  }
55
60
 
56
61
  TEST_CASE("req sketch: single value, lra", "[req_sketch]") {
@@ -58,7 +58,11 @@ namespace var_opt_constants {
58
58
  const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
59
59
  }
60
60
 
61
- template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
61
+ template<
62
+ typename T,
63
+ typename S = serde<T>, // deprecated, to be removed in the next major version
64
+ typename A = std::allocator<T>
65
+ >
62
66
  class var_opt_sketch {
63
67
 
64
68
  public:
@@ -135,18 +139,20 @@ class var_opt_sketch {
135
139
  /**
136
140
  * Computes size needed to serialize the current state of the sketch.
137
141
  * This version is for fixed-size arithmetic types (integral and floating point).
142
+ * @param sd instance of a SerDe
138
143
  * @return size in bytes needed to serialize this sketch
139
144
  */
140
- template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
141
- inline size_t get_serialized_size_bytes() const;
145
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
146
+ inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
142
147
 
143
148
  /**
144
149
  * Computes size needed to serialize the current state of the sketch.
145
150
  * This version is for all other types and can be expensive since every item needs to be looked at.
151
+ * @param sd instance of a SerDe
146
152
  * @return size in bytes needed to serialize this sketch
147
153
  */
148
- template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
149
- inline size_t get_serialized_size_bytes() const;
154
+ template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
155
+ inline size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
150
156
 
151
157
  // This is a convenience alias for users
152
158
  // The type returned by the following serialize method
@@ -158,30 +164,61 @@ class var_opt_sketch {
158
164
  * It is a blank space of a given size.
159
165
  * This header is used in Datasketches PostgreSQL extension.
160
166
  * @param header_size_bytes space to reserve in front of the sketch
167
+ * @param sd instance of a SerDe
161
168
  */
162
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
169
+ template<typename SerDe = S>
170
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
163
171
 
164
172
  /**
165
173
  * This method serializes the sketch into a given stream in a binary form
166
174
  * @param os output stream
175
+ * @param sd instance of a SerDe
167
176
  */
168
- void serialize(std::ostream& os) const;
177
+ template<typename SerDe = S>
178
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
169
179
 
170
180
  /**
171
181
  * This method deserializes a sketch from a given stream.
172
182
  * @param is input stream
183
+ * @param allocator instance of an Allocator
173
184
  * @return an instance of a sketch
185
+ *
186
+ * Deprecated, to be removed in the next major version
174
187
  */
175
188
  static var_opt_sketch deserialize(std::istream& is, const A& allocator = A());
176
189
 
190
+ /**
191
+ * This method deserializes a sketch from a given stream.
192
+ * @param is input stream
193
+ * @param sd instance of a SerDe
194
+ * @param allocator instance of an Allocator
195
+ * @return an instance of a sketch
196
+ */
197
+ template<typename SerDe = S>
198
+ static var_opt_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
199
+
177
200
  /**
178
201
  * This method deserializes a sketch from a given array of bytes.
179
202
  * @param bytes pointer to the array of bytes
180
203
  * @param size the size of the array
204
+ * @param allocator instance of an Allocator
181
205
  * @return an instance of a sketch
206
+ *
207
+ * Deprecated, to be removed in the next major version
182
208
  */
183
209
  static var_opt_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());
184
210
 
211
+ /**
212
+ * This method deserializes a sketch from a given array of bytes.
213
+ * @param bytes pointer to the array of bytes
214
+ * @param size the size of the array
215
+ * @param sd instance of a SerDe
216
+ * @param allocator instance of an Allocator
217
+ * @return an instance of a sketch
218
+ */
219
+ template<typename SerDe = S>
220
+ static var_opt_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
221
+
185
222
  /**
186
223
  * Prints a summary of the sketch.
187
224
  * @return the summary as a string