datasketches 0.2.1 → 0.2.2

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -24,19 +24,27 @@
24
24
 
25
25
  namespace datasketches {
26
26
 
27
+ // forward declaration
28
+ template<typename T, typename C, typename S, typename A> class kll_sketch;
29
+
27
30
  template <typename T, typename C, typename A>
28
31
  class kll_quantile_calculator {
29
32
  public:
30
- // assumes that all levels are sorted including level 0
31
- kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator);
33
+ using Entry = std::pair<T, uint64_t>;
34
+ using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
35
+ using Container = std::vector<Entry, AllocEntry>;
36
+ using const_iterator = typename Container::const_iterator;
37
+
38
+ template<typename S>
39
+ kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch);
40
+
32
41
  T get_quantile(double fraction) const;
42
+ const_iterator begin() const;
43
+ const_iterator end() const;
33
44
 
34
45
  private:
35
46
  using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
36
47
  using vector_u32 = std::vector<uint32_t, AllocU32>;
37
- using Entry = std::pair<T, uint64_t>;
38
- using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
39
- using Container = std::vector<Entry, AllocEntry>;
40
48
  uint64_t n_;
41
49
  vector_u32 levels_;
42
50
  Container entries_;
@@ -45,7 +53,7 @@ class kll_quantile_calculator {
45
53
  T approximately_answer_positional_query(uint64_t pos) const;
46
54
  void convert_to_preceding_cummulative();
47
55
  uint32_t chunk_containing_pos(uint64_t pos) const;
48
- uint32_t search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const;
56
+ uint32_t search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const;
49
57
  static void merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items);
50
58
  static void merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
51
59
  static void merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
@@ -28,24 +28,38 @@
28
28
 
29
29
  namespace datasketches {
30
30
 
31
- template <typename T, typename C, typename A>
32
- kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator):
33
- n_(n), levels_(num_levels + 1, 0, allocator), entries_(allocator)
31
+ template<typename T, typename C, typename A>
32
+ template<typename S>
33
+ kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch):
34
+ n_(sketch.n_), levels_(sketch.num_levels_ + 1, 0, sketch.allocator_), entries_(sketch.allocator_)
34
35
  {
35
- const uint32_t num_items = levels[num_levels] - levels[0];
36
- entries_.reserve(num_items);
37
- populate_from_sketch(items, levels, num_levels);
38
- merge_sorted_blocks(entries_, levels_.data(), levels_.size() - 1, num_items);
39
- if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
40
- convert_to_preceding_cummulative();
36
+ const uint32_t num_items = sketch.levels_[sketch.num_levels_] - sketch.levels_[0];
37
+ if (num_items > 0) {
38
+ entries_.reserve(num_items);
39
+ populate_from_sketch(sketch.items_, sketch.levels_.data(), sketch.num_levels_);
40
+ if (!sketch.is_level_zero_sorted_) std::sort(entries_.begin(), entries_.begin() + levels_[1], compare_pair_by_first<C>());
41
+ merge_sorted_blocks(entries_, levels_.data(), static_cast<uint8_t>(levels_.size()) - 1, num_items);
42
+ if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
43
+ convert_to_preceding_cummulative();
44
+ }
41
45
  }
42
46
 
43
- template <typename T, typename C, typename A>
47
+ template<typename T, typename C, typename A>
44
48
  T kll_quantile_calculator<T, C, A>::get_quantile(double fraction) const {
45
49
  return approximately_answer_positional_query(pos_of_phi(fraction, n_));
46
50
  }
47
51
 
48
- template <typename T, typename C, typename A>
52
+ template<typename T, typename C, typename A>
53
+ auto kll_quantile_calculator<T, C, A>::begin() const -> const_iterator {
54
+ return entries_.begin();
55
+ }
56
+
57
+ template<typename T, typename C, typename A>
58
+ auto kll_quantile_calculator<T, C, A>::end() const -> const_iterator {
59
+ return entries_.end();
60
+ }
61
+
62
+ template<typename T, typename C, typename A>
49
63
  void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, const uint32_t* levels, uint8_t num_levels) {
50
64
  size_t src_level = 0;
51
65
  size_t dst_level = 0;
@@ -68,7 +82,7 @@ void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, cons
68
82
  if (levels_.size() > static_cast<size_t>(dst_level + 1)) levels_.resize(dst_level + 1);
69
83
  }
70
84
 
71
- template <typename T, typename C, typename A>
85
+ template<typename T, typename C, typename A>
72
86
  T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64_t pos) const {
73
87
  if (pos >= n_) throw std::logic_error("position out of range");
74
88
  const uint32_t num_items = levels_[levels_.size() - 1];
@@ -77,7 +91,7 @@ T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64
77
91
  return entries_[index].first;
78
92
  }
79
93
 
80
- template <typename T, typename C, typename A>
94
+ template<typename T, typename C, typename A>
81
95
  void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
82
96
  uint64_t subtotal = 0;
83
97
  for (auto& entry: entries_) {
@@ -87,13 +101,13 @@ void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
87
101
  }
88
102
  }
89
103
 
90
- template <typename T, typename C, typename A>
104
+ template<typename T, typename C, typename A>
91
105
  uint64_t kll_quantile_calculator<T, C, A>::pos_of_phi(double phi, uint64_t n) {
92
- const uint64_t pos = std::floor(phi * n);
106
+ const uint64_t pos = static_cast<uint64_t>(std::floor(phi * n));
93
107
  return (pos == n) ? n - 1 : pos;
94
108
  }
95
109
 
96
- template <typename T, typename C, typename A>
110
+ template<typename T, typename C, typename A>
97
111
  uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) const {
98
112
  if (entries_.size() < 1) throw std::logic_error("array too short");
99
113
  if (pos < entries_[0].second) throw std::logic_error("position too small");
@@ -101,19 +115,19 @@ uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) co
101
115
  return search_for_chunk_containing_pos(pos, 0, entries_.size());
102
116
  }
103
117
 
104
- template <typename T, typename C, typename A>
105
- uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const {
118
+ template<typename T, typename C, typename A>
119
+ uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const {
106
120
  if (l + 1 == r) {
107
- return l;
121
+ return static_cast<uint32_t>(l);
108
122
  }
109
- const uint32_t m(l + (r - l) / 2);
123
+ const uint64_t m = l + (r - l) / 2;
110
124
  if (entries_[m].second <= pos) {
111
125
  return search_for_chunk_containing_pos(pos, m, r);
112
126
  }
113
127
  return search_for_chunk_containing_pos(pos, l, m);
114
128
  }
115
129
 
116
- template <typename T, typename C, typename A>
130
+ template<typename T, typename C, typename A>
117
131
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items) {
118
132
  if (num_levels == 1) return;
119
133
  Container temporary(entries.get_allocator());
@@ -121,7 +135,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, c
121
135
  merge_sorted_blocks_direct(entries, temporary, levels, 0, num_levels);
122
136
  }
123
137
 
124
- template <typename T, typename C, typename A>
138
+ template<typename T, typename C, typename A>
125
139
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels,
126
140
  uint8_t starting_level, uint8_t num_levels) {
127
141
  if (num_levels == 1) return;
@@ -129,10 +143,11 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
129
143
  const uint8_t num_levels_2 = num_levels - num_levels_1;
130
144
  const uint8_t starting_level_1 = starting_level;
131
145
  const uint8_t starting_level_2 = starting_level + num_levels_1;
132
- const auto chunk_begin = temp.begin() + temp.size();
146
+ const auto initial_size = temp.size();
133
147
  merge_sorted_blocks_reversed(orig, temp, levels, starting_level_1, num_levels_1);
134
148
  merge_sorted_blocks_reversed(orig, temp, levels, starting_level_2, num_levels_2);
135
149
  const uint32_t num_items_1 = levels[starting_level_1 + num_levels_1] - levels[starting_level_1];
150
+ const auto chunk_begin = temp.begin() + initial_size;
136
151
  std::merge(
137
152
  std::make_move_iterator(chunk_begin), std::make_move_iterator(chunk_begin + num_items_1),
138
153
  std::make_move_iterator(chunk_begin + num_items_1), std::make_move_iterator(temp.end()),
@@ -141,7 +156,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
141
156
  temp.erase(chunk_begin, temp.end());
142
157
  }
143
158
 
144
- template <typename T, typename C, typename A>
159
+ template<typename T, typename C, typename A>
145
160
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels,
146
161
  uint8_t starting_level, uint8_t num_levels) {
147
162
  if (num_levels == 1) {
@@ -156,6 +156,9 @@ template<typename A> using vector_d = std::vector<double, AllocD<A>>;
156
156
  template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
157
157
  class kll_sketch {
158
158
  public:
159
+ using value_type = T;
160
+ using comparator = C;
161
+
159
162
  static const uint8_t DEFAULT_M = 8;
160
163
  static const uint16_t DEFAULT_K = 200;
161
164
  static const uint16_t MIN_K = DEFAULT_M;
@@ -296,7 +299,7 @@ class kll_sketch {
296
299
  *
297
300
  * @return array of approximations to the given number of evenly-spaced fractional ranks.
298
301
  */
299
- std::vector<T, A> get_quantiles(size_t num) const;
302
+ std::vector<T, A> get_quantiles(uint32_t num) const;
300
303
 
301
304
  /**
302
305
  * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
@@ -383,6 +386,33 @@ class kll_sketch {
383
386
  template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
384
387
  size_t get_serialized_size_bytes() const;
385
388
 
389
+ /**
390
+ * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
391
+ * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
392
+ * This method can be used if allocation of storage is necessary beforehand, but it is not
393
+ * optimal.
394
+ * This method is for arithmetic types (integral and floating point)
395
+ * @param k parameter that controls size of the sketch and accuracy of estimates
396
+ * @param n stream length
397
+ * @return upper bound on the serialized size
398
+ */
399
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
400
+ static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n);
401
+
402
+ /**
403
+ * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
404
+ * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
405
+ * This method can be used if allocation of storage is necessary beforehand, but it is not
406
+ * optimal.
407
+ * This method is for all other non-arithmetic types, and it takes a max size of an item as input.
408
+ * @param k parameter that controls size of the sketch and accuracy of estimates
409
+ * @param n stream length
410
+ * @param max_item_size_bytes maximum size of an item in bytes
411
+ * @return upper bound on the serialized size
412
+ */
413
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
414
+ static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes);
415
+
386
416
  /**
387
417
  * This method serializes the sketch into a given stream in a binary form
388
418
  * @param os output stream
@@ -391,7 +421,7 @@ class kll_sketch {
391
421
 
392
422
  // This is a convenience alias for users
393
423
  // The type returned by the following serialize method
394
- typedef vector_u8<A> vector_bytes;
424
+ using vector_bytes = vector_u8<A>;
395
425
 
396
426
  /**
397
427
  * This method serializes the sketch as a vector of bytes.
@@ -480,6 +510,8 @@ class kll_sketch {
480
510
  T* max_value_;
481
511
  bool is_level_zero_sorted_;
482
512
 
513
+ friend class kll_quantile_calculator<T, C, A>;
514
+
483
515
  // for deserialization
484
516
  class item_deleter;
485
517
  class items_deleter;
@@ -303,7 +303,7 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions,
303
303
  }
304
304
 
305
305
  template<typename T, typename C, typename S, typename A>
306
- std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(size_t num) const {
306
+ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
307
307
  if (is_empty()) return std::vector<T, A>(allocator_);
308
308
  if (num == 0) {
309
309
  throw std::invalid_argument("num must be > 0");
@@ -380,36 +380,56 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
380
380
  size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
381
381
  size += S().size_of_item(*min_value_);
382
382
  size += S().size_of_item(*max_value_);
383
- for (auto& it: *this) size += S().size_of_item(it.first);
383
+ for (auto it: *this) size += S().size_of_item(it.first);
384
384
  return size;
385
385
  }
386
386
 
387
+ // implementation for fixed-size arithmetic types (integral and floating point)
388
+ template<typename T, typename C, typename S, typename A>
389
+ template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
390
+ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n) {
391
+ const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
392
+ const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
393
+ // the last integer in the levels_ array is not serialized because it can be derived
394
+ return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * sizeof(TT);
395
+ }
396
+
397
+ // implementation for all other types
398
+ template<typename T, typename C, typename S, typename A>
399
+ template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
400
+ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes) {
401
+ const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
402
+ const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
403
+ // the last integer in the levels_ array is not serialized because it can be derived
404
+ return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * max_item_size_bytes;
405
+ }
406
+
387
407
  template<typename T, typename C, typename S, typename A>
388
408
  void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
389
409
  const bool is_single_item = n_ == 1;
390
410
  const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
391
- os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
411
+ write(os, preamble_ints);
392
412
  const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
393
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
413
+ write(os, serial_version);
394
414
  const uint8_t family(FAMILY);
395
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
415
+ write(os, family);
396
416
  const uint8_t flags_byte(
397
417
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
398
418
  | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
399
419
  | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
400
420
  );
401
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
402
- os.write((char*)&k_, sizeof(k_));
403
- os.write((char*)&m_, sizeof(m_));
421
+ write(os, flags_byte);
422
+ write(os, k_);
423
+ write(os, m_);
404
424
  const uint8_t unused = 0;
405
- os.write(reinterpret_cast<const char*>(&unused), sizeof(unused));
425
+ write(os, unused);
406
426
  if (is_empty()) return;
407
427
  if (!is_single_item) {
408
- os.write((char*)&n_, sizeof(n_));
409
- os.write((char*)&min_k_, sizeof(min_k_));
410
- os.write((char*)&num_levels_, sizeof(num_levels_));
411
- os.write((char*)&unused, sizeof(unused));
412
- os.write((char*)levels_.data(), sizeof(levels_[0]) * num_levels_);
428
+ write(os, n_);
429
+ write(os, min_k_);
430
+ write(os, num_levels_);
431
+ write(os, unused);
432
+ write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
413
433
  S().serialize(os, min_value_, 1);
414
434
  S().serialize(os, max_value_, 1);
415
435
  }
@@ -424,27 +444,26 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
424
444
  uint8_t* ptr = bytes.data() + header_size_bytes;
425
445
  const uint8_t* end_ptr = ptr + size;
426
446
  const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
427
- ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
447
+ ptr += copy_to_mem(preamble_ints, ptr);
428
448
  const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
429
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
449
+ ptr += copy_to_mem(serial_version, ptr);
430
450
  const uint8_t family(FAMILY);
431
- ptr += copy_to_mem(&family, ptr, sizeof(family));
451
+ ptr += copy_to_mem(family, ptr);
432
452
  const uint8_t flags_byte(
433
453
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
434
454
  | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
435
455
  | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
436
456
  );
437
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
438
- ptr += copy_to_mem(&k_, ptr, sizeof(k_));
439
- ptr += copy_to_mem(&m_, ptr, sizeof(m_));
440
- const uint8_t unused = 0;
441
- ptr += copy_to_mem(&unused, ptr, sizeof(unused));
457
+ ptr += copy_to_mem(flags_byte, ptr);
458
+ ptr += copy_to_mem(k_, ptr);
459
+ ptr += copy_to_mem(m_, ptr);
460
+ ptr += sizeof(uint8_t); // unused
442
461
  if (!is_empty()) {
443
462
  if (!is_single_item) {
444
- ptr += copy_to_mem(&n_, ptr, sizeof(n_));
445
- ptr += copy_to_mem(&min_k_, ptr, sizeof(min_k_));
446
- ptr += copy_to_mem(&num_levels_, ptr, sizeof(num_levels_));
447
- ptr += copy_to_mem(&unused, ptr, sizeof(unused));
463
+ ptr += copy_to_mem(n_, ptr);
464
+ ptr += copy_to_mem(min_k_, ptr);
465
+ ptr += copy_to_mem(num_levels_, ptr);
466
+ ptr += sizeof(uint8_t); // unused
448
467
  ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
449
468
  ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
450
469
  ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
@@ -459,20 +478,13 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
459
478
 
460
479
  template<typename T, typename C, typename S, typename A>
461
480
  kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
462
- uint8_t preamble_ints;
463
- is.read((char*)&preamble_ints, sizeof(preamble_ints));
464
- uint8_t serial_version;
465
- is.read((char*)&serial_version, sizeof(serial_version));
466
- uint8_t family_id;
467
- is.read((char*)&family_id, sizeof(family_id));
468
- uint8_t flags_byte;
469
- is.read((char*)&flags_byte, sizeof(flags_byte));
470
- uint16_t k;
471
- is.read((char*)&k, sizeof(k));
472
- uint8_t m;
473
- is.read((char*)&m, sizeof(m));
474
- uint8_t unused;
475
- is.read((char*)&unused, sizeof(unused));
481
+ const auto preamble_ints = read<uint8_t>(is);
482
+ const auto serial_version = read<uint8_t>(is);
483
+ const auto family_id = read<uint8_t>(is);
484
+ const auto flags_byte = read<uint8_t>(is);
485
+ const auto k = read<uint16_t>(is);
486
+ const auto m = read<uint8_t>(is);
487
+ read<uint8_t>(is); // skip unused byte
476
488
 
477
489
  check_m(m);
478
490
  check_preamble_ints(preamble_ints, flags_byte);
@@ -492,10 +504,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
492
504
  min_k = k;
493
505
  num_levels = 1;
494
506
  } else {
495
- is.read((char*)&n, sizeof(n_));
496
- is.read((char*)&min_k, sizeof(min_k_));
497
- is.read((char*)&num_levels, sizeof(num_levels));
498
- is.read((char*)&unused, sizeof(unused));
507
+ n = read<uint64_t>(is);
508
+ min_k = read<uint16_t>(is);
509
+ num_levels = read<uint8_t>(is);
510
+ read<uint8_t>(is); // skip unused byte
499
511
  }
500
512
  vector_u32<A> levels(num_levels + 1, 0, allocator);
501
513
  const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
@@ -503,7 +515,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
503
515
  levels[0] = capacity - 1;
504
516
  } else {
505
517
  // the last integer in levels_ is not serialized because it can be derived
506
- is.read((char*)levels.data(), sizeof(levels[0]) * num_levels);
518
+ read(is, levels.data(), sizeof(levels[0]) * num_levels);
507
519
  }
508
520
  levels[num_levels] = capacity;
509
521
  A alloc(allocator);
@@ -546,24 +558,24 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
546
558
  ensure_minimum_memory(size, 8);
547
559
  const char* ptr = static_cast<const char*>(bytes);
548
560
  uint8_t preamble_ints;
549
- ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
561
+ ptr += copy_from_mem(ptr, preamble_ints);
550
562
  uint8_t serial_version;
551
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
563
+ ptr += copy_from_mem(ptr, serial_version);
552
564
  uint8_t family_id;
553
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
565
+ ptr += copy_from_mem(ptr, family_id);
554
566
  uint8_t flags_byte;
555
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
567
+ ptr += copy_from_mem(ptr, flags_byte);
556
568
  uint16_t k;
557
- ptr += copy_from_mem(ptr, &k, sizeof(k));
569
+ ptr += copy_from_mem(ptr, k);
558
570
  uint8_t m;
559
- ptr += copy_from_mem(ptr, &m, sizeof(m));
560
- ptr++; // skip unused byte
571
+ ptr += copy_from_mem(ptr, m);
572
+ ptr += sizeof(uint8_t); // skip unused byte
561
573
 
562
574
  check_m(m);
563
575
  check_preamble_ints(preamble_ints, flags_byte);
564
576
  check_serial_version(serial_version);
565
577
  check_family_id(family_id);
566
- ensure_minimum_memory(size, 1 << preamble_ints);
578
+ ensure_minimum_memory(size, 1ULL << preamble_ints);
567
579
 
568
580
  const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
569
581
  if (is_empty) return kll_sketch<T, C, S, A>(k, allocator);
@@ -578,10 +590,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
578
590
  min_k = k;
579
591
  num_levels = 1;
580
592
  } else {
581
- ptr += copy_from_mem(ptr, &n, sizeof(n));
582
- ptr += copy_from_mem(ptr, &min_k, sizeof(min_k));
583
- ptr += copy_from_mem(ptr, &num_levels, sizeof(num_levels));
584
- ptr++; // skip unused byte
593
+ ptr += copy_from_mem(ptr, n);
594
+ ptr += copy_from_mem(ptr, min_k);
595
+ ptr += copy_from_mem(ptr, num_levels);
596
+ ptr += sizeof(uint8_t); // skip unused byte
585
597
  }
586
598
  vector_u32<A> levels(num_levels + 1, 0, allocator);
587
599
  const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
@@ -779,7 +791,7 @@ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantil
779
791
  using AllocCalc = typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>>;
780
792
  AllocCalc alloc(allocator_);
781
793
  std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
782
- new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(items_, levels_.data(), num_levels_, n_, allocator_),
794
+ new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(*this),
783
795
  [&alloc](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); alloc.deallocate(ptr, 1); }
784
796
  );
785
797
  return quantile_calculator;
@@ -1067,14 +1079,14 @@ typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::begin()
1067
1079
 
1068
1080
  template <typename T, typename C, typename S, typename A>
1069
1081
  typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::end() const {
1070
- return kll_sketch<T, C, S, A>::const_iterator(nullptr, nullptr, num_levels_);
1082
+ return kll_sketch<T, C, S, A>::const_iterator(nullptr, levels_.data(), num_levels_);
1071
1083
  }
1072
1084
 
1073
1085
  // kll_sketch::const_iterator implementation
1074
1086
 
1075
1087
  template<typename T, typename C, typename S, typename A>
1076
1088
  kll_sketch<T, C, S, A>::const_iterator::const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels):
1077
- items(items), levels(levels), num_levels(num_levels), index(levels == nullptr ? 0 : levels[0]), level(levels == nullptr ? num_levels : 0), weight(1)
1089
+ items(items), levels(levels), num_levels(num_levels), index(items == nullptr ? levels[num_levels] : levels[0]), level(items == nullptr ? num_levels : 0), weight(1)
1078
1090
  {}
1079
1091
 
1080
1092
  template<typename T, typename C, typename S, typename A>
@@ -1098,8 +1110,6 @@ typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_i
1098
1110
 
1099
1111
  template<typename T, typename C, typename S, typename A>
1100
1112
  bool kll_sketch<T, C, S, A>::const_iterator::operator==(const const_iterator& other) const {
1101
- if (level != other.level) return false;
1102
- if (level == num_levels) return true; // end
1103
1113
  return index == other.index;
1104
1114
  }
1105
1115