datasketches 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (117) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/lib/datasketches/version.rb +1 -1
  4. data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
  5. data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
  6. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
  7. data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
  8. data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
  9. data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
  10. data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
  11. data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
  12. data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
  13. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
  14. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
  15. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
  16. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
  17. data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
  18. data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
  19. data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
  20. data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
  21. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
  22. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
  23. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
  25. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
  26. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
  28. data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
  29. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
  30. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
  31. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
  32. data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
  33. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
  34. data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
  35. data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
  36. data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
  37. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
  38. data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
  39. data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
  40. data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
  41. data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
  42. data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
  43. data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
  44. data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
  45. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
  47. data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
  48. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
  49. data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
  50. data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
  51. data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
  52. data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
  53. data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
  54. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  55. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
  56. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
  57. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
  58. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
  59. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
  60. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
  61. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
  62. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
  63. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
  64. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
  65. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
  66. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
  67. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
  68. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
  69. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
  70. data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
  71. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
  72. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
  73. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
  74. data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
  75. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
  76. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
  77. data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
  78. data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
  79. data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
  84. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
  85. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
  86. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
  87. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
  88. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
  89. data/vendor/datasketches-cpp/setup.py +1 -1
  90. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
  91. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
  92. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
  93. data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
  94. data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
  95. data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
  97. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
  98. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
  99. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
  100. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
  101. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
  103. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
  105. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
  106. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
  107. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
  108. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
  109. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
  112. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
  113. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
  114. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
  116. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
  117. metadata +8 -3
@@ -24,19 +24,27 @@
24
24
 
25
25
  namespace datasketches {
26
26
 
27
+ // forward declaration
28
+ template<typename T, typename C, typename S, typename A> class kll_sketch;
29
+
27
30
  template <typename T, typename C, typename A>
28
31
  class kll_quantile_calculator {
29
32
  public:
30
- // assumes that all levels are sorted including level 0
31
- kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator);
33
+ using Entry = std::pair<T, uint64_t>;
34
+ using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
35
+ using Container = std::vector<Entry, AllocEntry>;
36
+ using const_iterator = typename Container::const_iterator;
37
+
38
+ template<typename S>
39
+ kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch);
40
+
32
41
  T get_quantile(double fraction) const;
42
+ const_iterator begin() const;
43
+ const_iterator end() const;
33
44
 
34
45
  private:
35
46
  using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
36
47
  using vector_u32 = std::vector<uint32_t, AllocU32>;
37
- using Entry = std::pair<T, uint64_t>;
38
- using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
39
- using Container = std::vector<Entry, AllocEntry>;
40
48
  uint64_t n_;
41
49
  vector_u32 levels_;
42
50
  Container entries_;
@@ -45,7 +53,7 @@ class kll_quantile_calculator {
45
53
  T approximately_answer_positional_query(uint64_t pos) const;
46
54
  void convert_to_preceding_cummulative();
47
55
  uint32_t chunk_containing_pos(uint64_t pos) const;
48
- uint32_t search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const;
56
+ uint32_t search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const;
49
57
  static void merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items);
50
58
  static void merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
51
59
  static void merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
@@ -28,24 +28,38 @@
28
28
 
29
29
  namespace datasketches {
30
30
 
31
- template <typename T, typename C, typename A>
32
- kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator):
33
- n_(n), levels_(num_levels + 1, 0, allocator), entries_(allocator)
31
+ template<typename T, typename C, typename A>
32
+ template<typename S>
33
+ kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch):
34
+ n_(sketch.n_), levels_(sketch.num_levels_ + 1, 0, sketch.allocator_), entries_(sketch.allocator_)
34
35
  {
35
- const uint32_t num_items = levels[num_levels] - levels[0];
36
- entries_.reserve(num_items);
37
- populate_from_sketch(items, levels, num_levels);
38
- merge_sorted_blocks(entries_, levels_.data(), levels_.size() - 1, num_items);
39
- if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
40
- convert_to_preceding_cummulative();
36
+ const uint32_t num_items = sketch.levels_[sketch.num_levels_] - sketch.levels_[0];
37
+ if (num_items > 0) {
38
+ entries_.reserve(num_items);
39
+ populate_from_sketch(sketch.items_, sketch.levels_.data(), sketch.num_levels_);
40
+ if (!sketch.is_level_zero_sorted_) std::sort(entries_.begin(), entries_.begin() + levels_[1], compare_pair_by_first<C>());
41
+ merge_sorted_blocks(entries_, levels_.data(), static_cast<uint8_t>(levels_.size()) - 1, num_items);
42
+ if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
43
+ convert_to_preceding_cummulative();
44
+ }
41
45
  }
42
46
 
43
- template <typename T, typename C, typename A>
47
+ template<typename T, typename C, typename A>
44
48
  T kll_quantile_calculator<T, C, A>::get_quantile(double fraction) const {
45
49
  return approximately_answer_positional_query(pos_of_phi(fraction, n_));
46
50
  }
47
51
 
48
- template <typename T, typename C, typename A>
52
+ template<typename T, typename C, typename A>
53
+ auto kll_quantile_calculator<T, C, A>::begin() const -> const_iterator {
54
+ return entries_.begin();
55
+ }
56
+
57
+ template<typename T, typename C, typename A>
58
+ auto kll_quantile_calculator<T, C, A>::end() const -> const_iterator {
59
+ return entries_.end();
60
+ }
61
+
62
+ template<typename T, typename C, typename A>
49
63
  void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, const uint32_t* levels, uint8_t num_levels) {
50
64
  size_t src_level = 0;
51
65
  size_t dst_level = 0;
@@ -68,7 +82,7 @@ void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, cons
68
82
  if (levels_.size() > static_cast<size_t>(dst_level + 1)) levels_.resize(dst_level + 1);
69
83
  }
70
84
 
71
- template <typename T, typename C, typename A>
85
+ template<typename T, typename C, typename A>
72
86
  T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64_t pos) const {
73
87
  if (pos >= n_) throw std::logic_error("position out of range");
74
88
  const uint32_t num_items = levels_[levels_.size() - 1];
@@ -77,7 +91,7 @@ T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64
77
91
  return entries_[index].first;
78
92
  }
79
93
 
80
- template <typename T, typename C, typename A>
94
+ template<typename T, typename C, typename A>
81
95
  void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
82
96
  uint64_t subtotal = 0;
83
97
  for (auto& entry: entries_) {
@@ -87,13 +101,13 @@ void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
87
101
  }
88
102
  }
89
103
 
90
- template <typename T, typename C, typename A>
104
+ template<typename T, typename C, typename A>
91
105
  uint64_t kll_quantile_calculator<T, C, A>::pos_of_phi(double phi, uint64_t n) {
92
- const uint64_t pos = std::floor(phi * n);
106
+ const uint64_t pos = static_cast<uint64_t>(std::floor(phi * n));
93
107
  return (pos == n) ? n - 1 : pos;
94
108
  }
95
109
 
96
- template <typename T, typename C, typename A>
110
+ template<typename T, typename C, typename A>
97
111
  uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) const {
98
112
  if (entries_.size() < 1) throw std::logic_error("array too short");
99
113
  if (pos < entries_[0].second) throw std::logic_error("position too small");
@@ -101,19 +115,19 @@ uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) co
101
115
  return search_for_chunk_containing_pos(pos, 0, entries_.size());
102
116
  }
103
117
 
104
- template <typename T, typename C, typename A>
105
- uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint32_t l, uint32_t r) const {
118
+ template<typename T, typename C, typename A>
119
+ uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const {
106
120
  if (l + 1 == r) {
107
- return l;
121
+ return static_cast<uint32_t>(l);
108
122
  }
109
- const uint32_t m(l + (r - l) / 2);
123
+ const uint64_t m = l + (r - l) / 2;
110
124
  if (entries_[m].second <= pos) {
111
125
  return search_for_chunk_containing_pos(pos, m, r);
112
126
  }
113
127
  return search_for_chunk_containing_pos(pos, l, m);
114
128
  }
115
129
 
116
- template <typename T, typename C, typename A>
130
+ template<typename T, typename C, typename A>
117
131
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items) {
118
132
  if (num_levels == 1) return;
119
133
  Container temporary(entries.get_allocator());
@@ -121,7 +135,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, c
121
135
  merge_sorted_blocks_direct(entries, temporary, levels, 0, num_levels);
122
136
  }
123
137
 
124
- template <typename T, typename C, typename A>
138
+ template<typename T, typename C, typename A>
125
139
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels,
126
140
  uint8_t starting_level, uint8_t num_levels) {
127
141
  if (num_levels == 1) return;
@@ -129,10 +143,11 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
129
143
  const uint8_t num_levels_2 = num_levels - num_levels_1;
130
144
  const uint8_t starting_level_1 = starting_level;
131
145
  const uint8_t starting_level_2 = starting_level + num_levels_1;
132
- const auto chunk_begin = temp.begin() + temp.size();
146
+ const auto initial_size = temp.size();
133
147
  merge_sorted_blocks_reversed(orig, temp, levels, starting_level_1, num_levels_1);
134
148
  merge_sorted_blocks_reversed(orig, temp, levels, starting_level_2, num_levels_2);
135
149
  const uint32_t num_items_1 = levels[starting_level_1 + num_levels_1] - levels[starting_level_1];
150
+ const auto chunk_begin = temp.begin() + initial_size;
136
151
  std::merge(
137
152
  std::make_move_iterator(chunk_begin), std::make_move_iterator(chunk_begin + num_items_1),
138
153
  std::make_move_iterator(chunk_begin + num_items_1), std::make_move_iterator(temp.end()),
@@ -141,7 +156,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
141
156
  temp.erase(chunk_begin, temp.end());
142
157
  }
143
158
 
144
- template <typename T, typename C, typename A>
159
+ template<typename T, typename C, typename A>
145
160
  void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels,
146
161
  uint8_t starting_level, uint8_t num_levels) {
147
162
  if (num_levels == 1) {
@@ -156,6 +156,9 @@ template<typename A> using vector_d = std::vector<double, AllocD<A>>;
156
156
  template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
157
157
  class kll_sketch {
158
158
  public:
159
+ using value_type = T;
160
+ using comparator = C;
161
+
159
162
  static const uint8_t DEFAULT_M = 8;
160
163
  static const uint16_t DEFAULT_K = 200;
161
164
  static const uint16_t MIN_K = DEFAULT_M;
@@ -296,7 +299,7 @@ class kll_sketch {
296
299
  *
297
300
  * @return array of approximations to the given number of evenly-spaced fractional ranks.
298
301
  */
299
- std::vector<T, A> get_quantiles(size_t num) const;
302
+ std::vector<T, A> get_quantiles(uint32_t num) const;
300
303
 
301
304
  /**
302
305
  * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
@@ -383,6 +386,33 @@ class kll_sketch {
383
386
  template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
384
387
  size_t get_serialized_size_bytes() const;
385
388
 
389
+ /**
390
+ * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
391
+ * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
392
+ * This method can be used if allocation of storage is necessary beforehand, but it is not
393
+ * optimal.
394
+ * This method is for arithmetic types (integral and floating point)
395
+ * @param k parameter that controls size of the sketch and accuracy of estimates
396
+ * @param n stream length
397
+ * @return upper bound on the serialized size
398
+ */
399
+ template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
400
+ static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n);
401
+
402
+ /**
403
+ * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
404
+ * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
405
+ * This method can be used if allocation of storage is necessary beforehand, but it is not
406
+ * optimal.
407
+ * This method is for all other non-arithmetic types, and it takes a max size of an item as input.
408
+ * @param k parameter that controls size of the sketch and accuracy of estimates
409
+ * @param n stream length
410
+ * @param max_item_size_bytes maximum size of an item in bytes
411
+ * @return upper bound on the serialized size
412
+ */
413
+ template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
414
+ static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes);
415
+
386
416
  /**
387
417
  * This method serializes the sketch into a given stream in a binary form
388
418
  * @param os output stream
@@ -391,7 +421,7 @@ class kll_sketch {
391
421
 
392
422
  // This is a convenience alias for users
393
423
  // The type returned by the following serialize method
394
- typedef vector_u8<A> vector_bytes;
424
+ using vector_bytes = vector_u8<A>;
395
425
 
396
426
  /**
397
427
  * This method serializes the sketch as a vector of bytes.
@@ -480,6 +510,8 @@ class kll_sketch {
480
510
  T* max_value_;
481
511
  bool is_level_zero_sorted_;
482
512
 
513
+ friend class kll_quantile_calculator<T, C, A>;
514
+
483
515
  // for deserialization
484
516
  class item_deleter;
485
517
  class items_deleter;
@@ -303,7 +303,7 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions,
303
303
  }
304
304
 
305
305
  template<typename T, typename C, typename S, typename A>
306
- std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(size_t num) const {
306
+ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
307
307
  if (is_empty()) return std::vector<T, A>(allocator_);
308
308
  if (num == 0) {
309
309
  throw std::invalid_argument("num must be > 0");
@@ -380,36 +380,56 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
380
380
  size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
381
381
  size += S().size_of_item(*min_value_);
382
382
  size += S().size_of_item(*max_value_);
383
- for (auto& it: *this) size += S().size_of_item(it.first);
383
+ for (auto it: *this) size += S().size_of_item(it.first);
384
384
  return size;
385
385
  }
386
386
 
387
+ // implementation for fixed-size arithmetic types (integral and floating point)
388
+ template<typename T, typename C, typename S, typename A>
389
+ template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
390
+ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n) {
391
+ const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
392
+ const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
393
+ // the last integer in the levels_ array is not serialized because it can be derived
394
+ return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * sizeof(TT);
395
+ }
396
+
397
+ // implementation for all other types
398
+ template<typename T, typename C, typename S, typename A>
399
+ template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
400
+ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes) {
401
+ const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
402
+ const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
403
+ // the last integer in the levels_ array is not serialized because it can be derived
404
+ return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * max_item_size_bytes;
405
+ }
406
+
387
407
  template<typename T, typename C, typename S, typename A>
388
408
  void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
389
409
  const bool is_single_item = n_ == 1;
390
410
  const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
391
- os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
411
+ write(os, preamble_ints);
392
412
  const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
393
- os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
413
+ write(os, serial_version);
394
414
  const uint8_t family(FAMILY);
395
- os.write(reinterpret_cast<const char*>(&family), sizeof(family));
415
+ write(os, family);
396
416
  const uint8_t flags_byte(
397
417
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
398
418
  | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
399
419
  | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
400
420
  );
401
- os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
402
- os.write((char*)&k_, sizeof(k_));
403
- os.write((char*)&m_, sizeof(m_));
421
+ write(os, flags_byte);
422
+ write(os, k_);
423
+ write(os, m_);
404
424
  const uint8_t unused = 0;
405
- os.write(reinterpret_cast<const char*>(&unused), sizeof(unused));
425
+ write(os, unused);
406
426
  if (is_empty()) return;
407
427
  if (!is_single_item) {
408
- os.write((char*)&n_, sizeof(n_));
409
- os.write((char*)&min_k_, sizeof(min_k_));
410
- os.write((char*)&num_levels_, sizeof(num_levels_));
411
- os.write((char*)&unused, sizeof(unused));
412
- os.write((char*)levels_.data(), sizeof(levels_[0]) * num_levels_);
428
+ write(os, n_);
429
+ write(os, min_k_);
430
+ write(os, num_levels_);
431
+ write(os, unused);
432
+ write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
413
433
  S().serialize(os, min_value_, 1);
414
434
  S().serialize(os, max_value_, 1);
415
435
  }
@@ -424,27 +444,26 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
424
444
  uint8_t* ptr = bytes.data() + header_size_bytes;
425
445
  const uint8_t* end_ptr = ptr + size;
426
446
  const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
427
- ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
447
+ ptr += copy_to_mem(preamble_ints, ptr);
428
448
  const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
429
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
449
+ ptr += copy_to_mem(serial_version, ptr);
430
450
  const uint8_t family(FAMILY);
431
- ptr += copy_to_mem(&family, ptr, sizeof(family));
451
+ ptr += copy_to_mem(family, ptr);
432
452
  const uint8_t flags_byte(
433
453
  (is_empty() ? 1 << flags::IS_EMPTY : 0)
434
454
  | (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
435
455
  | (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
436
456
  );
437
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
438
- ptr += copy_to_mem(&k_, ptr, sizeof(k_));
439
- ptr += copy_to_mem(&m_, ptr, sizeof(m_));
440
- const uint8_t unused = 0;
441
- ptr += copy_to_mem(&unused, ptr, sizeof(unused));
457
+ ptr += copy_to_mem(flags_byte, ptr);
458
+ ptr += copy_to_mem(k_, ptr);
459
+ ptr += copy_to_mem(m_, ptr);
460
+ ptr += sizeof(uint8_t); // unused
442
461
  if (!is_empty()) {
443
462
  if (!is_single_item) {
444
- ptr += copy_to_mem(&n_, ptr, sizeof(n_));
445
- ptr += copy_to_mem(&min_k_, ptr, sizeof(min_k_));
446
- ptr += copy_to_mem(&num_levels_, ptr, sizeof(num_levels_));
447
- ptr += copy_to_mem(&unused, ptr, sizeof(unused));
463
+ ptr += copy_to_mem(n_, ptr);
464
+ ptr += copy_to_mem(min_k_, ptr);
465
+ ptr += copy_to_mem(num_levels_, ptr);
466
+ ptr += sizeof(uint8_t); // unused
448
467
  ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
449
468
  ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
450
469
  ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
@@ -459,20 +478,13 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
459
478
 
460
479
  template<typename T, typename C, typename S, typename A>
461
480
  kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
462
- uint8_t preamble_ints;
463
- is.read((char*)&preamble_ints, sizeof(preamble_ints));
464
- uint8_t serial_version;
465
- is.read((char*)&serial_version, sizeof(serial_version));
466
- uint8_t family_id;
467
- is.read((char*)&family_id, sizeof(family_id));
468
- uint8_t flags_byte;
469
- is.read((char*)&flags_byte, sizeof(flags_byte));
470
- uint16_t k;
471
- is.read((char*)&k, sizeof(k));
472
- uint8_t m;
473
- is.read((char*)&m, sizeof(m));
474
- uint8_t unused;
475
- is.read((char*)&unused, sizeof(unused));
481
+ const auto preamble_ints = read<uint8_t>(is);
482
+ const auto serial_version = read<uint8_t>(is);
483
+ const auto family_id = read<uint8_t>(is);
484
+ const auto flags_byte = read<uint8_t>(is);
485
+ const auto k = read<uint16_t>(is);
486
+ const auto m = read<uint8_t>(is);
487
+ read<uint8_t>(is); // skip unused byte
476
488
 
477
489
  check_m(m);
478
490
  check_preamble_ints(preamble_ints, flags_byte);
@@ -492,10 +504,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
492
504
  min_k = k;
493
505
  num_levels = 1;
494
506
  } else {
495
- is.read((char*)&n, sizeof(n_));
496
- is.read((char*)&min_k, sizeof(min_k_));
497
- is.read((char*)&num_levels, sizeof(num_levels));
498
- is.read((char*)&unused, sizeof(unused));
507
+ n = read<uint64_t>(is);
508
+ min_k = read<uint16_t>(is);
509
+ num_levels = read<uint8_t>(is);
510
+ read<uint8_t>(is); // skip unused byte
499
511
  }
500
512
  vector_u32<A> levels(num_levels + 1, 0, allocator);
501
513
  const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
@@ -503,7 +515,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
503
515
  levels[0] = capacity - 1;
504
516
  } else {
505
517
  // the last integer in levels_ is not serialized because it can be derived
506
- is.read((char*)levels.data(), sizeof(levels[0]) * num_levels);
518
+ read(is, levels.data(), sizeof(levels[0]) * num_levels);
507
519
  }
508
520
  levels[num_levels] = capacity;
509
521
  A alloc(allocator);
@@ -546,24 +558,24 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
546
558
  ensure_minimum_memory(size, 8);
547
559
  const char* ptr = static_cast<const char*>(bytes);
548
560
  uint8_t preamble_ints;
549
- ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
561
+ ptr += copy_from_mem(ptr, preamble_ints);
550
562
  uint8_t serial_version;
551
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
563
+ ptr += copy_from_mem(ptr, serial_version);
552
564
  uint8_t family_id;
553
- ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
565
+ ptr += copy_from_mem(ptr, family_id);
554
566
  uint8_t flags_byte;
555
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
567
+ ptr += copy_from_mem(ptr, flags_byte);
556
568
  uint16_t k;
557
- ptr += copy_from_mem(ptr, &k, sizeof(k));
569
+ ptr += copy_from_mem(ptr, k);
558
570
  uint8_t m;
559
- ptr += copy_from_mem(ptr, &m, sizeof(m));
560
- ptr++; // skip unused byte
571
+ ptr += copy_from_mem(ptr, m);
572
+ ptr += sizeof(uint8_t); // skip unused byte
561
573
 
562
574
  check_m(m);
563
575
  check_preamble_ints(preamble_ints, flags_byte);
564
576
  check_serial_version(serial_version);
565
577
  check_family_id(family_id);
566
- ensure_minimum_memory(size, 1 << preamble_ints);
578
+ ensure_minimum_memory(size, 1ULL << preamble_ints);
567
579
 
568
580
  const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
569
581
  if (is_empty) return kll_sketch<T, C, S, A>(k, allocator);
@@ -578,10 +590,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
578
590
  min_k = k;
579
591
  num_levels = 1;
580
592
  } else {
581
- ptr += copy_from_mem(ptr, &n, sizeof(n));
582
- ptr += copy_from_mem(ptr, &min_k, sizeof(min_k));
583
- ptr += copy_from_mem(ptr, &num_levels, sizeof(num_levels));
584
- ptr++; // skip unused byte
593
+ ptr += copy_from_mem(ptr, n);
594
+ ptr += copy_from_mem(ptr, min_k);
595
+ ptr += copy_from_mem(ptr, num_levels);
596
+ ptr += sizeof(uint8_t); // skip unused byte
585
597
  }
586
598
  vector_u32<A> levels(num_levels + 1, 0, allocator);
587
599
  const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
@@ -779,7 +791,7 @@ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantil
779
791
  using AllocCalc = typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>>;
780
792
  AllocCalc alloc(allocator_);
781
793
  std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
782
- new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(items_, levels_.data(), num_levels_, n_, allocator_),
794
+ new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(*this),
783
795
  [&alloc](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); alloc.deallocate(ptr, 1); }
784
796
  );
785
797
  return quantile_calculator;
@@ -1067,14 +1079,14 @@ typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::begin()
1067
1079
 
1068
1080
  template <typename T, typename C, typename S, typename A>
1069
1081
  typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::end() const {
1070
- return kll_sketch<T, C, S, A>::const_iterator(nullptr, nullptr, num_levels_);
1082
+ return kll_sketch<T, C, S, A>::const_iterator(nullptr, levels_.data(), num_levels_);
1071
1083
  }
1072
1084
 
1073
1085
  // kll_sketch::const_iterator implementation
1074
1086
 
1075
1087
  template<typename T, typename C, typename S, typename A>
1076
1088
  kll_sketch<T, C, S, A>::const_iterator::const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels):
1077
- items(items), levels(levels), num_levels(num_levels), index(levels == nullptr ? 0 : levels[0]), level(levels == nullptr ? num_levels : 0), weight(1)
1089
+ items(items), levels(levels), num_levels(num_levels), index(items == nullptr ? levels[num_levels] : levels[0]), level(items == nullptr ? num_levels : 0), weight(1)
1078
1090
  {}
1079
1091
 
1080
1092
  template<typename T, typename C, typename S, typename A>
@@ -1098,8 +1110,6 @@ typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_i
1098
1110
 
1099
1111
  template<typename T, typename C, typename S, typename A>
1100
1112
  bool kll_sketch<T, C, S, A>::const_iterator::operator==(const const_iterator& other) const {
1101
- if (level != other.level) return false;
1102
- if (level == num_levels) return true; // end
1103
1113
  return index == other.index;
1104
1114
  }
1105
1115