datasketches 0.2.3 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/README.md +8 -8
  4. data/ext/datasketches/kll_wrapper.cpp +7 -3
  5. data/ext/datasketches/theta_wrapper.cpp +20 -4
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +25 -5
  8. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  9. data/vendor/datasketches-cpp/NOTICE +6 -5
  10. data/vendor/datasketches-cpp/README.md +76 -9
  11. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  12. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  13. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  14. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  15. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  16. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  17. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  18. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  19. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  20. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +3 -1
  22. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  24. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  25. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  26. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  28. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  29. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  30. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +29 -11
  31. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  32. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  34. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  35. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  36. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  37. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  38. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  39. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  40. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  42. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  43. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  44. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  45. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  46. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  49. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  50. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +5 -2
  51. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +108 -41
  52. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +150 -132
  53. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +165 -31
  54. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  55. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  56. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  57. data/vendor/datasketches-cpp/python/README.md +13 -9
  58. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  59. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  60. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  61. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  62. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  63. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  64. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  65. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  66. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  67. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  68. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +656 -0
  69. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1373 -0
  70. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  71. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  72. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  73. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  74. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  75. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  76. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  77. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  78. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  79. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  80. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  81. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +975 -0
  82. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  83. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  84. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +6 -0
  85. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +30 -2
  86. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +73 -23
  87. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +95 -63
  88. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +74 -3
  89. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +44 -33
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  99. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  103. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  105. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  106. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  107. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  108. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  109. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  110. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  111. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +34 -9
  112. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  113. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  114. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  115. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  116. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  117. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  118. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  119. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  120. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  121. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  122. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  123. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  124. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  125. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  126. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  127. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  128. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  129. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  130. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  131. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  132. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  133. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  134. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  135. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  136. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  137. metadata +33 -12
  138. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  139. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  140. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  141. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  142. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -0,0 +1,1373 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _QUANTILES_SKETCH_IMPL_HPP_
21
+ #define _QUANTILES_SKETCH_IMPL_HPP_
22
+
23
+ #include <cmath>
24
+ #include <algorithm>
25
+ #include <stdexcept>
26
+ #include <iomanip>
27
+ #include <sstream>
28
+
29
+ #include "common_defs.hpp"
30
+ #include "count_zeros.hpp"
31
+ #include "conditional_forward.hpp"
32
+ #include "quantiles_sketch.hpp"
33
+
34
+ namespace datasketches {
35
+
36
+ template<typename T, typename C, typename A>
37
+ quantiles_sketch<T, C, A>::quantiles_sketch(uint16_t k, const A& allocator):
38
+ allocator_(allocator),
39
+ k_(k),
40
+ n_(0),
41
+ bit_pattern_(0),
42
+ base_buffer_(allocator_),
43
+ levels_(allocator_),
44
+ min_value_(nullptr),
45
+ max_value_(nullptr),
46
+ is_sorted_(true)
47
+ {
48
+ check_k(k_);
49
+ base_buffer_.reserve(2 * std::min(quantiles_constants::MIN_K, k));
50
+ }
51
+
52
+ template<typename T, typename C, typename A>
53
+ quantiles_sketch<T, C, A>::quantiles_sketch(const quantiles_sketch& other):
54
+ allocator_(other.allocator_),
55
+ k_(other.k_),
56
+ n_(other.n_),
57
+ bit_pattern_(other.bit_pattern_),
58
+ base_buffer_(other.base_buffer_),
59
+ levels_(other.levels_),
60
+ min_value_(nullptr),
61
+ max_value_(nullptr),
62
+ is_sorted_(other.is_sorted_)
63
+ {
64
+ if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
65
+ if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
66
+ for (size_t i = 0; i < levels_.size(); ++i) {
67
+ if (levels_[i].capacity() != other.levels_[i].capacity()) {
68
+ levels_[i].reserve(other.levels_[i].capacity());
69
+ }
70
+ }
71
+ }
72
+
73
+ template<typename T, typename C, typename A>
74
+ quantiles_sketch<T, C, A>::quantiles_sketch(quantiles_sketch&& other) noexcept:
75
+ allocator_(other.allocator_),
76
+ k_(other.k_),
77
+ n_(other.n_),
78
+ bit_pattern_(other.bit_pattern_),
79
+ base_buffer_(std::move(other.base_buffer_)),
80
+ levels_(std::move(other.levels_)),
81
+ min_value_(other.min_value_),
82
+ max_value_(other.max_value_),
83
+ is_sorted_(other.is_sorted_)
84
+ {
85
+ other.min_value_ = nullptr;
86
+ other.max_value_ = nullptr;
87
+ }
88
+
89
+ template<typename T, typename C, typename A>
90
+ quantiles_sketch<T, C, A>& quantiles_sketch<T, C, A>::operator=(const quantiles_sketch& other) {
91
+ quantiles_sketch<T, C, A> copy(other);
92
+ std::swap(allocator_, copy.allocator_);
93
+ std::swap(k_, copy.k_);
94
+ std::swap(n_, copy.n_);
95
+ std::swap(bit_pattern_, copy.bit_pattern_);
96
+ std::swap(base_buffer_, copy.base_buffer_);
97
+ std::swap(levels_, copy.levels_);
98
+ std::swap(min_value_, copy.min_value_);
99
+ std::swap(max_value_, copy.max_value_);
100
+ std::swap(is_sorted_, copy.is_sorted_);
101
+ return *this;
102
+ }
103
+
104
+ template<typename T, typename C, typename A>
105
+ quantiles_sketch<T, C, A>& quantiles_sketch<T, C, A>::operator=(quantiles_sketch&& other) noexcept {
106
+ std::swap(allocator_, other.allocator_);
107
+ std::swap(k_, other.k_);
108
+ std::swap(n_, other.n_);
109
+ std::swap(bit_pattern_, other.bit_pattern_);
110
+ std::swap(base_buffer_, other.base_buffer_);
111
+ std::swap(levels_, other.levels_);
112
+ std::swap(min_value_, other.min_value_);
113
+ std::swap(max_value_, other.max_value_);
114
+ std::swap(is_sorted_, other.is_sorted_);
115
+ return *this;
116
+ }
117
+
118
+ template<typename T, typename C, typename A>
119
+ quantiles_sketch<T, C, A>::quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
120
+ Level&& base_buffer, VectorLevels&& levels,
121
+ std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value,
122
+ bool is_sorted, const A& allocator) :
123
+ allocator_(allocator),
124
+ k_(k),
125
+ n_(n),
126
+ bit_pattern_(bit_pattern),
127
+ base_buffer_(std::move(base_buffer)),
128
+ levels_(std::move(levels)),
129
+ min_value_(min_value.release()),
130
+ max_value_(max_value.release()),
131
+ is_sorted_(is_sorted)
132
+ {
133
+ uint32_t item_count = base_buffer_.size();
134
+ for (Level& lvl : levels_) {
135
+ item_count += lvl.size();
136
+ }
137
+ if (item_count != compute_retained_items(k_, n_))
138
+ throw std::logic_error("Item count does not match value computed from k, n");
139
+ }
140
+
141
+ template<typename T, typename C, typename A>
142
+ template<typename From, typename FC, typename FA>
143
+ quantiles_sketch<T, C, A>::quantiles_sketch(const quantiles_sketch<From, FC, FA>& other, const A& allocator) :
144
+ allocator_(allocator),
145
+ k_(other.get_k()),
146
+ n_(other.get_n()),
147
+ bit_pattern_(compute_bit_pattern(other.get_k(), other.get_n())),
148
+ base_buffer_(allocator),
149
+ levels_(allocator),
150
+ min_value_(nullptr),
151
+ max_value_(nullptr),
152
+ is_sorted_(false)
153
+ {
154
+ static_assert(std::is_constructible<T, From>::value,
155
+ "Type converting constructor requires new type to be constructible from existing type");
156
+
157
+ base_buffer_.reserve(2 * std::min(quantiles_constants::MIN_K, k_));
158
+
159
+ if (!other.is_empty()) {
160
+ min_value_ = new (allocator_.allocate(1)) T(other.get_min_value());
161
+ max_value_ = new (allocator_.allocate(1)) T(other.get_max_value());
162
+
163
+ // reserve space in levels
164
+ const uint8_t num_levels = compute_levels_needed(k_, n_);
165
+ levels_.reserve(num_levels);
166
+ for (int i = 0; i < num_levels; ++i) {
167
+ Level level(allocator);
168
+ level.reserve(k_);
169
+ levels_.push_back(std::move(level));
170
+ }
171
+
172
+ // iterate through points, assigning to the correct level as needed
173
+ for (auto pair : other) {
174
+ const uint64_t wt = pair.second;
175
+ if (wt == 1) {
176
+ base_buffer_.push_back(T(pair.first));
177
+ // resize where needed as if adding points via update()
178
+ if (base_buffer_.size() + 1 > base_buffer_.capacity()) {
179
+ const size_t new_size = std::max(std::min(static_cast<size_t>(2 * k_), 2 * base_buffer_.size()), static_cast<size_t>(1));
180
+ base_buffer_.reserve(new_size);
181
+ }
182
+ }
183
+ else {
184
+ const uint8_t idx = count_trailing_zeros_in_u64(pair.second) - 1;
185
+ levels_[idx].push_back(T(pair.first));
186
+ }
187
+ }
188
+
189
+ // validate that ordering within each level is preserved
190
+ // base_buffer_ can be considered unsorted for this purpose
191
+ for (int i = 0; i < num_levels; ++i) {
192
+ if (!std::is_sorted(levels_[i].begin(), levels_[i].end(), C())) {
193
+ throw std::logic_error("Copy construction across types produces invalid sorting");
194
+ }
195
+ }
196
+ }
197
+ }
198
+
199
+
200
+ template<typename T, typename C, typename A>
201
+ quantiles_sketch<T, C, A>::~quantiles_sketch() {
202
+ if (min_value_ != nullptr) {
203
+ min_value_->~T();
204
+ allocator_.deallocate(min_value_, 1);
205
+ }
206
+ if (max_value_ != nullptr) {
207
+ max_value_->~T();
208
+ allocator_.deallocate(max_value_, 1);
209
+ }
210
+ }
211
+
212
+ template<typename T, typename C, typename A>
213
+ template<typename FwdT>
214
+ void quantiles_sketch<T, C, A>::update(FwdT&& item) {
215
+ if (!check_update_value(item)) { return; }
216
+ if (is_empty()) {
217
+ min_value_ = new (allocator_.allocate(1)) T(item);
218
+ max_value_ = new (allocator_.allocate(1)) T(item);
219
+ } else {
220
+ if (C()(item, *min_value_)) *min_value_ = item;
221
+ if (C()(*max_value_, item)) *max_value_ = item;
222
+ }
223
+
224
+ // if exceed capacity, grow until size 2k -- assumes eager processing
225
+ if (base_buffer_.size() + 1 > base_buffer_.capacity())
226
+ grow_base_buffer();
227
+
228
+ base_buffer_.push_back(std::forward<FwdT>(item));
229
+ ++n_;
230
+
231
+ if (base_buffer_.size() > 1)
232
+ is_sorted_ = false;
233
+
234
+ if (base_buffer_.size() == 2 * k_)
235
+ process_full_base_buffer();
236
+ }
237
+
238
+ template<typename T, typename C, typename A>
239
+ template<typename FwdSk>
240
+ void quantiles_sketch<T, C, A>::merge(FwdSk&& other) {
241
+ if (other.is_empty()) {
242
+ return; // nothing to do
243
+ } else if (!other.is_estimation_mode()) {
244
+ // other is exact, stream in regardless of k
245
+ for (auto item : other.base_buffer_) {
246
+ update(conditional_forward<FwdSk>(item));
247
+ }
248
+ return; // we're done
249
+ }
250
+
251
+ // we know other has data and is in estimation mode
252
+ if (is_estimation_mode()) {
253
+ if (k_ == other.get_k()) {
254
+ standard_merge(*this, other);
255
+ } else if (k_ > other.get_k()) {
256
+ quantiles_sketch sk_copy(other);
257
+ downsampling_merge(sk_copy, *this);
258
+ *this = sk_copy;
259
+ } else { // k_ < other.get_k()
260
+ downsampling_merge(*this, other);
261
+ }
262
+ } else {
263
+ // exact or empty
264
+ quantiles_sketch sk_copy(other);
265
+ if (k_ <= other.get_k()) {
266
+ if (!is_empty()) {
267
+ for (uint16_t i = 0; i < base_buffer_.size(); ++i) {
268
+ sk_copy.update(std::move(base_buffer_[i]));
269
+ }
270
+ }
271
+ } else { // k_ > other.get_k()
272
+ downsampling_merge(sk_copy, *this);
273
+ }
274
+ *this = sk_copy;
275
+ }
276
+ }
277
+
278
+ template<typename T, typename C, typename A>
279
+ template<typename SerDe>
280
+ void quantiles_sketch<T, C, A>::serialize(std::ostream& os, const SerDe& serde) const {
281
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
282
+ write(os, preamble_longs);
283
+ const uint8_t ser_ver = SERIAL_VERSION;
284
+ write(os, ser_ver);
285
+ const uint8_t family = FAMILY;
286
+ write(os, family);
287
+
288
+ // side-effect: sort base buffer since always compact
289
+ // can't set is_sorted_ since const method
290
+ std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
291
+
292
+ // empty, ordered, compact are valid flags
293
+ const uint8_t flags_byte(
294
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
295
+ | (1 << flags::IS_SORTED) // always sorted as side effect noted above
296
+ | (1 << flags::IS_COMPACT) // always compact -- could be optional for numeric types?
297
+ );
298
+ write(os, flags_byte);
299
+ write(os, k_);
300
+ const uint16_t unused = 0;
301
+ write(os, unused);
302
+
303
+ if (!is_empty()) {
304
+ write(os, n_);
305
+
306
+ // min and max
307
+ serde.serialize(os, min_value_, 1);
308
+ serde.serialize(os, max_value_, 1);
309
+
310
+ // base buffer items
311
+ serde.serialize(os, base_buffer_.data(), static_cast<unsigned>(base_buffer_.size()));
312
+
313
+ // levels, only when data is present
314
+ for (Level lvl : levels_) {
315
+ if (lvl.size() > 0)
316
+ serde.serialize(os, lvl.data(), static_cast<unsigned>(lvl.size()));
317
+ }
318
+ }
319
+ }
320
+
321
+ template<typename T, typename C, typename A>
322
+ template<typename SerDe>
323
+ auto quantiles_sketch<T, C, A>::serialize(unsigned header_size_bytes, const SerDe& serde) const -> vector_bytes {
324
+ const size_t size = get_serialized_size_bytes(serde) + header_size_bytes;
325
+ vector_bytes bytes(size, 0, allocator_);
326
+ uint8_t* ptr = bytes.data() + header_size_bytes;
327
+ const uint8_t* end_ptr = ptr + size;
328
+
329
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
330
+ ptr += copy_to_mem(preamble_longs, ptr);
331
+ const uint8_t ser_ver = SERIAL_VERSION;
332
+ ptr += copy_to_mem(ser_ver, ptr);
333
+ const uint8_t family = FAMILY;
334
+ ptr += copy_to_mem(family, ptr);
335
+
336
+ // side-effect: sort base buffer since always compact
337
+ // can't set is_sorted_ since const method
338
+ std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
339
+
340
+ // empty, ordered, compact are valid flags
341
+ const uint8_t flags_byte(
342
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
343
+ | (1 << flags::IS_SORTED) // always sorted as side effect noted above
344
+ | (1 << flags::IS_COMPACT) // always compact
345
+ );
346
+ ptr += copy_to_mem(flags_byte, ptr);
347
+ ptr += copy_to_mem(k_, ptr);
348
+ ptr += sizeof(uint16_t); // 2 unused bytes
349
+
350
+ if (!is_empty()) {
351
+
352
+ ptr += copy_to_mem(n_, ptr);
353
+
354
+ // min and max
355
+ ptr += serde.serialize(ptr, end_ptr - ptr, min_value_, 1);
356
+ ptr += serde.serialize(ptr, end_ptr - ptr, max_value_, 1);
357
+
358
+ // base buffer items
359
+ if (base_buffer_.size() > 0)
360
+ ptr += serde.serialize(ptr, end_ptr - ptr, base_buffer_.data(), static_cast<unsigned>(base_buffer_.size()));
361
+
362
+ // levels, only when data is present
363
+ for (Level lvl : levels_) {
364
+ if (lvl.size() > 0)
365
+ ptr += serde.serialize(ptr, end_ptr - ptr, lvl.data(), static_cast<unsigned>(lvl.size()));
366
+ }
367
+ }
368
+
369
+ return bytes;
370
+ }
371
+
372
+ template<typename T, typename C, typename A>
373
+ template<typename SerDe>
374
+ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde, const A &allocator) -> quantiles_sketch {
375
+ const auto preamble_longs = read<uint8_t>(is);
376
+ const auto serial_version = read<uint8_t>(is);
377
+ const auto family_id = read<uint8_t>(is);
378
+ const auto flags_byte = read<uint8_t>(is);
379
+ const auto k = read<uint16_t>(is);
380
+ read<uint16_t>(is); // unused
381
+
382
+ check_k(k);
383
+ check_serial_version(serial_version); // a little redundant with the header check
384
+ check_family_id(family_id);
385
+ check_header_validity(preamble_longs, flags_byte, serial_version);
386
+
387
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
388
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
389
+ if (is_empty) {
390
+ return quantiles_sketch(k, allocator);
391
+ }
392
+
393
+ const auto items_seen = read<uint64_t>(is);
394
+
395
+ const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
396
+ const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
397
+
398
+ A alloc(allocator);
399
+ auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
400
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
401
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
402
+ std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
403
+ std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
404
+
405
+ serde.deserialize(is, min_value_buffer.get(), 1);
406
+ // serde call did not throw, repackage with destrtuctor
407
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
408
+ serde.deserialize(is, max_value_buffer.get(), 1);
409
+ // serde call did not throw, repackage with destrtuctor
410
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
411
+
412
+ if (serial_version == 1) {
413
+ read<uint64_t>(is); // no longer used
414
+ }
415
+
416
+ // allocate buffers as needed
417
+ const uint8_t levels_needed = compute_levels_needed(k, items_seen);
418
+ const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
419
+
420
+ // Java provides a compact storage layout for a sketch of primitive doubles. The C++ version
421
+ // does not currently operate sketches in compact mode, but will only serialize as compact
422
+ // to avoid complications around serialization of empty values for generic type T. We also need
423
+ // to be able to ingest either serialized format from Java.
424
+
425
+ // load base buffer
426
+ const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
427
+ uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
428
+ Level base_buffer = deserialize_array(is, bb_items, 2 * k, serde, allocator);
429
+ if (items_to_read > bb_items) { // either equal or greater, never read fewer items
430
+ // read remaining items, but don't store them
431
+ deserialize_array(is, items_to_read - bb_items, items_to_read - bb_items, serde, allocator);
432
+ }
433
+
434
+ // populate vector of Levels directly
435
+ VectorLevels levels(allocator);
436
+ levels.reserve(levels_needed);
437
+ if (levels_needed > 0) {
438
+ uint64_t working_pattern = bit_pattern;
439
+ for (size_t i = 0; i < levels_needed; ++i, working_pattern >>= 1) {
440
+ if ((working_pattern & 0x01) == 1) {
441
+ Level level = deserialize_array(is, k, k, serde, allocator);
442
+ levels.push_back(std::move(level));
443
+ } else {
444
+ Level level(allocator);
445
+ level.reserve(k);
446
+ levels.push_back(std::move(level));
447
+ }
448
+ }
449
+ }
450
+
451
+ return quantiles_sketch(k, items_seen, bit_pattern,
452
+ std::move(base_buffer), std::move(levels), std::move(min_value), std::move(max_value), is_sorted, allocator);
453
+ }
454
+
455
+ template<typename T, typename C, typename A>
456
+ template<typename SerDe>
457
+ auto quantiles_sketch<T, C, A>::deserialize_array(std::istream& is, uint32_t num_items, uint32_t capacity, const SerDe& serde, const A& allocator) -> Level {
458
+ A alloc(allocator);
459
+ std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(allocator, false, num_items));
460
+ serde.deserialize(is, items.get(), num_items);
461
+ // serde did not throw, enable destructors
462
+ items.get_deleter().set_destroy(true);
463
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
464
+
465
+ // succesfully read, now put into a Level
466
+ Level level(allocator);
467
+ level.reserve(capacity);
468
+ level.insert(level.begin(),
469
+ std::make_move_iterator(items.get()),
470
+ std::make_move_iterator(items.get() + num_items));
471
+ return level;
472
+ }
473
+
474
+ template<typename T, typename C, typename A>
475
+ template<typename SerDe>
476
+ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, const SerDe& serde, const A &allocator) -> quantiles_sketch {
477
+ ensure_minimum_memory(size, 8);
478
+ const char* ptr = static_cast<const char*>(bytes);
479
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
480
+
481
+ uint8_t preamble_longs;
482
+ ptr += copy_from_mem(ptr, preamble_longs);
483
+ uint8_t serial_version;
484
+ ptr += copy_from_mem(ptr, serial_version);
485
+ uint8_t family_id;
486
+ ptr += copy_from_mem(ptr, family_id);
487
+ uint8_t flags_byte;
488
+ ptr += copy_from_mem(ptr, flags_byte);
489
+ uint16_t k;
490
+ ptr += copy_from_mem(ptr, k);
491
+ uint16_t unused;
492
+ ptr += copy_from_mem(ptr, unused);
493
+
494
+ check_k(k);
495
+ check_serial_version(serial_version); // a little redundant with the header check
496
+ check_family_id(family_id);
497
+ check_header_validity(preamble_longs, flags_byte, serial_version);
498
+
499
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
500
+ if (is_empty) {
501
+ return quantiles_sketch(k, allocator);
502
+ }
503
+
504
+ ensure_minimum_memory(size, 16);
505
+ uint64_t items_seen;
506
+ ptr += copy_from_mem(ptr, items_seen);
507
+
508
+ const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
509
+ const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
510
+
511
+ A alloc(allocator);
512
+ auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
513
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
514
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
515
+ std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
516
+ std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
517
+
518
+ ptr += serde.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
519
+ // serde call did not throw, repackage with destrtuctor
520
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
521
+ ptr += serde.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
522
+ // serde call did not throw, repackage with destrtuctor
523
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
524
+
525
+ if (serial_version == 1) {
526
+ uint64_t unused_long;
527
+ ptr += copy_from_mem(ptr, unused_long); // no longer used
528
+ }
529
+
530
+ // allocate buffers as needed
531
+ const uint8_t levels_needed = compute_levels_needed(k, items_seen);
532
+ const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
533
+
534
+ // Java provides a compact storage layout for a sketch of primitive doubles. The C++ version
535
+ // does not currently operate sketches in compact mode, but will only serialize as compact
536
+ // to avoid complications around serialization of empty values for generic type T. We also need
537
+ // to be able to ingest either serialized format from Java.
538
+
539
+ // load base buffer
540
+ const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
541
+ uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
542
+ auto base_buffer_pair = deserialize_array(ptr, end_ptr - ptr, bb_items, 2 * k, serde, allocator);
543
+ ptr += base_buffer_pair.second;
544
+ if (items_to_read > bb_items) { // either equal or greater, never read fewer items
545
+ // read remaining items, only use to advance the pointer
546
+ auto extras = deserialize_array(ptr, end_ptr - ptr, items_to_read - bb_items, items_to_read - bb_items, serde, allocator);
547
+ ptr += extras.second;
548
+ }
549
+
550
+ // populate vector of Levels directly
551
+ VectorLevels levels(allocator);
552
+ levels.reserve(levels_needed);
553
+ if (levels_needed > 0) {
554
+ uint64_t working_pattern = bit_pattern;
555
+ for (size_t i = 0; i < levels_needed; ++i, working_pattern >>= 1) {
556
+
557
+ if ((working_pattern & 0x01) == 1) {
558
+ auto pair = deserialize_array(ptr, end_ptr - ptr, k, k, serde, allocator);
559
+ ptr += pair.second;
560
+ levels.push_back(std::move(pair.first));
561
+ } else {
562
+ Level level(allocator);
563
+ level.reserve(k);
564
+ levels.push_back(std::move(level));
565
+ }
566
+ }
567
+ }
568
+
569
+ return quantiles_sketch(k, items_seen, bit_pattern,
570
+ std::move(base_buffer_pair.first), std::move(levels), std::move(min_value), std::move(max_value), is_sorted, allocator);
571
+ }
572
+
573
// Deserializes exactly num_items items from 'bytes' into a freshly built Level
// with the requested capacity. Returns the Level and the number of input bytes
// consumed. Exception-safe: if the serde throws mid-array, the items_deleter
// releases only raw memory (destroy == false) since no items were fully read.
template<typename T, typename C, typename A>
template<typename SerDe>
auto quantiles_sketch<T, C, A>::deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capacity, const SerDe& serde, const A& allocator)
-> std::pair<Level, size_t> {
  const char* ptr = static_cast<const char*>(bytes);
  const char* end_ptr = static_cast<const char*>(bytes) + size;
  A alloc(allocator);
  // deleter starts in "no destroy" mode: only deallocate on failure
  std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(allocator, false, num_items));
  ptr += serde.deserialize(ptr, end_ptr - ptr, items.get(), num_items);
  // serde did not throw, enable destructors
  items.get_deleter().set_destroy(true);

  // successfully read, now put into a Level
  Level level(allocator);
  level.reserve(capacity);
  level.insert(level.begin(),
    std::make_move_iterator(items.get()),
    std::make_move_iterator(items.get() + num_items));

  return std::pair<Level, size_t>(std::move(level), ptr - static_cast<const char*>(bytes));
}
594
+
595
+ template<typename T, typename C, typename A>
596
+ string<A> quantiles_sketch<T, C, A>::to_string(bool print_levels, bool print_items) const {
597
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
598
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
599
+ std::ostringstream os;
600
+ os << "### Quantiles Sketch summary:" << std::endl;
601
+ os << " K : " << k_ << std::endl;
602
+ os << " N : " << n_ << std::endl;
603
+ os << " Epsilon : " << std::setprecision(3) << get_normalized_rank_error(false) * 100 << "%" << std::endl;
604
+ os << " Epsilon PMF : " << get_normalized_rank_error(true) * 100 << "%" << std::endl;
605
+ os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
606
+ os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
607
+ os << " Levels (w/o BB): " << levels_.size() << std::endl;
608
+ os << " Used Levels : " << compute_valid_levels(bit_pattern_) << std::endl;
609
+ os << " Retained items : " << get_num_retained() << std::endl;
610
+ if (!is_empty()) {
611
+ os << " Min value : " << *min_value_ << std::endl;
612
+ os << " Max value : " << *max_value_ << std::endl;
613
+ }
614
+ os << "### End sketch summary" << std::endl;
615
+
616
+ if (print_levels) {
617
+ os << "### Quantiles Sketch levels:" << std::endl;
618
+ os << " index: items in use" << std::endl;
619
+ os << " BB: " << base_buffer_.size() << std::endl;
620
+ for (uint8_t i = 0; i < levels_.size(); i++) {
621
+ os << " " << static_cast<unsigned int>(i) << ": " << levels_[i].size() << std::endl;
622
+ }
623
+ os << "### End sketch levels" << std::endl;
624
+ }
625
+
626
+ if (print_items) {
627
+ os << "### Quantiles Sketch data:" << std::endl;
628
+ uint8_t level = 0;
629
+ os << " BB:" << std::endl;
630
+ for (const T& item : base_buffer_) {
631
+ os << " " << std::to_string(item) << std::endl;
632
+ }
633
+ for (uint8_t i = 0; i < levels_.size(); ++i) {
634
+ os << " level " << static_cast<unsigned int>(level) << ":" << std::endl;
635
+ for (const T& item : levels_[i]) {
636
+ os << " " << std::to_string(item) << std::endl;
637
+ }
638
+ }
639
+ os << "### End sketch data" << std::endl;
640
+ }
641
+ return string<A>(os.str().c_str(), allocator_);
642
+ }
643
+
644
+ template<typename T, typename C, typename A>
645
+ uint16_t quantiles_sketch<T, C, A>::get_k() const {
646
+ return k_;
647
+ }
648
+
649
+ template<typename T, typename C, typename A>
650
+ uint64_t quantiles_sketch<T, C, A>::get_n() const {
651
+ return n_;
652
+ }
653
+
654
+ template<typename T, typename C, typename A>
655
+ bool quantiles_sketch<T, C, A>::is_empty() const {
656
+ return n_ == 0;
657
+ }
658
+
659
+ template<typename T, typename C, typename A>
660
+ bool quantiles_sketch<T, C, A>::is_estimation_mode() const {
661
+ return bit_pattern_ != 0;
662
+ }
663
+
664
+ template<typename T, typename C, typename A>
665
+ uint32_t quantiles_sketch<T, C, A>::get_num_retained() const {
666
+ return compute_retained_items(k_, n_);
667
+ }
668
+
669
+ template<typename T, typename C, typename A>
670
+ const T& quantiles_sketch<T, C, A>::get_min_value() const {
671
+ if (is_empty()) return get_invalid_value();
672
+ return *min_value_;
673
+ }
674
+
675
+ template<typename T, typename C, typename A>
676
+ const T& quantiles_sketch<T, C, A>::get_max_value() const {
677
+ if (is_empty()) return get_invalid_value();
678
+ return *max_value_;
679
+ }
680
+
681
+ template<typename T, typename C, typename A>
682
+ C quantiles_sketch<T, C, A>::get_comparator() const {
683
+ return C();
684
+ }
685
+
686
+ template<typename T, typename C, typename A>
687
+ A quantiles_sketch<T, C, A>::get_allocator() const {
688
+ return allocator_;
689
+ }
690
+
691
+ // implementation for fixed-size arithmetic types (integral and floating point)
692
+ template<typename T, typename C, typename A>
693
+ template<typename SerDe, typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
694
+ size_t quantiles_sketch<T, C, A>::get_serialized_size_bytes(const SerDe&) const {
695
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
696
+ return DATA_START + ((get_num_retained() + 2) * sizeof(TT));
697
+ }
698
+
699
+ // implementation for all other types
700
+ template<typename T, typename C, typename A>
701
+ template<typename SerDe, typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
702
+ size_t quantiles_sketch<T, C, A>::get_serialized_size_bytes(const SerDe& serde) const {
703
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
704
+ size_t size = DATA_START;
705
+ size += serde.size_of_item(*min_value_);
706
+ size += serde.size_of_item(*max_value_);
707
+ for (auto it: *this) size += serde.size_of_item(it.first);
708
+ return size;
709
+ }
710
+
711
+ template<typename T, typename C, typename A>
712
+ double quantiles_sketch<T, C, A>::get_normalized_rank_error(bool is_pmf) const {
713
+ return get_normalized_rank_error(k_, is_pmf);
714
+ }
715
+
716
+ template<typename T, typename C, typename A>
717
+ double quantiles_sketch<T, C, A>::get_normalized_rank_error(uint16_t k, bool is_pmf) {
718
+ return is_pmf
719
+ ? 1.854 / std::pow(k, 0.9657)
720
+ : 1.576 / std::pow(k, 0.9726);
721
+ }
722
+
723
// Builds a sorted view over all retained items with their weights.
// Side effect: may sort the base buffer in place (via const_cast) even though
// this method is const; the is_sorted_ flag is intentionally NOT updated here.
template<typename T, typename C, typename A>
template<bool inclusive>
quantile_sketch_sorted_view<T, C, A> quantiles_sketch<T, C, A>::get_sorted_view(bool cumulative) const {
  // allow side-effect of sorting the base buffer; can't set the flag since
  // this is a const method
  if (!is_sorted_) {
    std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
  }
  quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);

  // base buffer items carry weight 1; each level above doubles the weight
  uint64_t weight = 1;
  view.add(base_buffer_.begin(), base_buffer_.end(), weight);
  for (auto& level : levels_) {
    weight <<= 1;
    if (level.empty()) { continue; }
    view.add(level.begin(), level.end(), weight);
  }

  if (cumulative) view.template convert_to_cummulative<inclusive>();
  return view;
}
744
+
745
+ template<typename T, typename C, typename A>
746
+ template<bool inclusive>
747
+ auto quantiles_sketch<T, C, A>::get_quantile(double rank) const -> quantile_return_type {
748
+ if (is_empty()) return get_invalid_value();
749
+ if (rank == 0.0) return *min_value_;
750
+ if (rank == 1.0) return *max_value_;
751
+ if ((rank < 0.0) || (rank > 1.0)) {
752
+ throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
753
+ }
754
+ // possible side-effect: sorting base buffer
755
+ return get_sorted_view<inclusive>(true).get_quantile(rank);
756
+ }
757
+
758
+ template<typename T, typename C, typename A>
759
+ template<bool inclusive>
760
+ std::vector<T, A> quantiles_sketch<T, C, A>::get_quantiles(const double* ranks, uint32_t size) const {
761
+ std::vector<T, A> quantiles(allocator_);
762
+ if (is_empty()) return quantiles;
763
+ quantiles.reserve(size);
764
+
765
+ // possible side-effect: sorting base buffer
766
+ auto view = get_sorted_view<inclusive>(true);
767
+
768
+ for (uint32_t i = 0; i < size; ++i) {
769
+ const double rank = ranks[i];
770
+ if ((rank < 0.0) || (rank > 1.0)) {
771
+ throw std::invalid_argument("rank cannot be less than zero or greater than 1.0");
772
+ }
773
+ if (rank == 0.0) quantiles.push_back(*min_value_);
774
+ else if (rank == 1.0) quantiles.push_back(*max_value_);
775
+ else {
776
+ quantiles.push_back(view.get_quantile(rank));
777
+ }
778
+ }
779
+ return quantiles;
780
+ }
781
+
782
+ template<typename T, typename C, typename A>
783
+ template<bool inclusive>
784
+ std::vector<T, A> quantiles_sketch<T, C, A>::get_quantiles(uint32_t num) const {
785
+ if (is_empty()) return std::vector<T, A>(allocator_);
786
+ if (num == 0) {
787
+ throw std::invalid_argument("num must be > 0");
788
+ }
789
+ vector_double fractions(num, 0, allocator_);
790
+ fractions[0] = 0.0;
791
+ for (size_t i = 1; i < num; i++) {
792
+ fractions[i] = static_cast<double>(i) / (num - 1);
793
+ }
794
+ if (num > 1) {
795
+ fractions[num - 1] = 1.0;
796
+ }
797
+ return get_quantiles<inclusive>(fractions.data(), num);
798
+ }
799
+
800
// Returns the normalized rank of 'value': the fraction of total stream weight
// below it (or at-or-below it when 'inclusive'). NaN for an empty sketch.
template<typename T, typename C, typename A>
template<bool inclusive>
double quantiles_sketch<T, C, A>::get_rank(const T& value) const {
  if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
  uint64_t weight = 1; // base buffer items each carry weight 1
  uint64_t total = 0;
  // base buffer may be unsorted, so every item must be compared
  for (const T &item: base_buffer_) {
    if (inclusive ? !C()(value, item) : C()(item, value))
      total += weight;
  }

  weight *= 2; // level 0 carries weight 2; each level above doubles again
  for (uint8_t level = 0; level < levels_.size(); ++level, weight *= 2) {
    if (levels_[level].empty()) { continue; }
    // a non-empty level always holds exactly k_ items
    const T* data = levels_[level].data();
    for (uint16_t i = 0; i < k_; ++i) {
      if (inclusive ? !C()(value, data[i]) : C()(data[i], value))
        total += weight;
      else
        break; // levels are sorted, no point comparing further
    }
  }
  return (double) total / n_;
}
824
+
825
+ template<typename T, typename C, typename A>
826
+ template<bool inclusive>
827
+ auto quantiles_sketch<T, C, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
828
+ auto buckets = get_CDF<inclusive>(split_points, size);
829
+ if (is_empty()) return buckets;
830
+ for (uint32_t i = size; i > 0; --i) {
831
+ buckets[i] -= buckets[i - 1];
832
+ }
833
+ return buckets;
834
+ }
835
+
836
+ template<typename T, typename C, typename A>
837
+ template<bool inclusive>
838
+ auto quantiles_sketch<T, C, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
839
+ vector_double buckets(allocator_);
840
+ if (is_empty()) return buckets;
841
+ check_split_points(split_points, size);
842
+ buckets.reserve(size + 1);
843
+ for (uint32_t i = 0; i < size; ++i) buckets.push_back(get_rank<inclusive>(split_points[i]));
844
+ buckets.push_back(1);
845
+ return buckets;
846
+ }
847
+
848
+ template<typename T, typename C, typename A>
849
+ uint32_t quantiles_sketch<T, C, A>::compute_retained_items(const uint16_t k, const uint64_t n) {
850
+ const uint32_t bb_count = compute_base_buffer_items(k, n);
851
+ const uint64_t bit_pattern = compute_bit_pattern(k, n);
852
+ const uint32_t valid_levels = compute_valid_levels(bit_pattern);
853
+ return bb_count + (k * valid_levels);
854
+ }
855
+
856
+ template<typename T, typename C, typename A>
857
+ uint32_t quantiles_sketch<T, C, A>::compute_base_buffer_items(const uint16_t k, const uint64_t n) {
858
+ return n % (static_cast<uint64_t>(2) * k);
859
+ }
860
+
861
+ template<typename T, typename C, typename A>
862
+ uint64_t quantiles_sketch<T, C, A>::compute_bit_pattern(const uint16_t k, const uint64_t n) {
863
+ return n / (static_cast<uint64_t>(2) * k);
864
+ }
865
+
866
+ template<typename T, typename C, typename A>
867
+ uint32_t quantiles_sketch<T, C, A>::compute_valid_levels(const uint64_t bit_pattern) {
868
+ // TODO: Java's Long.bitCount() probably uses a better method
869
+ uint64_t bp = bit_pattern;
870
+ uint32_t count = 0;
871
+ while (bp > 0) {
872
+ if ((bp & 0x01) == 1) ++count;
873
+ bp >>= 1;
874
+ }
875
+ return count;
876
+ }
877
+
878
+ template<typename T, typename C, typename A>
879
+ uint8_t quantiles_sketch<T, C, A>::compute_levels_needed(const uint16_t k, const uint64_t n) {
880
+ return static_cast<uint8_t>(64U) - count_leading_zeros_in_u64(n / (2 * k));
881
+ }
882
+
883
+ template<typename T, typename C, typename A>
884
+ void quantiles_sketch<T, C, A>::check_k(uint16_t k) {
885
+ if (k < quantiles_constants::MIN_K || k > quantiles_constants::MAX_K || (k & (k - 1)) != 0) {
886
+ throw std::invalid_argument("k must be a power of 2 that is >= "
887
+ + std::to_string(quantiles_constants::MIN_K) + " and <= "
888
+ + std::to_string(quantiles_constants::MAX_K) + ". Found: " + std::to_string(k));
889
+ }
890
+ }
891
+
892
+ template<typename T, typename C, typename A>
893
+ void quantiles_sketch<T, C, A>::check_serial_version(uint8_t serial_version) {
894
+ if (serial_version == SERIAL_VERSION || serial_version == SERIAL_VERSION_1 || serial_version == SERIAL_VERSION_2)
895
+ return;
896
+ else
897
+ throw std::invalid_argument("Possible corruption. Unrecognized serialization version: " + std::to_string(serial_version));
898
+ }
899
+
900
+ template<typename T, typename C, typename A>
901
+ void quantiles_sketch<T, C, A>::check_family_id(uint8_t family_id) {
902
+ if (family_id == FAMILY)
903
+ return;
904
+ else
905
+ throw std::invalid_argument("Possible corruption. Family id does not indicate quantiles sketch: " + std::to_string(family_id));
906
+ }
907
+
908
+ template<typename T, typename C, typename A>
909
+ void quantiles_sketch<T, C, A>::check_header_validity(uint8_t preamble_longs, uint8_t flags_byte, uint8_t serial_version) {
910
+ const bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
911
+ const bool compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
912
+
913
+ const uint8_t sw = (compact ? 1 : 0) + (2 * (empty ? 1 : 0))
914
+ + (4 * (serial_version & 0xF)) + (32 * (preamble_longs & 0x3F));
915
+ bool valid = true;
916
+
917
+ switch (sw) { // exhaustive list and description of all valid cases
918
+ case 38 : break; //!compact, empty, serVer = 1, preLongs = 1; always stored as not compact
919
+ case 164 : break; //!compact, !empty, serVer = 1, preLongs = 5; always stored as not compact
920
+ case 42 : break; //!compact, empty, serVer = 2, preLongs = 1; always stored as compact
921
+ case 72 : break; //!compact, !empty, serVer = 2, preLongs = 2; always stored as compact
922
+ case 47 : break; // compact, empty, serVer = 3, preLongs = 1;
923
+ case 46 : break; //!compact, empty, serVer = 3, preLongs = 1;
924
+ case 79 : break; // compact, empty, serVer = 3, preLongs = 2;
925
+ case 78 : break; //!compact, empty, serVer = 3, preLongs = 2;
926
+ case 77 : break; // compact, !empty, serVer = 3, preLongs = 2;
927
+ case 76 : break; //!compact, !empty, serVer = 3, preLongs = 2;
928
+ default : //all other case values are invalid
929
+ valid = false;
930
+ }
931
+
932
+ if (!valid) {
933
+ std::ostringstream os;
934
+ os << "Possible sketch corruption. Inconsistent state: "
935
+ << "preamble_longs = " << preamble_longs
936
+ << ", empty = " << (empty ? "true" : "false")
937
+ << ", serialization_version = " << serial_version
938
+ << ", compact = " << (compact ? "true" : "false");
939
+ throw std::invalid_argument(os.str());
940
+ }
941
+ }
942
+
943
+ template <typename T, typename C, typename A>
944
+ typename quantiles_sketch<T, C, A>::const_iterator quantiles_sketch<T, C, A>::begin() const {
945
+ return quantiles_sketch<T, C, A>::const_iterator(base_buffer_, levels_, k_, n_, false);
946
+ }
947
+
948
+ template <typename T, typename C, typename A>
949
+ typename quantiles_sketch<T, C, A>::const_iterator quantiles_sketch<T, C, A>::end() const {
950
+ return quantiles_sketch<T, C, A>::const_iterator(base_buffer_, levels_, k_, n_, true);
951
+ }
952
+
953
+ template<typename T, typename C, typename A>
954
+ void quantiles_sketch<T, C, A>::grow_base_buffer() {
955
+ const size_t new_size = std::max(std::min(static_cast<size_t>(2 * k_), 2 * base_buffer_.size()), static_cast<size_t>(1));
956
+ base_buffer_.reserve(new_size);
957
+ }
958
+
959
+ template<typename T, typename C, typename A>
960
+ void quantiles_sketch<T, C, A>::process_full_base_buffer() {
961
+ // make sure there will be enough levels for the propagation
962
+ grow_levels_if_needed(); // note: n_ was already incremented by update() before this
963
+
964
+ std::sort(base_buffer_.begin(), base_buffer_.end(), C());
965
+ in_place_propagate_carry(0,
966
+ levels_[0], // unused here, but 0 is guaranteed to exist
967
+ base_buffer_,
968
+ true, *this);
969
+ base_buffer_.clear();
970
+ is_sorted_ = true;
971
+ if (n_ / (2 * k_) != bit_pattern_) {
972
+ throw std::logic_error("Internal error: n / 2k (" + std::to_string(n_ / 2 * k_)
973
+ + " != bit_pattern " + std::to_string(bit_pattern_));
974
+ }
975
+ }
976
+
977
+ template<typename T, typename C, typename A>
978
+ bool quantiles_sketch<T, C, A>::grow_levels_if_needed() {
979
+ const uint8_t levels_needed = compute_levels_needed(k_, n_);
980
+ if (levels_needed == 0)
981
+ return false; // don't need levels and might have small base buffer. Possible during merges.
982
+
983
+ // from here on, assume full size base buffer (2k) and at least one additional level
984
+ if (levels_needed <= levels_.size())
985
+ return false;
986
+
987
+ Level empty_level(allocator_);
988
+ empty_level.reserve(k_);
989
+ levels_.push_back(std::move(empty_level));
990
+ return true;
991
+ }
992
+
993
// Propagates a full buffer up the levels structure, like binary ripple-carry
// addition: the carry travels from starting_level to the first unoccupied
// level (ending_level), merging and halving (zipping) at each occupied level.
// When apply_as_update is true, buf_size_2k holds the items to propagate and
// buf_size_k is unused; otherwise (merge path) buf_size_k holds k items and
// buf_size_2k is scratch space.
template<typename T, typename C, typename A>
template<typename FwdV>
void quantiles_sketch<T, C, A>::in_place_propagate_carry(uint8_t starting_level,
                                                         FwdV&& buf_size_k, Level& buf_size_2k,
                                                         bool apply_as_update,
                                                         quantiles_sketch& sketch) {
  const uint64_t bit_pattern = sketch.bit_pattern_;
  const int k = sketch.k_;

  // first unoccupied level at or above starting_level receives the carry
  uint8_t ending_level = lowest_zero_bit_starting_at(bit_pattern, starting_level);

  if (apply_as_update) {
    // update version of computation
    // it is okay for buf_size_k to be null in this case
    zip_buffer(buf_size_2k, sketch.levels_[ending_level]);
  } else {
    // merge_into version of computation
    for (uint16_t i = 0; i < k; ++i) {
      sketch.levels_[ending_level].push_back(conditional_forward<FwdV>(buf_size_k[i]));
    }
  }

  // merge each occupied lower level into the carry, halving each time
  for (uint64_t lvl = starting_level; lvl < ending_level; lvl++) {
    if ((bit_pattern & (static_cast<uint64_t>(1) << lvl)) == 0) {
      throw std::logic_error("unexpected empty level in bit_pattern");
    }
    merge_two_size_k_buffers(
        sketch.levels_[lvl],
        sketch.levels_[ending_level],
        buf_size_2k);
    sketch.levels_[lvl].clear();
    sketch.levels_[ending_level].clear();
    zip_buffer(buf_size_2k, sketch.levels_[ending_level]);
  } // end of loop over lower levels

  // update bit pattern with binary-arithmetic ripple carry
  sketch.bit_pattern_ = bit_pattern + (static_cast<uint64_t>(1) << starting_level);
}
1031
+
1032
// Compacts a sorted buffer of size 2k down to k items by keeping every other
// item, starting from a random offset (0 or 1) so the selection is unbiased.
// Moves items out of buf_in and clears it.
template<typename T, typename C, typename A>
void quantiles_sketch<T, C, A>::zip_buffer(Level& buf_in, Level& buf_out) {
#ifdef QUANTILES_VALIDATION
  // deterministic alternation for validation builds only
  static uint32_t next_offset = 0;
  uint32_t rand_offset = next_offset;
  next_offset = 1 - next_offset;
#else
  uint32_t rand_offset = random_bit();
#endif
  if ((buf_in.size() != 2 * buf_out.capacity())
      || (buf_out.size() > 0)) {
    throw std::logic_error("zip_buffer requires buf_in.size() == "
        "2*buf_out.capacity() and empty buf_out");
  }

  size_t k = buf_out.capacity();
  for (uint32_t i = rand_offset, o = 0; o < k; i += 2, ++o) {
    buf_out.push_back(std::move(buf_in[i]));
  }
  buf_in.clear();
}
1053
+
1054
// Like zip_buffer, but selects every stride-th item (random starting offset
// in [0, stride)) to downsample a buffer of size stride*k down to k items.
// Used by downsampling_merge. The input buffer is left intact.
template<typename T, typename C, typename A>
template<typename FwdV>
void quantiles_sketch<T, C, A>::zip_buffer_with_stride(FwdV&& buf_in, Level& buf_out, uint16_t stride) {
  // Random offset in range [0, stride)
  std::uniform_int_distribution<uint16_t> dist(0, stride - 1);
  const uint16_t rand_offset = dist(random_utils::rand);

  if ((buf_in.size() != stride * buf_out.capacity())
      || (buf_out.size() > 0)) {
    throw std::logic_error("zip_buffer_with_stride requires buf_in.size() == "
        "stride*buf_out.capacity() and empty buf_out");
  }

  const size_t k = buf_out.capacity();
  for (uint16_t i = rand_offset, o = 0; o < k; i += stride, ++o) {
    buf_out.push_back(conditional_forward<FwdV>(buf_in[i]));
  }
  // do not clear input buffer
}
1073
+
1074
+
1075
// Standard two-way merge of two sorted size-k buffers into dst (capacity 2k,
// must be empty). Items are moved out of the sources; callers clear them after.
template<typename T, typename C, typename A>
void quantiles_sketch<T, C, A>::merge_two_size_k_buffers(Level& src_1, Level& src_2, Level& dst) {
  if (src_1.size() != src_2.size()
      || src_1.size() * 2 != dst.capacity()
      || dst.size() != 0) {
    throw std::logic_error("Input invariants violated in merge_two_size_k_buffers()");
  }

  auto end1 = src_1.end(), end2 = src_2.end();
  auto it1 = src_1.begin(), it2 = src_2.begin();

  // TODO: probably actually doing copies given Level&?
  while (it1 != end1 && it2 != end2) {
    if (C()(*it1, *it2)) {
      dst.push_back(std::move(*it1++));
    } else {
      dst.push_back(std::move(*it2++));
    }
  }

  // exactly one source still has items; append its tail
  if (it1 != end1) {
    dst.insert(dst.end(), it1, end1);
  } else {
    if (it2 == end2) { throw std::logic_error("it2 unexpectedly already at end of range"); }
    dst.insert(dst.end(), it2, end2);
  }
}
1102
+
1103
+
1104
// Merges src into tgt when both sketches have the same k. Base buffer items
// are fed through the normal update path; each occupied src level is carried
// into tgt via in_place_propagate_carry. FwdSk controls whether src's items
// are copied or moved (conditional_forward).
template<typename T, typename C, typename A>
template<typename FwdSk>
void quantiles_sketch<T, C, A>::standard_merge(quantiles_sketch& tgt, FwdSk&& src) {
  if (src.get_k() != tgt.get_k()) {
    throw std::invalid_argument("src.get_k() != tgt.get_k()");
  }
  if (src.is_empty()) {
    return;
  }

  uint64_t new_n = src.get_n() + tgt.get_n();

  // move items from src's base buffer
  for (uint16_t i = 0; i < src.base_buffer_.size(); ++i) {
    tgt.update(conditional_forward<FwdSk>(src.base_buffer_[i]));
  }

  // check (after moving raw items) if we need to extend levels array
  uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
  if (levels_needed > tgt.levels_.size()) {
    tgt.levels_.reserve(levels_needed);
    while (tgt.levels_.size() < levels_needed) {
      Level empty_level(tgt.allocator_);
      empty_level.reserve(tgt.get_k());
      tgt.levels_.push_back(std::move(empty_level));
    }
  }

  // scratch space reused for every level propagation
  Level scratch_buf(tgt.allocator_);
  scratch_buf.reserve(2 * tgt.get_k());

  // walk src's occupied levels from the bottom up
  uint64_t src_pattern = src.bit_pattern_;
  for (uint8_t src_lvl = 0; src_pattern != 0; ++src_lvl, src_pattern >>= 1) {
    if ((src_pattern & 1) > 0) {
      scratch_buf.clear();

      // propagate-carry
      in_place_propagate_carry(src_lvl,
                               src.levels_[src_lvl], scratch_buf,
                               false, tgt);
      // update n_ at the end
    }
  }
  tgt.n_ = new_n;
  if ((tgt.get_n() / (2 * tgt.get_k())) != tgt.bit_pattern_) {
    throw std::logic_error("Failed internal consistency check after standard_merge()");
  }

  // update min and max values
  // can't just check is_empty() since min and max might not have been set if
  // there were no base buffer items added via update()
  if (tgt.min_value_ == nullptr) {
    tgt.min_value_ = new (tgt.allocator_.allocate(1)) T(*src.min_value_);
  } else {
    if (C()(*src.min_value_, *tgt.min_value_))
      *tgt.min_value_ = conditional_forward<FwdSk>(*src.min_value_);
  }

  if (tgt.max_value_ == nullptr) {
    tgt.max_value_ = new (tgt.allocator_.allocate(1)) T(*src.max_value_);
  } else {
    if (C()(*tgt.max_value_, *src.max_value_))
      *tgt.max_value_ = conditional_forward<FwdSk>(*src.max_value_);
  }
}
1169
+
1170
+
1171
// Merges src into tgt when src.k is a larger multiple of tgt.k. Each occupied
// src level of size src.k is first downsampled (strided zip) to tgt.k items,
// which shifts the level up by lg(downsample_factor) before the carry
// propagation. Base buffer items go through the normal update path.
template<typename T, typename C, typename A>
template<typename FwdSk>
void quantiles_sketch<T, C, A>::downsampling_merge(quantiles_sketch& tgt, FwdSk&& src) {
  if (src.get_k() % tgt.get_k() != 0) {
    throw std::invalid_argument("src.get_k() is not a multiple of tgt.get_k()");
  }
  if (src.is_empty()) {
    return;
  }

  // factor is a power of two (both k values are), so lg is exact
  const uint16_t downsample_factor = src.get_k() / tgt.get_k();
  const uint8_t lg_sample_factor = count_trailing_zeros_in_u32(downsample_factor);

  const uint64_t new_n = src.get_n() + tgt.get_n();

  // move items from src's base buffer
  for (uint16_t i = 0; i < src.base_buffer_.size(); ++i) {
    tgt.update(conditional_forward<FwdSk>(src.base_buffer_[i]));
  }

  // check (after moving raw items) if we need to extend levels array
  const uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
  if (levels_needed > tgt.levels_.size()) {
    tgt.levels_.reserve(levels_needed);
    while (tgt.levels_.size() < levels_needed) {
      Level empty_level(tgt.allocator_);
      empty_level.reserve(tgt.get_k());
      tgt.levels_.push_back(std::move(empty_level));
    }
  }

  // holds one downsampled level (size tgt.k) at a time
  Level down_buf(tgt.allocator_);
  down_buf.reserve(tgt.get_k());

  // scratch space reused for every level propagation
  Level scratch_buf(tgt.allocator_);
  scratch_buf.reserve(2 * tgt.get_k());

  uint64_t src_pattern = src.bit_pattern_;
  for (uint8_t src_lvl = 0; src_pattern != 0; ++src_lvl, src_pattern >>= 1) {
    if ((src_pattern & 1) > 0) {
      down_buf.clear();
      scratch_buf.clear();

      // zip with stride, leaving input buffer intact
      zip_buffer_with_stride(src.levels_[src_lvl], down_buf, downsample_factor);

      // propagate-carry; the downsampled items land lg_sample_factor levels higher
      in_place_propagate_carry(src_lvl + lg_sample_factor,
                               down_buf, scratch_buf,
                               false, tgt);
      // update n_ at the end
    }
  }
  tgt.n_ = new_n;
  if ((tgt.get_n() / (2 * tgt.get_k())) != tgt.bit_pattern_) {
    throw std::logic_error("Failed internal consistency check after downsampling_merge()");
  }

  // update min and max values
  // can't just check is_empty() since min and max might not have been set if
  // there were no base buffer items added via update()
  if (tgt.min_value_ == nullptr) {
    tgt.min_value_ = new (tgt.allocator_.allocate(1)) T(*src.min_value_);
  } else {
    if (C()(*src.min_value_, *tgt.min_value_))
      *tgt.min_value_ = conditional_forward<FwdSk>(*src.min_value_);
  }

  if (tgt.max_value_ == nullptr) {
    tgt.max_value_ = new (tgt.allocator_.allocate(1)) T(*src.max_value_);
  } else {
    if (C()(*tgt.max_value_, *src.max_value_))
      *tgt.max_value_ = conditional_forward<FwdSk>(*src.max_value_);
  }
}
1246
+
1247
+
1248
+ template<typename T, typename C, typename A>
1249
+ uint8_t quantiles_sketch<T, C, A>::lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit) {
1250
+ uint8_t pos = starting_bit & 0X3F;
1251
+ uint64_t my_bits = bits >> pos;
1252
+
1253
+ while ((my_bits & static_cast<uint64_t>(1)) != 0) {
1254
+ my_bits >>= 1;
1255
+ pos++;
1256
+ }
1257
+ return pos;
1258
+ }
1259
+
1260
// unique_ptr deleter for a single allocator-managed item:
// destroys the object, then returns its storage to the allocator.
template<typename T, typename C, typename A>
class quantiles_sketch<T, C, A>::item_deleter {
  public:
  item_deleter(const A& allocator): allocator_(allocator) {}
  void operator() (T* ptr) {
    if (ptr != nullptr) {
      ptr->~T();
      allocator_.deallocate(ptr, 1);
    }
  }
  private:
  A allocator_;
};
1273
+
1274
// unique_ptr deleter for an allocator-managed array of num items.
// The 'destroy' flag starts false during deserialization so that, if the
// serde throws before the items are constructed, only raw memory is freed;
// callers flip it via set_destroy(true) once construction is complete.
template<typename T, typename C, typename A>
class quantiles_sketch<T, C, A>::items_deleter {
  public:
  items_deleter(const A& allocator, bool destroy, size_t num): allocator_(allocator), destroy_(destroy), num_(num) {}
  void operator() (T* ptr) {
    if (ptr != nullptr) {
      if (destroy_) {
        for (size_t i = 0; i < num_; ++i) {
          ptr[i].~T();
        }
      }
      allocator_.deallocate(ptr, num_);
    }
  }
  void set_destroy(bool destroy) { destroy_ = destroy; }
  private:
  A allocator_;
  bool destroy_;
  size_t num_;
};
1294
+
1295
+
1296
+ // quantiles_sketch::const_iterator implementation
1297
+
1298
// Iterator position encoding: level_ == -1 means the base buffer; otherwise
// level_ indexes levels_. End state: in exact mode (no levels), index_ == n;
// in estimation mode, level_ == levels_.size() with index_ == 0.
template<typename T, typename C, typename A>
quantiles_sketch<T, C, A>::const_iterator::const_iterator(const Level& base_buffer,
                                                          const std::vector<Level, AllocLevel>& levels,
                                                          uint16_t k,
                                                          uint64_t n,
                                                          bool is_end):
base_buffer_(base_buffer),
levels_(levels),
level_(-1),
index_(0),
bb_count_(compute_base_buffer_items(k, n)),
bit_pattern_(compute_bit_pattern(k, n)),
weight_(1),
k_(k)
{
  if (is_end) {
    // if exact mode: index_ = n is end
    // if sampling, level_ = max_level + 1 and index_ = 0 is end
    if (bit_pattern_ == 0) // only a valid check for exact mode in constructor
      index_ = static_cast<uint32_t>(n);
    else
      level_ = static_cast<int>(levels_.size());
  } else { // find first non-empty item
    // empty base buffer: skip directly to the first occupied level,
    // doubling the weight for each level passed
    if (bb_count_ == 0 && bit_pattern_ > 0) {
      level_ = 0;
      weight_ = 2;
      while ((bit_pattern_ & 0x01) == 0) {
        weight_ *= 2;
        ++level_;
        bit_pattern_ >>= 1;
      }
    }
  }
}
1332
+
1333
// Advances to the next retained item, skipping empty levels and doubling the
// weight each time a level boundary is crossed. Reaching bit_pattern_ == 0
// leaves the iterator in the end state.
template<typename T, typename C, typename A>
typename quantiles_sketch<T, C, A>::const_iterator& quantiles_sketch<T, C, A>::const_iterator::operator++() {
  ++index_;

  // crossed the end of the base buffer (level_ == -1) or of a full level (k_ items)
  if ((level_ == -1 && index_ == base_buffer_.size() && levels_.size() > 0) || (level_ >= 0 && index_ == k_)) { // go to the next non-empty level
    index_ = 0;
    do {
      ++level_;
      if (level_ > 0) bit_pattern_ = bit_pattern_ >> 1;
      if (bit_pattern_ == 0) return *this; // no more occupied levels: end state
      weight_ *= 2;
    } while ((bit_pattern_ & static_cast<uint64_t>(1)) == 0);
  }
  return *this;
}
1348
+
1349
+ template<typename T, typename C, typename A>
1350
+ typename quantiles_sketch<T, C, A>::const_iterator& quantiles_sketch<T, C, A>::const_iterator::operator++(int) {
1351
+ const_iterator tmp(*this);
1352
+ operator++();
1353
+ return tmp;
1354
+ }
1355
+
1356
+ template<typename T, typename C, typename A>
1357
+ bool quantiles_sketch<T, C, A>::const_iterator::operator==(const const_iterator& other) const {
1358
+ return level_ == other.level_ && index_ == other.index_;
1359
+ }
1360
+
1361
+ template<typename T, typename C, typename A>
1362
+ bool quantiles_sketch<T, C, A>::const_iterator::operator!=(const const_iterator& other) const {
1363
+ return !operator==(other);
1364
+ }
1365
+
1366
+ template<typename T, typename C, typename A>
1367
+ std::pair<const T&, const uint64_t> quantiles_sketch<T, C, A>::const_iterator::operator*() const {
1368
+ return std::pair<const T&, const uint64_t>(level_ == -1 ? base_buffer_[index_] : levels_[level_][index_], weight_);
1369
+ }
1370
+
1371
+ } /* namespace datasketches */
1372
+
1373
+ #endif // _QUANTILES_SKETCH_IMPL_HPP_