datasketches 0.2.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (106) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -0,0 +1,1309 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _QUANTILES_SKETCH_IMPL_HPP_
21
+ #define _QUANTILES_SKETCH_IMPL_HPP_
22
+
23
+ #include <cmath>
24
+ #include <algorithm>
25
+ #include <stdexcept>
26
+ #include <iomanip>
27
+ #include <sstream>
28
+
29
+ #include "common_defs.hpp"
30
+ #include "count_zeros.hpp"
31
+ #include "conditional_forward.hpp"
32
+ #include "quantiles_sketch.hpp"
33
+
34
+ namespace datasketches {
35
+
36
+ template<typename T, typename C, typename A>
37
+ quantiles_sketch<T, C, A>::quantiles_sketch(uint16_t k, const A& allocator):
38
+ allocator_(allocator),
39
+ k_(k),
40
+ n_(0),
41
+ bit_pattern_(0),
42
+ base_buffer_(allocator_),
43
+ levels_(allocator_),
44
+ min_value_(nullptr),
45
+ max_value_(nullptr),
46
+ is_sorted_(true)
47
+ {
48
+ check_k(k_);
49
+ base_buffer_.reserve(2 * std::min(quantiles_constants::MIN_K, k));
50
+ }
51
+
52
+ template<typename T, typename C, typename A>
53
+ quantiles_sketch<T, C, A>::quantiles_sketch(const quantiles_sketch& other):
54
+ allocator_(other.allocator_),
55
+ k_(other.k_),
56
+ n_(other.n_),
57
+ bit_pattern_(other.bit_pattern_),
58
+ base_buffer_(other.base_buffer_),
59
+ levels_(other.levels_),
60
+ min_value_(nullptr),
61
+ max_value_(nullptr),
62
+ is_sorted_(other.is_sorted_)
63
+ {
64
+ if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
65
+ if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
66
+ for (size_t i = 0; i < levels_.size(); ++i) {
67
+ if (levels_[i].capacity() != other.levels_[i].capacity()) {
68
+ levels_[i].reserve(other.levels_[i].capacity());
69
+ }
70
+ }
71
+ }
72
+
73
+ template<typename T, typename C, typename A>
74
+ quantiles_sketch<T, C, A>::quantiles_sketch(quantiles_sketch&& other) noexcept:
75
+ allocator_(other.allocator_),
76
+ k_(other.k_),
77
+ n_(other.n_),
78
+ bit_pattern_(other.bit_pattern_),
79
+ base_buffer_(std::move(other.base_buffer_)),
80
+ levels_(std::move(other.levels_)),
81
+ min_value_(other.min_value_),
82
+ max_value_(other.max_value_),
83
+ is_sorted_(other.is_sorted_)
84
+ {
85
+ other.min_value_ = nullptr;
86
+ other.max_value_ = nullptr;
87
+ }
88
+
89
+ template<typename T, typename C, typename A>
90
+ quantiles_sketch<T, C, A>& quantiles_sketch<T, C, A>::operator=(const quantiles_sketch& other) {
91
+ quantiles_sketch<T, C, A> copy(other);
92
+ std::swap(allocator_, copy.allocator_);
93
+ std::swap(k_, copy.k_);
94
+ std::swap(n_, copy.n_);
95
+ std::swap(bit_pattern_, copy.bit_pattern_);
96
+ std::swap(base_buffer_, copy.base_buffer_);
97
+ std::swap(levels_, copy.levels_);
98
+ std::swap(min_value_, copy.min_value_);
99
+ std::swap(max_value_, copy.max_value_);
100
+ std::swap(is_sorted_, copy.is_sorted_);
101
+ return *this;
102
+ }
103
+
104
+ template<typename T, typename C, typename A>
105
+ quantiles_sketch<T, C, A>& quantiles_sketch<T, C, A>::operator=(quantiles_sketch&& other) noexcept {
106
+ std::swap(allocator_, other.allocator_);
107
+ std::swap(k_, other.k_);
108
+ std::swap(n_, other.n_);
109
+ std::swap(bit_pattern_, other.bit_pattern_);
110
+ std::swap(base_buffer_, other.base_buffer_);
111
+ std::swap(levels_, other.levels_);
112
+ std::swap(min_value_, other.min_value_);
113
+ std::swap(max_value_, other.max_value_);
114
+ std::swap(is_sorted_, other.is_sorted_);
115
+ return *this;
116
+ }
117
+
118
+ template<typename T, typename C, typename A>
119
+ quantiles_sketch<T, C, A>::quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
120
+ Level&& base_buffer, VectorLevels&& levels,
121
+ std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value,
122
+ bool is_sorted, const A& allocator) :
123
+ allocator_(allocator),
124
+ k_(k),
125
+ n_(n),
126
+ bit_pattern_(bit_pattern),
127
+ base_buffer_(std::move(base_buffer)),
128
+ levels_(std::move(levels)),
129
+ min_value_(min_value.release()),
130
+ max_value_(max_value.release()),
131
+ is_sorted_(is_sorted)
132
+ {
133
+ uint32_t item_count = base_buffer_.size();
134
+ for (Level& lvl : levels_) {
135
+ item_count += lvl.size();
136
+ }
137
+ if (item_count != compute_retained_items(k_, n_))
138
+ throw std::logic_error("Item count does not match value computed from k, n");
139
+ }
140
+
141
+ template<typename T, typename C, typename A>
142
+ quantiles_sketch<T, C, A>::~quantiles_sketch() {
143
+ if (min_value_ != nullptr) {
144
+ min_value_->~T();
145
+ allocator_.deallocate(min_value_, 1);
146
+ }
147
+ if (max_value_ != nullptr) {
148
+ max_value_->~T();
149
+ allocator_.deallocate(max_value_, 1);
150
+ }
151
+ }
152
+
153
+ template<typename T, typename C, typename A>
154
+ template<typename FwdT>
155
+ void quantiles_sketch<T, C, A>::update(FwdT&& item) {
156
+ if (!check_update_value(item)) { return; }
157
+ if (is_empty()) {
158
+ min_value_ = new (allocator_.allocate(1)) T(item);
159
+ max_value_ = new (allocator_.allocate(1)) T(item);
160
+ } else {
161
+ if (C()(item, *min_value_)) *min_value_ = item;
162
+ if (C()(*max_value_, item)) *max_value_ = item;
163
+ }
164
+
165
+ // if exceed capacity, grow until size 2k -- assumes eager processing
166
+ if (base_buffer_.size() + 1 > base_buffer_.capacity())
167
+ grow_base_buffer();
168
+
169
+ base_buffer_.push_back(std::forward<FwdT>(item));
170
+ ++n_;
171
+
172
+ if (base_buffer_.size() > 1)
173
+ is_sorted_ = false;
174
+
175
+ if (base_buffer_.size() == 2 * k_)
176
+ process_full_base_buffer();
177
+ }
178
+
179
+ template<typename T, typename C, typename A>
180
+ template<typename FwdSk>
181
+ void quantiles_sketch<T, C, A>::merge(FwdSk&& other) {
182
+ if (other.is_empty()) {
183
+ return; // nothing to do
184
+ } else if (!other.is_estimation_mode()) {
185
+ // other is exact, stream in regardless of k
186
+ for (auto item : other.base_buffer_) {
187
+ update(conditional_forward<FwdSk>(item));
188
+ }
189
+ return; // we're done
190
+ }
191
+
192
+ // we know other has data and is in estimation mode
193
+ if (is_estimation_mode()) {
194
+ if (k_ == other.get_k()) {
195
+ standard_merge(*this, other);
196
+ } else if (k_ > other.get_k()) {
197
+ quantiles_sketch sk_copy(other);
198
+ downsampling_merge(sk_copy, *this);
199
+ *this = sk_copy;
200
+ } else { // k_ < other.get_k()
201
+ downsampling_merge(*this, other);
202
+ }
203
+ } else {
204
+ // exact or empty
205
+ quantiles_sketch sk_copy(other);
206
+ if (k_ <= other.get_k()) {
207
+ if (!is_empty()) {
208
+ for (uint16_t i = 0; i < base_buffer_.size(); ++i) {
209
+ sk_copy.update(std::move(base_buffer_[i]));
210
+ }
211
+ }
212
+ } else { // k_ > other.get_k()
213
+ downsampling_merge(sk_copy, *this);
214
+ }
215
+ *this = sk_copy;
216
+ }
217
+ }
218
+
219
+ template<typename T, typename C, typename A>
220
+ template<typename SerDe>
221
+ void quantiles_sketch<T, C, A>::serialize(std::ostream& os, const SerDe& serde) const {
222
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
223
+ write(os, preamble_longs);
224
+ const uint8_t ser_ver = SERIAL_VERSION;
225
+ write(os, ser_ver);
226
+ const uint8_t family = FAMILY;
227
+ write(os, family);
228
+
229
+ // side-effect: sort base buffer since always compact
230
+ // can't set is_sorted_ since const method
231
+ std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
232
+
233
+ // empty, ordered, compact are valid flags
234
+ const uint8_t flags_byte(
235
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
236
+ | (1 << flags::IS_SORTED) // always sorted as side effect noted above
237
+ | (1 << flags::IS_COMPACT) // always compact -- could be optional for numeric types?
238
+ );
239
+ write(os, flags_byte);
240
+ write(os, k_);
241
+ uint16_t unused = 0;
242
+ write(os, unused);
243
+
244
+ if (!is_empty()) {
245
+ write(os, n_);
246
+
247
+ // min and max
248
+ serde.serialize(os, min_value_, 1);
249
+ serde.serialize(os, max_value_, 1);
250
+
251
+ // base buffer items
252
+ serde.serialize(os, base_buffer_.data(), static_cast<unsigned>(base_buffer_.size()));
253
+
254
+ // levels, only when data is present
255
+ for (Level lvl : levels_) {
256
+ if (lvl.size() > 0)
257
+ serde.serialize(os, lvl.data(), static_cast<unsigned>(lvl.size()));
258
+ }
259
+ }
260
+ }
261
+
262
+ template<typename T, typename C, typename A>
263
+ template<typename SerDe>
264
+ auto quantiles_sketch<T, C, A>::serialize(unsigned header_size_bytes, const SerDe& serde) const -> vector_bytes {
265
+ const size_t size = get_serialized_size_bytes(serde) + header_size_bytes;
266
+ vector_bytes bytes(size, 0, allocator_);
267
+ uint8_t* ptr = bytes.data() + header_size_bytes;
268
+ const uint8_t* end_ptr = ptr + size;
269
+
270
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
271
+ ptr += copy_to_mem(preamble_longs, ptr);
272
+ const uint8_t ser_ver = SERIAL_VERSION;
273
+ ptr += copy_to_mem(ser_ver, ptr);
274
+ const uint8_t family = FAMILY;
275
+ ptr += copy_to_mem(family, ptr);
276
+
277
+ // side-effect: sort base buffer since always compact
278
+ // can't set is_sorted_ since const method
279
+ std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
280
+
281
+ // empty, ordered, compact are valid flags
282
+ const uint8_t flags_byte(
283
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
284
+ | (1 << flags::IS_SORTED) // always sorted as side effect noted above
285
+ | (1 << flags::IS_COMPACT) // always compact
286
+ );
287
+ ptr += copy_to_mem(flags_byte, ptr);
288
+ ptr += copy_to_mem(k_, ptr);
289
+ ptr += sizeof(uint16_t); // 2 unused bytes
290
+
291
+ if (!is_empty()) {
292
+
293
+ ptr += copy_to_mem(n_, ptr);
294
+
295
+ // min and max
296
+ ptr += serde.serialize(ptr, end_ptr - ptr, min_value_, 1);
297
+ ptr += serde.serialize(ptr, end_ptr - ptr, max_value_, 1);
298
+
299
+ // base buffer items
300
+ if (base_buffer_.size() > 0)
301
+ ptr += serde.serialize(ptr, end_ptr - ptr, base_buffer_.data(), static_cast<unsigned>(base_buffer_.size()));
302
+
303
+ // levels, only when data is present
304
+ for (Level lvl : levels_) {
305
+ if (lvl.size() > 0)
306
+ ptr += serde.serialize(ptr, end_ptr - ptr, lvl.data(), static_cast<unsigned>(lvl.size()));
307
+ }
308
+ }
309
+
310
+ return bytes;
311
+ }
312
+
313
+ template<typename T, typename C, typename A>
314
+ template<typename SerDe>
315
+ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde, const A &allocator) -> quantiles_sketch {
316
+ const auto preamble_longs = read<uint8_t>(is);
317
+ const auto serial_version = read<uint8_t>(is);
318
+ const auto family_id = read<uint8_t>(is);
319
+ const auto flags_byte = read<uint8_t>(is);
320
+ const auto k = read<uint16_t>(is);
321
+ read<uint16_t>(is); // unused
322
+
323
+ check_k(k);
324
+ check_serial_version(serial_version); // a little redundant with the header check
325
+ check_family_id(family_id);
326
+ check_header_validity(preamble_longs, flags_byte, serial_version);
327
+
328
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
329
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
330
+ if (is_empty) {
331
+ return quantiles_sketch(k, allocator);
332
+ }
333
+
334
+ const auto items_seen = read<uint64_t>(is);
335
+
336
+ const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
337
+ const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
338
+
339
+ A alloc(allocator);
340
+ auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
341
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
342
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
343
+ std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
344
+ std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
345
+
346
+ serde.deserialize(is, min_value_buffer.get(), 1);
347
+ // serde call did not throw, repackage with destrtuctor
348
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
349
+ serde.deserialize(is, max_value_buffer.get(), 1);
350
+ // serde call did not throw, repackage with destrtuctor
351
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
352
+
353
+ if (serial_version == 1) {
354
+ read<uint64_t>(is); // no longer used
355
+ }
356
+
357
+ // allocate buffers as needed
358
+ const uint8_t levels_needed = compute_levels_needed(k, items_seen);
359
+ const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
360
+
361
+ // Java provides a compact storage layout for a sketch of primitive doubles. The C++ version
362
+ // does not currently operate sketches in compact mode, but will only serialize as compact
363
+ // to avoid complications around serialization of empty values for generic type T. We also need
364
+ // to be able to ingest either serialized format from Java.
365
+
366
+ // load base buffer
367
+ const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
368
+ uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
369
+ Level base_buffer = deserialize_array(is, bb_items, 2 * k, serde, allocator);
370
+ if (items_to_read > bb_items) { // either equal or greater, never read fewer items
371
+ // read remaining items, but don't store them
372
+ deserialize_array(is, items_to_read - bb_items, items_to_read - bb_items, serde, allocator);
373
+ }
374
+
375
+ // populate vector of Levels directly
376
+ VectorLevels levels(allocator);
377
+ levels.reserve(levels_needed);
378
+ if (levels_needed > 0) {
379
+ uint64_t working_pattern = bit_pattern;
380
+ for (size_t i = 0; i < levels_needed; ++i, working_pattern >>= 1) {
381
+ if ((working_pattern & 0x01) == 1) {
382
+ Level level = deserialize_array(is, k, k, serde, allocator);
383
+ levels.push_back(std::move(level));
384
+ } else {
385
+ Level level(allocator);
386
+ level.reserve(k);
387
+ levels.push_back(std::move(level));
388
+ }
389
+ }
390
+ }
391
+
392
+ return quantiles_sketch(k, items_seen, bit_pattern,
393
+ std::move(base_buffer), std::move(levels), std::move(min_value), std::move(max_value), is_sorted, allocator);
394
+ }
395
+
396
+ template<typename T, typename C, typename A>
397
+ template<typename SerDe>
398
+ auto quantiles_sketch<T, C, A>::deserialize_array(std::istream& is, uint32_t num_items, uint32_t capacity, const SerDe& serde, const A& allocator) -> Level {
399
+ A alloc(allocator);
400
+ std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(allocator, false, num_items));
401
+ serde.deserialize(is, items.get(), num_items);
402
+ // serde did not throw, enable destructors
403
+ items.get_deleter().set_destroy(true);
404
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
405
+
406
+ // succesfully read, now put into a Level
407
+ Level level(allocator);
408
+ level.reserve(capacity);
409
+ level.insert(level.begin(),
410
+ std::make_move_iterator(items.get()),
411
+ std::make_move_iterator(items.get() + num_items));
412
+ return level;
413
+ }
414
+
415
+ template<typename T, typename C, typename A>
416
+ template<typename SerDe>
417
+ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, const SerDe& serde, const A &allocator) -> quantiles_sketch {
418
+ ensure_minimum_memory(size, 8);
419
+ const char* ptr = static_cast<const char*>(bytes);
420
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
421
+
422
+ uint8_t preamble_longs;
423
+ ptr += copy_from_mem(ptr, preamble_longs);
424
+ uint8_t serial_version;
425
+ ptr += copy_from_mem(ptr, serial_version);
426
+ uint8_t family_id;
427
+ ptr += copy_from_mem(ptr, family_id);
428
+ uint8_t flags_byte;
429
+ ptr += copy_from_mem(ptr, flags_byte);
430
+ uint16_t k;
431
+ ptr += copy_from_mem(ptr, k);
432
+ uint16_t unused;
433
+ ptr += copy_from_mem(ptr, unused);
434
+
435
+ check_k(k);
436
+ check_serial_version(serial_version); // a little redundant with the header check
437
+ check_family_id(family_id);
438
+ check_header_validity(preamble_longs, flags_byte, serial_version);
439
+
440
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
441
+ if (is_empty) {
442
+ return quantiles_sketch(k, allocator);
443
+ }
444
+
445
+ ensure_minimum_memory(size, 16);
446
+ uint64_t items_seen;
447
+ ptr += copy_from_mem(ptr, items_seen);
448
+
449
+ const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
450
+ const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
451
+
452
+ A alloc(allocator);
453
+ auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
454
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
455
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
456
+ std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
457
+ std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
458
+
459
+ ptr += serde.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
460
+ // serde call did not throw, repackage with destrtuctor
461
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
462
+ ptr += serde.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
463
+ // serde call did not throw, repackage with destrtuctor
464
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
465
+
466
+ if (serial_version == 1) {
467
+ uint64_t unused_long;
468
+ ptr += copy_from_mem(ptr, unused_long); // no longer used
469
+ }
470
+
471
+ // allocate buffers as needed
472
+ const uint8_t levels_needed = compute_levels_needed(k, items_seen);
473
+ const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
474
+
475
+ // Java provides a compact storage layout for a sketch of primitive doubles. The C++ version
476
+ // does not currently operate sketches in compact mode, but will only serialize as compact
477
+ // to avoid complications around serialization of empty values for generic type T. We also need
478
+ // to be able to ingest either serialized format from Java.
479
+
480
+ // load base buffer
481
+ const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
482
+ uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
483
+ auto base_buffer_pair = deserialize_array(ptr, end_ptr - ptr, bb_items, 2 * k, serde, allocator);
484
+ ptr += base_buffer_pair.second;
485
+ if (items_to_read > bb_items) { // either equal or greater, never read fewer items
486
+ // read remaining items, only use to advance the pointer
487
+ auto extras = deserialize_array(ptr, end_ptr - ptr, items_to_read - bb_items, items_to_read - bb_items, serde, allocator);
488
+ ptr += extras.second;
489
+ }
490
+
491
+ // populate vector of Levels directly
492
+ VectorLevels levels(allocator);
493
+ levels.reserve(levels_needed);
494
+ if (levels_needed > 0) {
495
+ uint64_t working_pattern = bit_pattern;
496
+ for (size_t i = 0; i < levels_needed; ++i, working_pattern >>= 1) {
497
+
498
+ if ((working_pattern & 0x01) == 1) {
499
+ auto pair = deserialize_array(ptr, end_ptr - ptr, k, k, serde, allocator);
500
+ ptr += pair.second;
501
+ levels.push_back(std::move(pair.first));
502
+ } else {
503
+ Level level(allocator);
504
+ level.reserve(k);
505
+ levels.push_back(std::move(level));
506
+ }
507
+ }
508
+ }
509
+
510
+ return quantiles_sketch(k, items_seen, bit_pattern,
511
+ std::move(base_buffer_pair.first), std::move(levels), std::move(min_value), std::move(max_value), is_sorted, allocator);
512
+ }
513
+
514
+ template<typename T, typename C, typename A>
515
+ template<typename SerDe>
516
+ auto quantiles_sketch<T, C, A>::deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capacity, const SerDe& serde, const A& allocator)
517
+ -> std::pair<Level, size_t> {
518
+ const char* ptr = static_cast<const char*>(bytes);
519
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
520
+ A alloc(allocator);
521
+ std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(allocator, false, num_items));
522
+ ptr += serde.deserialize(ptr, end_ptr - ptr, items.get(), num_items);
523
+ // serde did not throw, enable destructors
524
+ items.get_deleter().set_destroy(true);
525
+
526
+ // succesfully read, now put into a Level
527
+ Level level(allocator);
528
+ level.reserve(capacity);
529
+ level.insert(level.begin(),
530
+ std::make_move_iterator(items.get()),
531
+ std::make_move_iterator(items.get() + num_items));
532
+
533
+ return std::pair<Level, size_t>(std::move(level), ptr - static_cast<const char*>(bytes));
534
+ }
535
+
536
+ template<typename T, typename C, typename A>
537
+ string<A> quantiles_sketch<T, C, A>::to_string(bool print_levels, bool print_items) const {
538
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
539
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
540
+ std::ostringstream os;
541
+ os << "### Quantiles Sketch summary:" << std::endl;
542
+ os << " K : " << k_ << std::endl;
543
+ os << " N : " << n_ << std::endl;
544
+ os << " Epsilon : " << std::setprecision(3) << get_normalized_rank_error(false) * 100 << "%" << std::endl;
545
+ os << " Epsilon PMF : " << get_normalized_rank_error(true) * 100 << "%" << std::endl;
546
+ os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
547
+ os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
548
+ os << " Levels (w/o BB): " << levels_.size() << std::endl;
549
+ os << " Used Levels : " << compute_valid_levels(bit_pattern_) << std::endl;
550
+ os << " Retained items : " << get_num_retained() << std::endl;
551
+ if (!is_empty()) {
552
+ os << " Min value : " << *min_value_ << std::endl;
553
+ os << " Max value : " << *max_value_ << std::endl;
554
+ }
555
+ os << "### End sketch summary" << std::endl;
556
+
557
+ if (print_levels) {
558
+ os << "### Quantiles Sketch levels:" << std::endl;
559
+ os << " index: items in use" << std::endl;
560
+ os << " BB: " << base_buffer_.size() << std::endl;
561
+ for (uint8_t i = 0; i < levels_.size(); i++) {
562
+ os << " " << static_cast<unsigned int>(i) << ": " << levels_[i].size() << std::endl;
563
+ }
564
+ os << "### End sketch levels" << std::endl;
565
+ }
566
+
567
+ if (print_items) {
568
+ os << "### Quantiles Sketch data:" << std::endl;
569
+ uint8_t level = 0;
570
+ os << " BB:" << std::endl;
571
+ for (const T& item : base_buffer_) {
572
+ os << " " << std::to_string(item) << std::endl;
573
+ }
574
+ for (uint8_t i = 0; i < levels_.size(); ++i) {
575
+ os << " level " << static_cast<unsigned int>(level) << ":" << std::endl;
576
+ for (const T& item : levels_[i]) {
577
+ os << " " << std::to_string(item) << std::endl;
578
+ }
579
+ }
580
+ os << "### End sketch data" << std::endl;
581
+ }
582
+ return string<A>(os.str().c_str(), allocator_);
583
+ }
584
+
585
+ template<typename T, typename C, typename A>
586
+ uint16_t quantiles_sketch<T, C, A>::get_k() const {
587
+ return k_;
588
+ }
589
+
590
+ template<typename T, typename C, typename A>
591
+ uint64_t quantiles_sketch<T, C, A>::get_n() const {
592
+ return n_;
593
+ }
594
+
595
+ template<typename T, typename C, typename A>
596
+ bool quantiles_sketch<T, C, A>::is_empty() const {
597
+ return n_ == 0;
598
+ }
599
+
600
+ template<typename T, typename C, typename A>
601
+ bool quantiles_sketch<T, C, A>::is_estimation_mode() const {
602
+ return bit_pattern_ != 0;
603
+ }
604
+
605
+ template<typename T, typename C, typename A>
606
+ uint32_t quantiles_sketch<T, C, A>::get_num_retained() const {
607
+ return compute_retained_items(k_, n_);
608
+ }
609
+
610
+ template<typename T, typename C, typename A>
611
+ const T& quantiles_sketch<T, C, A>::get_min_value() const {
612
+ if (is_empty()) return get_invalid_value();
613
+ return *min_value_;
614
+ }
615
+
616
+ template<typename T, typename C, typename A>
617
+ const T& quantiles_sketch<T, C, A>::get_max_value() const {
618
+ if (is_empty()) return get_invalid_value();
619
+ return *max_value_;
620
+ }
621
+
622
+ template<typename T, typename C, typename A>
623
+ C quantiles_sketch<T, C, A>::get_comparator() const {
624
+ return C();
625
+ }
626
+
627
+ // implementation for fixed-size arithmetic types (integral and floating point)
628
+ template<typename T, typename C, typename A>
629
+ template<typename SerDe, typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
630
+ size_t quantiles_sketch<T, C, A>::get_serialized_size_bytes(const SerDe&) const {
631
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
632
+ return DATA_START + ((get_num_retained() + 2) * sizeof(TT));
633
+ }
634
+
635
+ // implementation for all other types
636
+ template<typename T, typename C, typename A>
637
+ template<typename SerDe, typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
638
+ size_t quantiles_sketch<T, C, A>::get_serialized_size_bytes(const SerDe& serde) const {
639
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
640
+ size_t size = DATA_START;
641
+ size += serde.size_of_item(*min_value_);
642
+ size += serde.size_of_item(*max_value_);
643
+ for (auto it: *this) size += serde.size_of_item(it.first);
644
+ return size;
645
+ }
646
+
647
+ template<typename T, typename C, typename A>
648
+ double quantiles_sketch<T, C, A>::get_normalized_rank_error(bool is_pmf) const {
649
+ return get_normalized_rank_error(k_, is_pmf);
650
+ }
651
+
652
+ template<typename T, typename C, typename A>
653
+ double quantiles_sketch<T, C, A>::get_normalized_rank_error(uint16_t k, bool is_pmf) {
654
+ return is_pmf
655
+ ? 1.854 / std::pow(k, 0.9657)
656
+ : 1.576 / std::pow(k, 0.9726);
657
+ }
658
+
659
// Builds a flattened, weight-annotated view of all retained items,
// optionally converted to a cumulative distribution.
template<typename T, typename C, typename A>
template<bool inclusive>
quantile_sketch_sorted_view<T, C, A> quantiles_sketch<T, C, A>::get_sorted_view(bool cumulative) const {
  // allow side-effect of sorting the base buffer; can't set the flag since
  // this is a const method
  // NOTE(review): because is_sorted_ cannot be updated here, a later call may
  // re-sort an already-sorted base buffer -- harmless but redundant work.
  if (!is_sorted_) {
    std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
  }
  quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);

  // Base buffer items carry weight 1; each level i carries weight 2^(i+1).
  uint64_t weight = 1;
  view.add(base_buffer_.begin(), base_buffer_.end(), weight);
  for (auto& level : levels_) {
    weight <<= 1;
    if (level.empty()) { continue; }
    view.add(level.begin(), level.end(), weight);
  }

  if (cumulative) view.template convert_to_cummulative<inclusive>();
  return view;
}
680
+
681
// Returns an approximate quantile for the given normalized rank in [0, 1].
// Throws std::invalid_argument for out-of-range ranks.
template<typename T, typename C, typename A>
template<bool inclusive>
auto quantiles_sketch<T, C, A>::get_quantile(double rank) const -> quantile_return_type {
  if (is_empty()) return get_invalid_value();
  // Exact answers at the extremes: min and max are tracked separately.
  if (rank == 0.0) return *min_value_;
  if (rank == 1.0) return *max_value_;
  if ((rank < 0.0) || (rank > 1.0)) {
    throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
  }
  // possible side-effect: sorting base buffer
  return get_sorted_view<inclusive>(true).get_quantile(rank);
}
693
+
694
+ template<typename T, typename C, typename A>
695
+ template<bool inclusive>
696
+ std::vector<T, A> quantiles_sketch<T, C, A>::get_quantiles(const double* ranks, uint32_t size) const {
697
+ std::vector<T, A> quantiles(allocator_);
698
+ if (is_empty()) return quantiles;
699
+ quantiles.reserve(size);
700
+
701
+ // possible side-effect: sorting base buffer
702
+ auto view = get_sorted_view<inclusive>(true);
703
+
704
+ for (uint32_t i = 0; i < size; ++i) {
705
+ const double rank = ranks[i];
706
+ if ((rank < 0.0) || (rank > 1.0)) {
707
+ throw std::invalid_argument("rank cannot be less than zero or greater than 1.0");
708
+ }
709
+ if (rank == 0.0) quantiles.push_back(*min_value_);
710
+ else if (rank == 1.0) quantiles.push_back(*max_value_);
711
+ else {
712
+ quantiles.push_back(view.get_quantile(rank));
713
+ }
714
+ }
715
+ return quantiles;
716
+ }
717
+
718
+ template<typename T, typename C, typename A>
719
+ template<bool inclusive>
720
+ std::vector<T, A> quantiles_sketch<T, C, A>::get_quantiles(uint32_t num) const {
721
+ if (is_empty()) return std::vector<T, A>(allocator_);
722
+ if (num == 0) {
723
+ throw std::invalid_argument("num must be > 0");
724
+ }
725
+ vector_double fractions(num, 0, allocator_);
726
+ fractions[0] = 0.0;
727
+ for (size_t i = 1; i < num; i++) {
728
+ fractions[i] = static_cast<double>(i) / (num - 1);
729
+ }
730
+ if (num > 1) {
731
+ fractions[num - 1] = 1.0;
732
+ }
733
+ return get_quantiles<inclusive>(fractions.data(), num);
734
+ }
735
+
736
// Returns the approximate normalized rank of value: the weighted fraction of
// items below (exclusive) or at-or-below (inclusive) the given value.
// Returns NaN for an empty sketch.
template<typename T, typename C, typename A>
template<bool inclusive>
double quantiles_sketch<T, C, A>::get_rank(const T& value) const {
  if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
  uint64_t weight = 1;
  uint64_t total = 0;
  // Base buffer items each carry weight 1. The buffer may be unsorted,
  // so every item must be examined.
  for (const T &item: base_buffer_) {
    if (inclusive ? !C()(value, item) : C()(item, value))
      total += weight;
  }

  // Level 0 carries weight 2; each higher level doubles it again.
  weight *= 2;
  for (uint8_t level = 0; level < levels_.size(); ++level, weight *= 2) {
    if (levels_[level].empty()) { continue; }
    const T* data = levels_[level].data();
    // A non-empty level holds exactly k_ items (class invariant).
    for (uint16_t i = 0; i < k_; ++i) {
      if (inclusive ? !C()(value, data[i]) : C()(data[i], value))
        total += weight;
      else
        break; // levels are sorted, no point comparing further
    }
  }
  return (double) total / n_;
}
760
+
761
+ template<typename T, typename C, typename A>
762
+ template<bool inclusive>
763
+ auto quantiles_sketch<T, C, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
764
+ auto buckets = get_CDF<inclusive>(split_points, size);
765
+ if (is_empty()) return buckets;
766
+ for (uint32_t i = size; i > 0; --i) {
767
+ buckets[i] -= buckets[i - 1];
768
+ }
769
+ return buckets;
770
+ }
771
+
772
+ template<typename T, typename C, typename A>
773
+ template<bool inclusive>
774
+ auto quantiles_sketch<T, C, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
775
+ vector_double buckets(allocator_);
776
+ if (is_empty()) return buckets;
777
+ check_split_points(split_points, size);
778
+ buckets.reserve(size + 1);
779
+ for (uint32_t i = 0; i < size; ++i) buckets.push_back(get_rank<inclusive>(split_points[i]));
780
+ buckets.push_back(1);
781
+ return buckets;
782
+ }
783
+
784
+ template<typename T, typename C, typename A>
785
+ uint32_t quantiles_sketch<T, C, A>::compute_retained_items(const uint16_t k, const uint64_t n) {
786
+ uint32_t bb_count = compute_base_buffer_items(k, n);
787
+ uint64_t bit_pattern = compute_bit_pattern(k, n);
788
+ uint32_t valid_levels = compute_valid_levels(bit_pattern);
789
+ return bb_count + (k * valid_levels);
790
+ }
791
+
792
+ template<typename T, typename C, typename A>
793
+ uint32_t quantiles_sketch<T, C, A>::compute_base_buffer_items(const uint16_t k, const uint64_t n) {
794
+ return n % (static_cast<uint64_t>(2) * k);
795
+ }
796
+
797
+ template<typename T, typename C, typename A>
798
+ uint64_t quantiles_sketch<T, C, A>::compute_bit_pattern(const uint16_t k, const uint64_t n) {
799
+ return n / (static_cast<uint64_t>(2) * k);
800
+ }
801
+
802
+ template<typename T, typename C, typename A>
803
+ uint32_t quantiles_sketch<T, C, A>::compute_valid_levels(const uint64_t bit_pattern) {
804
+ // TODO: Java's Long.bitCount() probably uses a better method
805
+ uint64_t bp = bit_pattern;
806
+ uint32_t count = 0;
807
+ while (bp > 0) {
808
+ if ((bp & 0x01) == 1) ++count;
809
+ bp >>= 1;
810
+ }
811
+ return count;
812
+ }
813
+
814
+ template<typename T, typename C, typename A>
815
+ uint8_t quantiles_sketch<T, C, A>::compute_levels_needed(const uint16_t k, const uint64_t n) {
816
+ return static_cast<uint8_t>(64U) - count_leading_zeros_in_u64(n / (2 * k));
817
+ }
818
+
819
+ template<typename T, typename C, typename A>
820
+ void quantiles_sketch<T, C, A>::check_k(uint16_t k) {
821
+ if (k < quantiles_constants::MIN_K || k > quantiles_constants::MAX_K || (k & (k - 1)) != 0) {
822
+ throw std::invalid_argument("k must be a power of 2 that is >= "
823
+ + std::to_string(quantiles_constants::MIN_K) + " and <= "
824
+ + std::to_string(quantiles_constants::MAX_K) + ". Found: " + std::to_string(k));
825
+ }
826
+ }
827
+
828
+ template<typename T, typename C, typename A>
829
+ void quantiles_sketch<T, C, A>::check_serial_version(uint8_t serial_version) {
830
+ if (serial_version == SERIAL_VERSION || serial_version == SERIAL_VERSION_1 || serial_version == SERIAL_VERSION_2)
831
+ return;
832
+ else
833
+ throw std::invalid_argument("Possible corruption. Unrecognized serialization version: " + std::to_string(serial_version));
834
+ }
835
+
836
+ template<typename T, typename C, typename A>
837
+ void quantiles_sketch<T, C, A>::check_family_id(uint8_t family_id) {
838
+ if (family_id == FAMILY)
839
+ return;
840
+ else
841
+ throw std::invalid_argument("Possible corruption. Family id does not indicate quantiles sketch: " + std::to_string(family_id));
842
+ }
843
+
844
+ template<typename T, typename C, typename A>
845
+ void quantiles_sketch<T, C, A>::check_header_validity(uint8_t preamble_longs, uint8_t flags_byte, uint8_t serial_version) {
846
+ bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
847
+ bool compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
848
+
849
+ uint8_t sw = (compact ? 1 : 0) + (2 * (empty ? 1 : 0))
850
+ + (4 * (serial_version & 0xF)) + (32 * (preamble_longs & 0x3F));
851
+ bool valid = true;
852
+
853
+ switch (sw) { // exhaustive list and description of all valid cases
854
+ case 38 : break; //!compact, empty, serVer = 1, preLongs = 1; always stored as not compact
855
+ case 164 : break; //!compact, !empty, serVer = 1, preLongs = 5; always stored as not compact
856
+ case 42 : break; //!compact, empty, serVer = 2, preLongs = 1; always stored as compact
857
+ case 72 : break; //!compact, !empty, serVer = 2, preLongs = 2; always stored as compact
858
+ case 47 : break; // compact, empty, serVer = 3, preLongs = 1;
859
+ case 46 : break; //!compact, empty, serVer = 3, preLongs = 1;
860
+ case 79 : break; // compact, empty, serVer = 3, preLongs = 2;
861
+ case 78 : break; //!compact, empty, serVer = 3, preLongs = 2;
862
+ case 77 : break; // compact, !empty, serVer = 3, preLongs = 2;
863
+ case 76 : break; //!compact, !empty, serVer = 3, preLongs = 2;
864
+ default : //all other case values are invalid
865
+ valid = false;
866
+ }
867
+
868
+ if (!valid) {
869
+ std::ostringstream os;
870
+ os << "Possible sketch corruption. Inconsistent state: "
871
+ << "preamble_longs = " << preamble_longs
872
+ << ", empty = " << (empty ? "true" : "false")
873
+ << ", serialization_version = " << serial_version
874
+ << ", compact = " << (compact ? "true" : "false");
875
+ throw std::invalid_argument(os.str());
876
+ }
877
+ }
878
+
879
+ template <typename T, typename C, typename A>
880
+ typename quantiles_sketch<T, C, A>::const_iterator quantiles_sketch<T, C, A>::begin() const {
881
+ return quantiles_sketch<T, C, A>::const_iterator(base_buffer_, levels_, k_, n_, false);
882
+ }
883
+
884
+ template <typename T, typename C, typename A>
885
+ typename quantiles_sketch<T, C, A>::const_iterator quantiles_sketch<T, C, A>::end() const {
886
+ return quantiles_sketch<T, C, A>::const_iterator(base_buffer_, levels_, k_, n_, true);
887
+ }
888
+
889
+ template<typename T, typename C, typename A>
890
+ void quantiles_sketch<T, C, A>::grow_base_buffer() {
891
+ size_t new_size = std::max(std::min(static_cast<size_t>(2 * k_), 2 * base_buffer_.size()), static_cast<size_t>(1));
892
+ base_buffer_.reserve(new_size);
893
+ }
894
+
895
+ template<typename T, typename C, typename A>
896
+ void quantiles_sketch<T, C, A>::process_full_base_buffer() {
897
+ // make sure there will be enough levels for the propagation
898
+ grow_levels_if_needed(); // note: n_ was already incremented by update() before this
899
+
900
+ std::sort(base_buffer_.begin(), base_buffer_.end(), C());
901
+ in_place_propagate_carry(0,
902
+ levels_[0], // unused here, but 0 is guaranteed to exist
903
+ base_buffer_,
904
+ true, *this);
905
+ base_buffer_.clear();
906
+ is_sorted_ = true;
907
+ if (n_ / (2 * k_) != bit_pattern_) {
908
+ throw std::logic_error("Internal error: n / 2k (" + std::to_string(n_ / 2 * k_)
909
+ + " != bit_pattern " + std::to_string(bit_pattern_));
910
+ }
911
+ }
912
+
913
+ template<typename T, typename C, typename A>
914
+ bool quantiles_sketch<T, C, A>::grow_levels_if_needed() {
915
+ uint8_t levels_needed = compute_levels_needed(k_, n_);
916
+ if (levels_needed == 0)
917
+ return false; // don't need levels and might have small base buffer. Possible during merges.
918
+
919
+ // from here on, assume full size base buffer (2k) and at least one additional level
920
+ if (levels_needed <= levels_.size())
921
+ return false;
922
+
923
+ Level empty_level(allocator_);
924
+ empty_level.reserve(k_);
925
+ levels_.push_back(std::move(empty_level));
926
+ return true;
927
+ }
928
+
929
// Propagates a carry from starting_level upward: the incoming items are
// zipped/merged through each occupied level until the first free level
// (lowest zero bit at or above starting_level) absorbs them. Afterwards the
// bit pattern is advanced by a binary ripple-carry add.
template<typename T, typename C, typename A>
template<typename FwdV>
void quantiles_sketch<T, C, A>::in_place_propagate_carry(uint8_t starting_level,
                                                         FwdV&& buf_size_k, Level& buf_size_2k,
                                                         bool apply_as_update,
                                                         quantiles_sketch& sketch) {
  const uint64_t bit_pattern = sketch.bit_pattern_;
  const int k = sketch.k_;

  uint8_t ending_level = lowest_zero_bit_starting_at(bit_pattern, starting_level);

  if (apply_as_update) {
    // update version of computation
    // its is okay for buf_size_k to be null in this case
    zip_buffer(buf_size_2k, sketch.levels_[ending_level]);
  } else {
    // merge_into version of computation
    for (uint16_t i = 0; i < k; ++i) {
      sketch.levels_[ending_level].push_back(conditional_forward<FwdV>(buf_size_k[i]));
    }
  }

  for (uint64_t lvl = starting_level; lvl < ending_level; lvl++) {
    if ((bit_pattern & (static_cast<uint64_t>(1) << lvl)) == 0) {
      throw std::logic_error("unexpected empty level in bit_pattern");
    }
    // Merge the occupied level with the accumulated carry into the 2k
    // scratch buffer, then zip it back down into the target level.
    merge_two_size_k_buffers(
        sketch.levels_[lvl],
        sketch.levels_[ending_level],
        buf_size_2k);
    sketch.levels_[lvl].clear();
    sketch.levels_[ending_level].clear();
    zip_buffer(buf_size_2k, sketch.levels_[ending_level]);
  } // end of loop over lower levels

  // update bit pattern with binary-arithmetic ripple carry
  sketch.bit_pattern_ = bit_pattern + (static_cast<uint64_t>(1) << starting_level);
}
967
+
968
// Moves every other item (random starting offset 0 or 1) from a 2k-item
// input into an empty k-capacity output, then clears the input. The random
// offset keeps the compaction statistically unbiased.
template<typename T, typename C, typename A>
void quantiles_sketch<T, C, A>::zip_buffer(Level& buf_in, Level& buf_out) {
#ifdef QUANTILES_VALIDATION
  // Deterministic alternating offset for reproducible validation runs.
  static uint32_t next_offset = 0;
  uint32_t rand_offset = next_offset;
  next_offset = 1 - next_offset;
#else
  uint32_t rand_offset = random_bit();
#endif
  if ((buf_in.size() != 2 * buf_out.capacity())
      || (buf_out.size() > 0)) {
    throw std::logic_error("zip_buffer requires buf_in.size() == "
        "2*buf_out.capacity() and empty buf_out");
  }

  size_t k = buf_out.capacity();
  for (uint32_t i = rand_offset, o = 0; o < k; i += 2, ++o) {
    buf_out.push_back(std::move(buf_in[i]));
  }
  buf_in.clear();
}
989
+
990
// Downsamples buf_in by taking every stride-th item (random starting offset
// in [0, stride)) into the empty k-capacity output. Unlike zip_buffer(), the
// input is left intact because the caller does not own it.
template<typename T, typename C, typename A>
template<typename FwdV>
void quantiles_sketch<T, C, A>::zip_buffer_with_stride(FwdV&& buf_in, Level& buf_out, uint16_t stride) {
  // Random offset in range [0, stride)
  std::uniform_int_distribution<uint16_t> dist(0, stride - 1);
  uint16_t rand_offset = dist(random_utils::rand);

  if ((buf_in.size() != stride * buf_out.capacity())
      || (buf_out.size() > 0)) {
    throw std::logic_error("zip_buffer_with_stride requires buf_in.size() == "
        "stride*buf_out.capacity() and empty buf_out");
  }

  size_t k = buf_out.capacity();
  for (uint16_t i = rand_offset, o = 0; o < k; i += stride, ++o) {
    buf_out.push_back(conditional_forward<FwdV>(buf_in[i]));
  }
  // do not clear input buffer
}
1009
+
1010
+
1011
// Standard two-way merge of two equally-sized sorted buffers into an empty
// destination of capacity 2k. Items are moved out of the sources.
template<typename T, typename C, typename A>
void quantiles_sketch<T, C, A>::merge_two_size_k_buffers(Level& src_1, Level& src_2, Level& dst) {
  if (src_1.size() != src_2.size()
      || src_1.size() * 2 != dst.capacity()
      || dst.size() != 0) {
    throw std::logic_error("Input invariants violated in merge_two_size_k_buffers()");
  }

  auto end1 = src_1.end(), end2 = src_2.end();
  auto it1 = src_1.begin(), it2 = src_2.begin();

  // TODO: probably actually doing copies given Level&?
  while (it1 != end1 && it2 != end2) {
    if (C()(*it1, *it2)) {
      dst.push_back(std::move(*it1++));
    } else {
      dst.push_back(std::move(*it2++));
    }
  }

  // Append whichever source still has a tail; exactly one of them must,
  // since the sources are the same size and at most one can be exhausted
  // when the loop above exits.
  if (it1 != end1) {
    dst.insert(dst.end(), it1, end1);
  } else {
    if (it2 == end2) { throw std::logic_error("it2 unexpectedly already at end of range"); }
    dst.insert(dst.end(), it2, end2);
  }
}
1038
+
1039
+
1040
// Merges src into tgt when both sketches share the same k. src's base-buffer
// items are replayed through update(); src's occupied levels are then
// carry-propagated directly into tgt's levels.
template<typename T, typename C, typename A>
template<typename FwdSk>
void quantiles_sketch<T, C, A>::standard_merge(quantiles_sketch& tgt, FwdSk&& src) {
  if (src.get_k() != tgt.get_k()) {
    throw std::invalid_argument("src.get_k() != tgt.get_k()");
  }
  if (src.is_empty()) {
    return;
  }

  uint64_t new_n = src.get_n() + tgt.get_n();

  // move items from src's base buffer
  for (uint16_t i = 0; i < src.base_buffer_.size(); ++i) {
    tgt.update(conditional_forward<FwdSk>(src.base_buffer_[i]));
  }

  // check (after moving raw items) if we need to extend levels array
  uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
  if (levels_needed > tgt.levels_.size()) {
    tgt.levels_.reserve(levels_needed);
    while (tgt.levels_.size() < levels_needed) {
      Level empty_level(tgt.allocator_);
      empty_level.reserve(tgt.get_k());
      tgt.levels_.push_back(std::move(empty_level));
    }
  }

  // Scratch space sized 2k for merging two k-sized levels during propagation.
  Level scratch_buf(tgt.allocator_);
  scratch_buf.reserve(2 * tgt.get_k());

  // Walk src's occupied levels: one set bit per occupied level.
  uint64_t src_pattern = src.bit_pattern_;
  for (uint8_t src_lvl = 0; src_pattern != 0; ++src_lvl, src_pattern >>= 1) {
    if ((src_pattern & 1) > 0) {
      scratch_buf.clear();

      // propagate-carry
      in_place_propagate_carry(src_lvl,
                               src.levels_[src_lvl], scratch_buf,
                               false, tgt);
      // update n_ at the end
    }
  }
  tgt.n_ = new_n;
  if ((tgt.get_n() / (2 * tgt.get_k())) != tgt.bit_pattern_) {
    throw std::logic_error("Failed internal consistency check after standard_merge()");
  }

  // update min and max values
  // can't just check is_empty() since min and max might not have been set if
  // there were no base buffer items added via update()
  if (tgt.min_value_ == nullptr) {
    tgt.min_value_ = new (tgt.allocator_.allocate(1)) T(*src.min_value_);
  } else {
    if (C()(*src.min_value_, *tgt.min_value_))
      *tgt.min_value_ = conditional_forward<FwdSk>(*src.min_value_);
  }

  if (tgt.max_value_ == nullptr) {
    tgt.max_value_ = new (tgt.allocator_.allocate(1)) T(*src.max_value_);
  } else {
    if (C()(*tgt.max_value_, *src.max_value_))
      *tgt.max_value_ = conditional_forward<FwdSk>(*src.max_value_);
  }
}
1105
+
1106
+
1107
// Merges a higher-resolution src (larger k) into tgt. src's k must be a
// multiple of tgt's k; each src level is downsampled by that factor and its
// items land lg(factor) levels higher in tgt to preserve total weight.
template<typename T, typename C, typename A>
template<typename FwdSk>
void quantiles_sketch<T, C, A>::downsampling_merge(quantiles_sketch& tgt, FwdSk&& src) {
  if (src.get_k() % tgt.get_k() != 0) {
    throw std::invalid_argument("src.get_k() is not a multiple of tgt.get_k()");
  }
  if (src.is_empty()) {
    return;
  }

  // The factor is a power of two (both k's are), so its log2 is the
  // number of trailing zeros.
  const uint16_t downsample_factor = src.get_k() / tgt.get_k();
  const uint8_t lg_sample_factor = count_trailing_zeros_in_u32(downsample_factor);

  uint64_t new_n = src.get_n() + tgt.get_n();

  // move items from src's base buffer
  for (uint16_t i = 0; i < src.base_buffer_.size(); ++i) {
    tgt.update(conditional_forward<FwdSk>(src.base_buffer_[i]));
  }

  // check (after moving raw items) if we need to extend levels array
  uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
  if (levels_needed > tgt.levels_.size()) {
    tgt.levels_.reserve(levels_needed);
    while (tgt.levels_.size() < levels_needed) {
      Level empty_level(tgt.allocator_);
      empty_level.reserve(tgt.get_k());
      tgt.levels_.push_back(std::move(empty_level));
    }
  }

  // down_buf holds the k-item downsampled copy of a src level;
  // scratch_buf is the 2k workspace for carry propagation.
  Level down_buf(tgt.allocator_);
  down_buf.reserve(tgt.get_k());

  Level scratch_buf(tgt.allocator_);
  scratch_buf.reserve(2 * tgt.get_k());

  uint64_t src_pattern = src.bit_pattern_;
  for (uint8_t src_lvl = 0; src_pattern != 0; ++src_lvl, src_pattern >>= 1) {
    if ((src_pattern & 1) > 0) {
      down_buf.clear();
      scratch_buf.clear();

      // zip with stride, leaving input buffer intact
      zip_buffer_with_stride(src.levels_[src_lvl], down_buf, downsample_factor);

      // propagate-carry
      in_place_propagate_carry(src_lvl + lg_sample_factor,
                               down_buf, scratch_buf,
                               false, tgt);
      // update n_ at the end
    }
  }
  tgt.n_ = new_n;
  if ((tgt.get_n() / (2 * tgt.get_k())) != tgt.bit_pattern_) {
    throw std::logic_error("Failed internal consistency check after downsampling_merge()");
  }

  // update min and max values
  // can't just check is_empty() since min and max might not have been set if
  // there were no base buffer items added via update()
  if (tgt.min_value_ == nullptr) {
    tgt.min_value_ = new (tgt.allocator_.allocate(1)) T(*src.min_value_);
  } else {
    if (C()(*src.min_value_, *tgt.min_value_))
      *tgt.min_value_ = conditional_forward<FwdSk>(*src.min_value_);
  }

  if (tgt.max_value_ == nullptr) {
    tgt.max_value_ = new (tgt.allocator_.allocate(1)) T(*src.max_value_);
  } else {
    if (C()(*tgt.max_value_, *src.max_value_))
      *tgt.max_value_ = conditional_forward<FwdSk>(*src.max_value_);
  }
}
1182
+
1183
+
1184
+ template<typename T, typename C, typename A>
1185
+ uint8_t quantiles_sketch<T, C, A>::lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit) {
1186
+ uint8_t pos = starting_bit & 0X3F;
1187
+ uint64_t my_bits = bits >> pos;
1188
+
1189
+ while ((my_bits & static_cast<uint64_t>(1)) != 0) {
1190
+ my_bits >>= 1;
1191
+ pos++;
1192
+ }
1193
+ return pos;
1194
+ }
1195
+
1196
// Deleter for a single allocator-managed T (used with the sketch's min/max
// pointers): runs the destructor, then returns the storage to the allocator.
// Null pointers are ignored.
template<typename T, typename C, typename A>
class quantiles_sketch<T, C, A>::item_deleter {
  public:
  item_deleter(const A& allocator): allocator_(allocator) {}
  void operator() (T* ptr) {
    if (ptr != nullptr) {
      ptr->~T();
      allocator_.deallocate(ptr, 1);
    }
  }
  private:
  A allocator_;  // copy of the sketch's allocator
};
1209
+
1210
// Deleter for an allocator-managed array of num_ Ts. destroy_ controls
// whether the element destructors run before deallocation; set_destroy()
// lets the owner disable destruction (e.g. once items have been moved out).
template<typename T, typename C, typename A>
class quantiles_sketch<T, C, A>::items_deleter {
  public:
  items_deleter(const A& allocator, bool destroy, size_t num): allocator_(allocator), destroy_(destroy), num_(num) {}
  void operator() (T* ptr) {
    if (ptr != nullptr) {
      if (destroy_) {
        for (size_t i = 0; i < num_; ++i) {
          ptr[i].~T();
        }
      }
      // Deallocate the whole array regardless of whether destructors ran.
      allocator_.deallocate(ptr, num_);
    }
  }
  void set_destroy(bool destroy) { destroy_ = destroy; }
  private:
  A allocator_;
  bool destroy_;  // whether operator() should run element destructors
  size_t num_;    // number of elements in the managed array
};
1230
+
1231
+
1232
+ // quantiles_sketch::const_iterator implementation
1233
+
1234
+ template<typename T, typename C, typename A>
1235
+ quantiles_sketch<T, C, A>::const_iterator::const_iterator(const Level& base_buffer,
1236
+ const std::vector<Level, AllocLevel>& levels,
1237
+ uint16_t k,
1238
+ uint64_t n,
1239
+ bool is_end):
1240
+ base_buffer_(base_buffer),
1241
+ levels_(levels),
1242
+ level_(-1),
1243
+ index_(0),
1244
+ bb_count_(compute_base_buffer_items(k, n)),
1245
+ bit_pattern_(compute_bit_pattern(k, n)),
1246
+ weight_(1),
1247
+ k_(k)
1248
+ {
1249
+ if (is_end) {
1250
+ // if exact mode: index_ = n is end
1251
+ // if sampling, level_ = max_level + 1 and index_ = 0 is end
1252
+ if (bit_pattern_ == 0) // only a valid check for exact mode in constructor
1253
+ index_ = static_cast<uint32_t>(n);
1254
+ else
1255
+ level_ = static_cast<int>(levels_.size());
1256
+ } else { // find first non-empty item
1257
+ if (bb_count_ == 0 && bit_pattern_ > 0) {
1258
+ level_ = 0;
1259
+ weight_ = 2;
1260
+ while ((bit_pattern_ & 0x01) == 0) {
1261
+ weight_ *= 2;
1262
+ ++level_;
1263
+ bit_pattern_ >>= 1;
1264
+ }
1265
+ }
1266
+ }
1267
+ }
1268
+
1269
// Advances within the current buffer; when it is exhausted, skips forward to
// the next occupied level (consuming bits of bit_pattern_), doubling the
// per-item weight for every level passed.
template<typename T, typename C, typename A>
typename quantiles_sketch<T, C, A>::const_iterator& quantiles_sketch<T, C, A>::const_iterator::operator++() {
  ++index_;

  if ((level_ == -1 && index_ == base_buffer_.size() && levels_.size() > 0) || (level_ >= 0 && index_ == k_)) { // go to the next non-empty level
    index_ = 0;
    do {
      ++level_;
      if (level_ > 0) bit_pattern_ = bit_pattern_ >> 1;
      if (bit_pattern_ == 0) return *this; // no occupied levels remain: end state
      weight_ *= 2;
    } while ((bit_pattern_ & static_cast<uint64_t>(1)) == 0);
  }
  return *this;
}
1284
+
1285
// Post-increment: advances the iterator and is meant to yield the prior position.
// NOTE(review): the declared return type is a reference, so returning the
// local copy 'tmp' produces a dangling reference (UB if the caller uses it).
// Fixing this requires changing the return type to by-value in the class
// declaration, which is outside this file section -- flagging rather than
// silently changing the interface here.
template<typename T, typename C, typename A>
typename quantiles_sketch<T, C, A>::const_iterator& quantiles_sketch<T, C, A>::const_iterator::operator++(int) {
  const_iterator tmp(*this);
  operator++();
  return tmp;
}
1291
+
1292
+ template<typename T, typename C, typename A>
1293
+ bool quantiles_sketch<T, C, A>::const_iterator::operator==(const const_iterator& other) const {
1294
+ return level_ == other.level_ && index_ == other.index_;
1295
+ }
1296
+
1297
+ template<typename T, typename C, typename A>
1298
+ bool quantiles_sketch<T, C, A>::const_iterator::operator!=(const const_iterator& other) const {
1299
+ return !operator==(other);
1300
+ }
1301
+
1302
+ template<typename T, typename C, typename A>
1303
+ std::pair<const T&, const uint64_t> quantiles_sketch<T, C, A>::const_iterator::operator*() const {
1304
+ return std::pair<const T&, const uint64_t>(level_ == -1 ? base_buffer_[index_] : levels_[level_][index_], weight_);
1305
+ }
1306
+
1307
+ } /* namespace datasketches */
1308
+
1309
+ #endif // _QUANTILES_SKETCH_IMPL_HPP_