datasketches 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -0,0 +1,1309 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _QUANTILES_SKETCH_IMPL_HPP_
21
+ #define _QUANTILES_SKETCH_IMPL_HPP_
22
+
23
+ #include <cmath>
24
+ #include <algorithm>
25
+ #include <stdexcept>
26
+ #include <iomanip>
27
+ #include <sstream>
28
+
29
+ #include "common_defs.hpp"
30
+ #include "count_zeros.hpp"
31
+ #include "conditional_forward.hpp"
32
+ #include "quantiles_sketch.hpp"
33
+
34
+ namespace datasketches {
35
+
36
+ template<typename T, typename C, typename A>
37
+ quantiles_sketch<T, C, A>::quantiles_sketch(uint16_t k, const A& allocator):
38
+ allocator_(allocator),
39
+ k_(k),
40
+ n_(0),
41
+ bit_pattern_(0),
42
+ base_buffer_(allocator_),
43
+ levels_(allocator_),
44
+ min_value_(nullptr),
45
+ max_value_(nullptr),
46
+ is_sorted_(true)
47
+ {
48
+ check_k(k_);
49
+ base_buffer_.reserve(2 * std::min(quantiles_constants::MIN_K, k));
50
+ }
51
+
52
+ template<typename T, typename C, typename A>
53
+ quantiles_sketch<T, C, A>::quantiles_sketch(const quantiles_sketch& other):
54
+ allocator_(other.allocator_),
55
+ k_(other.k_),
56
+ n_(other.n_),
57
+ bit_pattern_(other.bit_pattern_),
58
+ base_buffer_(other.base_buffer_),
59
+ levels_(other.levels_),
60
+ min_value_(nullptr),
61
+ max_value_(nullptr),
62
+ is_sorted_(other.is_sorted_)
63
+ {
64
+ if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
65
+ if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
66
+ for (size_t i = 0; i < levels_.size(); ++i) {
67
+ if (levels_[i].capacity() != other.levels_[i].capacity()) {
68
+ levels_[i].reserve(other.levels_[i].capacity());
69
+ }
70
+ }
71
+ }
72
+
73
+ template<typename T, typename C, typename A>
74
+ quantiles_sketch<T, C, A>::quantiles_sketch(quantiles_sketch&& other) noexcept:
75
+ allocator_(other.allocator_),
76
+ k_(other.k_),
77
+ n_(other.n_),
78
+ bit_pattern_(other.bit_pattern_),
79
+ base_buffer_(std::move(other.base_buffer_)),
80
+ levels_(std::move(other.levels_)),
81
+ min_value_(other.min_value_),
82
+ max_value_(other.max_value_),
83
+ is_sorted_(other.is_sorted_)
84
+ {
85
+ other.min_value_ = nullptr;
86
+ other.max_value_ = nullptr;
87
+ }
88
+
89
+ template<typename T, typename C, typename A>
90
+ quantiles_sketch<T, C, A>& quantiles_sketch<T, C, A>::operator=(const quantiles_sketch& other) {
91
+ quantiles_sketch<T, C, A> copy(other);
92
+ std::swap(allocator_, copy.allocator_);
93
+ std::swap(k_, copy.k_);
94
+ std::swap(n_, copy.n_);
95
+ std::swap(bit_pattern_, copy.bit_pattern_);
96
+ std::swap(base_buffer_, copy.base_buffer_);
97
+ std::swap(levels_, copy.levels_);
98
+ std::swap(min_value_, copy.min_value_);
99
+ std::swap(max_value_, copy.max_value_);
100
+ std::swap(is_sorted_, copy.is_sorted_);
101
+ return *this;
102
+ }
103
+
104
+ template<typename T, typename C, typename A>
105
+ quantiles_sketch<T, C, A>& quantiles_sketch<T, C, A>::operator=(quantiles_sketch&& other) noexcept {
106
+ std::swap(allocator_, other.allocator_);
107
+ std::swap(k_, other.k_);
108
+ std::swap(n_, other.n_);
109
+ std::swap(bit_pattern_, other.bit_pattern_);
110
+ std::swap(base_buffer_, other.base_buffer_);
111
+ std::swap(levels_, other.levels_);
112
+ std::swap(min_value_, other.min_value_);
113
+ std::swap(max_value_, other.max_value_);
114
+ std::swap(is_sorted_, other.is_sorted_);
115
+ return *this;
116
+ }
117
+
118
+ template<typename T, typename C, typename A>
119
+ quantiles_sketch<T, C, A>::quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
120
+ Level&& base_buffer, VectorLevels&& levels,
121
+ std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value,
122
+ bool is_sorted, const A& allocator) :
123
+ allocator_(allocator),
124
+ k_(k),
125
+ n_(n),
126
+ bit_pattern_(bit_pattern),
127
+ base_buffer_(std::move(base_buffer)),
128
+ levels_(std::move(levels)),
129
+ min_value_(min_value.release()),
130
+ max_value_(max_value.release()),
131
+ is_sorted_(is_sorted)
132
+ {
133
+ uint32_t item_count = base_buffer_.size();
134
+ for (Level& lvl : levels_) {
135
+ item_count += lvl.size();
136
+ }
137
+ if (item_count != compute_retained_items(k_, n_))
138
+ throw std::logic_error("Item count does not match value computed from k, n");
139
+ }
140
+
141
+ template<typename T, typename C, typename A>
142
+ quantiles_sketch<T, C, A>::~quantiles_sketch() {
143
+ if (min_value_ != nullptr) {
144
+ min_value_->~T();
145
+ allocator_.deallocate(min_value_, 1);
146
+ }
147
+ if (max_value_ != nullptr) {
148
+ max_value_->~T();
149
+ allocator_.deallocate(max_value_, 1);
150
+ }
151
+ }
152
+
153
+ template<typename T, typename C, typename A>
154
+ template<typename FwdT>
155
+ void quantiles_sketch<T, C, A>::update(FwdT&& item) {
156
+ if (!check_update_value(item)) { return; }
157
+ if (is_empty()) {
158
+ min_value_ = new (allocator_.allocate(1)) T(item);
159
+ max_value_ = new (allocator_.allocate(1)) T(item);
160
+ } else {
161
+ if (C()(item, *min_value_)) *min_value_ = item;
162
+ if (C()(*max_value_, item)) *max_value_ = item;
163
+ }
164
+
165
+ // if exceed capacity, grow until size 2k -- assumes eager processing
166
+ if (base_buffer_.size() + 1 > base_buffer_.capacity())
167
+ grow_base_buffer();
168
+
169
+ base_buffer_.push_back(std::forward<FwdT>(item));
170
+ ++n_;
171
+
172
+ if (base_buffer_.size() > 1)
173
+ is_sorted_ = false;
174
+
175
+ if (base_buffer_.size() == 2 * k_)
176
+ process_full_base_buffer();
177
+ }
178
+
179
+ template<typename T, typename C, typename A>
180
+ template<typename FwdSk>
181
+ void quantiles_sketch<T, C, A>::merge(FwdSk&& other) {
182
+ if (other.is_empty()) {
183
+ return; // nothing to do
184
+ } else if (!other.is_estimation_mode()) {
185
+ // other is exact, stream in regardless of k
186
+ for (auto item : other.base_buffer_) {
187
+ update(conditional_forward<FwdSk>(item));
188
+ }
189
+ return; // we're done
190
+ }
191
+
192
+ // we know other has data and is in estimation mode
193
+ if (is_estimation_mode()) {
194
+ if (k_ == other.get_k()) {
195
+ standard_merge(*this, other);
196
+ } else if (k_ > other.get_k()) {
197
+ quantiles_sketch sk_copy(other);
198
+ downsampling_merge(sk_copy, *this);
199
+ *this = sk_copy;
200
+ } else { // k_ < other.get_k()
201
+ downsampling_merge(*this, other);
202
+ }
203
+ } else {
204
+ // exact or empty
205
+ quantiles_sketch sk_copy(other);
206
+ if (k_ <= other.get_k()) {
207
+ if (!is_empty()) {
208
+ for (uint16_t i = 0; i < base_buffer_.size(); ++i) {
209
+ sk_copy.update(std::move(base_buffer_[i]));
210
+ }
211
+ }
212
+ } else { // k_ > other.get_k()
213
+ downsampling_merge(sk_copy, *this);
214
+ }
215
+ *this = sk_copy;
216
+ }
217
+ }
218
+
219
+ template<typename T, typename C, typename A>
220
+ template<typename SerDe>
221
+ void quantiles_sketch<T, C, A>::serialize(std::ostream& os, const SerDe& serde) const {
222
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
223
+ write(os, preamble_longs);
224
+ const uint8_t ser_ver = SERIAL_VERSION;
225
+ write(os, ser_ver);
226
+ const uint8_t family = FAMILY;
227
+ write(os, family);
228
+
229
+ // side-effect: sort base buffer since always compact
230
+ // can't set is_sorted_ since const method
231
+ std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
232
+
233
+ // empty, ordered, compact are valid flags
234
+ const uint8_t flags_byte(
235
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
236
+ | (1 << flags::IS_SORTED) // always sorted as side effect noted above
237
+ | (1 << flags::IS_COMPACT) // always compact -- could be optional for numeric types?
238
+ );
239
+ write(os, flags_byte);
240
+ write(os, k_);
241
+ uint16_t unused = 0;
242
+ write(os, unused);
243
+
244
+ if (!is_empty()) {
245
+ write(os, n_);
246
+
247
+ // min and max
248
+ serde.serialize(os, min_value_, 1);
249
+ serde.serialize(os, max_value_, 1);
250
+
251
+ // base buffer items
252
+ serde.serialize(os, base_buffer_.data(), static_cast<unsigned>(base_buffer_.size()));
253
+
254
+ // levels, only when data is present
255
+ for (Level lvl : levels_) {
256
+ if (lvl.size() > 0)
257
+ serde.serialize(os, lvl.data(), static_cast<unsigned>(lvl.size()));
258
+ }
259
+ }
260
+ }
261
+
262
+ template<typename T, typename C, typename A>
263
+ template<typename SerDe>
264
+ auto quantiles_sketch<T, C, A>::serialize(unsigned header_size_bytes, const SerDe& serde) const -> vector_bytes {
265
+ const size_t size = get_serialized_size_bytes(serde) + header_size_bytes;
266
+ vector_bytes bytes(size, 0, allocator_);
267
+ uint8_t* ptr = bytes.data() + header_size_bytes;
268
+ const uint8_t* end_ptr = ptr + size;
269
+
270
+ const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
271
+ ptr += copy_to_mem(preamble_longs, ptr);
272
+ const uint8_t ser_ver = SERIAL_VERSION;
273
+ ptr += copy_to_mem(ser_ver, ptr);
274
+ const uint8_t family = FAMILY;
275
+ ptr += copy_to_mem(family, ptr);
276
+
277
+ // side-effect: sort base buffer since always compact
278
+ // can't set is_sorted_ since const method
279
+ std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
280
+
281
+ // empty, ordered, compact are valid flags
282
+ const uint8_t flags_byte(
283
+ (is_empty() ? 1 << flags::IS_EMPTY : 0)
284
+ | (1 << flags::IS_SORTED) // always sorted as side effect noted above
285
+ | (1 << flags::IS_COMPACT) // always compact
286
+ );
287
+ ptr += copy_to_mem(flags_byte, ptr);
288
+ ptr += copy_to_mem(k_, ptr);
289
+ ptr += sizeof(uint16_t); // 2 unused bytes
290
+
291
+ if (!is_empty()) {
292
+
293
+ ptr += copy_to_mem(n_, ptr);
294
+
295
+ // min and max
296
+ ptr += serde.serialize(ptr, end_ptr - ptr, min_value_, 1);
297
+ ptr += serde.serialize(ptr, end_ptr - ptr, max_value_, 1);
298
+
299
+ // base buffer items
300
+ if (base_buffer_.size() > 0)
301
+ ptr += serde.serialize(ptr, end_ptr - ptr, base_buffer_.data(), static_cast<unsigned>(base_buffer_.size()));
302
+
303
+ // levels, only when data is present
304
+ for (Level lvl : levels_) {
305
+ if (lvl.size() > 0)
306
+ ptr += serde.serialize(ptr, end_ptr - ptr, lvl.data(), static_cast<unsigned>(lvl.size()));
307
+ }
308
+ }
309
+
310
+ return bytes;
311
+ }
312
+
313
+ template<typename T, typename C, typename A>
314
+ template<typename SerDe>
315
+ auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde, const A &allocator) -> quantiles_sketch {
316
+ const auto preamble_longs = read<uint8_t>(is);
317
+ const auto serial_version = read<uint8_t>(is);
318
+ const auto family_id = read<uint8_t>(is);
319
+ const auto flags_byte = read<uint8_t>(is);
320
+ const auto k = read<uint16_t>(is);
321
+ read<uint16_t>(is); // unused
322
+
323
+ check_k(k);
324
+ check_serial_version(serial_version); // a little redundant with the header check
325
+ check_family_id(family_id);
326
+ check_header_validity(preamble_longs, flags_byte, serial_version);
327
+
328
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
329
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
330
+ if (is_empty) {
331
+ return quantiles_sketch(k, allocator);
332
+ }
333
+
334
+ const auto items_seen = read<uint64_t>(is);
335
+
336
+ const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
337
+ const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
338
+
339
+ A alloc(allocator);
340
+ auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
341
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
342
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
343
+ std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
344
+ std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
345
+
346
+ serde.deserialize(is, min_value_buffer.get(), 1);
347
+ // serde call did not throw, repackage with destructor
348
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
349
+ serde.deserialize(is, max_value_buffer.get(), 1);
350
+ // serde call did not throw, repackage with destructor
351
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
352
+
353
+ if (serial_version == 1) {
354
+ read<uint64_t>(is); // no longer used
355
+ }
356
+
357
+ // allocate buffers as needed
358
+ const uint8_t levels_needed = compute_levels_needed(k, items_seen);
359
+ const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
360
+
361
+ // Java provides a compact storage layout for a sketch of primitive doubles. The C++ version
362
+ // does not currently operate sketches in compact mode, but will only serialize as compact
363
+ // to avoid complications around serialization of empty values for generic type T. We also need
364
+ // to be able to ingest either serialized format from Java.
365
+
366
+ // load base buffer
367
+ const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
368
+ uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
369
+ Level base_buffer = deserialize_array(is, bb_items, 2 * k, serde, allocator);
370
+ if (items_to_read > bb_items) { // either equal or greater, never read fewer items
371
+ // read remaining items, but don't store them
372
+ deserialize_array(is, items_to_read - bb_items, items_to_read - bb_items, serde, allocator);
373
+ }
374
+
375
+ // populate vector of Levels directly
376
+ VectorLevels levels(allocator);
377
+ levels.reserve(levels_needed);
378
+ if (levels_needed > 0) {
379
+ uint64_t working_pattern = bit_pattern;
380
+ for (size_t i = 0; i < levels_needed; ++i, working_pattern >>= 1) {
381
+ if ((working_pattern & 0x01) == 1) {
382
+ Level level = deserialize_array(is, k, k, serde, allocator);
383
+ levels.push_back(std::move(level));
384
+ } else {
385
+ Level level(allocator);
386
+ level.reserve(k);
387
+ levels.push_back(std::move(level));
388
+ }
389
+ }
390
+ }
391
+
392
+ return quantiles_sketch(k, items_seen, bit_pattern,
393
+ std::move(base_buffer), std::move(levels), std::move(min_value), std::move(max_value), is_sorted, allocator);
394
+ }
395
+
396
+ template<typename T, typename C, typename A>
397
+ template<typename SerDe>
398
+ auto quantiles_sketch<T, C, A>::deserialize_array(std::istream& is, uint32_t num_items, uint32_t capacity, const SerDe& serde, const A& allocator) -> Level {
399
+ A alloc(allocator);
400
+ std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(allocator, false, num_items));
401
+ serde.deserialize(is, items.get(), num_items);
402
+ // serde did not throw, enable destructors
403
+ items.get_deleter().set_destroy(true);
404
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
405
+
406
+ // successfully read, now put into a Level
407
+ Level level(allocator);
408
+ level.reserve(capacity);
409
+ level.insert(level.begin(),
410
+ std::make_move_iterator(items.get()),
411
+ std::make_move_iterator(items.get() + num_items));
412
+ return level;
413
+ }
414
+
415
+ template<typename T, typename C, typename A>
416
+ template<typename SerDe>
417
+ auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, const SerDe& serde, const A &allocator) -> quantiles_sketch {
418
+ ensure_minimum_memory(size, 8);
419
+ const char* ptr = static_cast<const char*>(bytes);
420
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
421
+
422
+ uint8_t preamble_longs;
423
+ ptr += copy_from_mem(ptr, preamble_longs);
424
+ uint8_t serial_version;
425
+ ptr += copy_from_mem(ptr, serial_version);
426
+ uint8_t family_id;
427
+ ptr += copy_from_mem(ptr, family_id);
428
+ uint8_t flags_byte;
429
+ ptr += copy_from_mem(ptr, flags_byte);
430
+ uint16_t k;
431
+ ptr += copy_from_mem(ptr, k);
432
+ uint16_t unused;
433
+ ptr += copy_from_mem(ptr, unused);
434
+
435
+ check_k(k);
436
+ check_serial_version(serial_version); // a little redundant with the header check
437
+ check_family_id(family_id);
438
+ check_header_validity(preamble_longs, flags_byte, serial_version);
439
+
440
+ const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
441
+ if (is_empty) {
442
+ return quantiles_sketch(k, allocator);
443
+ }
444
+
445
+ ensure_minimum_memory(size, 16);
446
+ uint64_t items_seen;
447
+ ptr += copy_from_mem(ptr, items_seen);
448
+
449
+ const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
450
+ const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;
451
+
452
+ A alloc(allocator);
453
+ auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
454
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
455
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
456
+ std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
457
+ std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
458
+
459
+ ptr += serde.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
460
+ // serde call did not throw, repackage with destructor
461
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
462
+ ptr += serde.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
463
+ // serde call did not throw, repackage with destructor
464
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
465
+
466
+ if (serial_version == 1) {
467
+ uint64_t unused_long;
468
+ ptr += copy_from_mem(ptr, unused_long); // no longer used
469
+ }
470
+
471
+ // allocate buffers as needed
472
+ const uint8_t levels_needed = compute_levels_needed(k, items_seen);
473
+ const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);
474
+
475
+ // Java provides a compact storage layout for a sketch of primitive doubles. The C++ version
476
+ // does not currently operate sketches in compact mode, but will only serialize as compact
477
+ // to avoid complications around serialization of empty values for generic type T. We also need
478
+ // to be able to ingest either serialized format from Java.
479
+
480
+ // load base buffer
481
+ const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
482
+ uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
483
+ auto base_buffer_pair = deserialize_array(ptr, end_ptr - ptr, bb_items, 2 * k, serde, allocator);
484
+ ptr += base_buffer_pair.second;
485
+ if (items_to_read > bb_items) { // either equal or greater, never read fewer items
486
+ // read remaining items, only use to advance the pointer
487
+ auto extras = deserialize_array(ptr, end_ptr - ptr, items_to_read - bb_items, items_to_read - bb_items, serde, allocator);
488
+ ptr += extras.second;
489
+ }
490
+
491
+ // populate vector of Levels directly
492
+ VectorLevels levels(allocator);
493
+ levels.reserve(levels_needed);
494
+ if (levels_needed > 0) {
495
+ uint64_t working_pattern = bit_pattern;
496
+ for (size_t i = 0; i < levels_needed; ++i, working_pattern >>= 1) {
497
+
498
+ if ((working_pattern & 0x01) == 1) {
499
+ auto pair = deserialize_array(ptr, end_ptr - ptr, k, k, serde, allocator);
500
+ ptr += pair.second;
501
+ levels.push_back(std::move(pair.first));
502
+ } else {
503
+ Level level(allocator);
504
+ level.reserve(k);
505
+ levels.push_back(std::move(level));
506
+ }
507
+ }
508
+ }
509
+
510
+ return quantiles_sketch(k, items_seen, bit_pattern,
511
+ std::move(base_buffer_pair.first), std::move(levels), std::move(min_value), std::move(max_value), is_sorted, allocator);
512
+ }
513
+
514
+ template<typename T, typename C, typename A>
515
+ template<typename SerDe>
516
+ auto quantiles_sketch<T, C, A>::deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capacity, const SerDe& serde, const A& allocator)
517
+ -> std::pair<Level, size_t> {
518
+ const char* ptr = static_cast<const char*>(bytes);
519
+ const char* end_ptr = static_cast<const char*>(bytes) + size;
520
+ A alloc(allocator);
521
+ std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(allocator, false, num_items));
522
+ ptr += serde.deserialize(ptr, end_ptr - ptr, items.get(), num_items);
523
+ // serde did not throw, enable destructors
524
+ items.get_deleter().set_destroy(true);
525
+
526
+ // successfully read, now put into a Level
527
+ Level level(allocator);
528
+ level.reserve(capacity);
529
+ level.insert(level.begin(),
530
+ std::make_move_iterator(items.get()),
531
+ std::make_move_iterator(items.get() + num_items));
532
+
533
+ return std::pair<Level, size_t>(std::move(level), ptr - static_cast<const char*>(bytes));
534
+ }
535
+
536
+ template<typename T, typename C, typename A>
537
+ string<A> quantiles_sketch<T, C, A>::to_string(bool print_levels, bool print_items) const {
538
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
539
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
540
+ std::ostringstream os;
541
+ os << "### Quantiles Sketch summary:" << std::endl;
542
+ os << " K : " << k_ << std::endl;
543
+ os << " N : " << n_ << std::endl;
544
+ os << " Epsilon : " << std::setprecision(3) << get_normalized_rank_error(false) * 100 << "%" << std::endl;
545
+ os << " Epsilon PMF : " << get_normalized_rank_error(true) * 100 << "%" << std::endl;
546
+ os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
547
+ os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
548
+ os << " Levels (w/o BB): " << levels_.size() << std::endl;
549
+ os << " Used Levels : " << compute_valid_levels(bit_pattern_) << std::endl;
550
+ os << " Retained items : " << get_num_retained() << std::endl;
551
+ if (!is_empty()) {
552
+ os << " Min value : " << *min_value_ << std::endl;
553
+ os << " Max value : " << *max_value_ << std::endl;
554
+ }
555
+ os << "### End sketch summary" << std::endl;
556
+
557
+ if (print_levels) {
558
+ os << "### Quantiles Sketch levels:" << std::endl;
559
+ os << " index: items in use" << std::endl;
560
+ os << " BB: " << base_buffer_.size() << std::endl;
561
+ for (uint8_t i = 0; i < levels_.size(); i++) {
562
+ os << " " << static_cast<unsigned int>(i) << ": " << levels_[i].size() << std::endl;
563
+ }
564
+ os << "### End sketch levels" << std::endl;
565
+ }
566
+
567
+ if (print_items) {
568
+ os << "### Quantiles Sketch data:" << std::endl;
569
+ uint8_t level = 0;
570
+ os << " BB:" << std::endl;
571
+ for (const T& item : base_buffer_) {
572
+ os << " " << std::to_string(item) << std::endl;
573
+ }
574
+ for (uint8_t i = 0; i < levels_.size(); ++i) {
575
+ os << " level " << static_cast<unsigned int>(level) << ":" << std::endl;
576
+ for (const T& item : levels_[i]) {
577
+ os << " " << std::to_string(item) << std::endl;
578
+ }
579
+ }
580
+ os << "### End sketch data" << std::endl;
581
+ }
582
+ return string<A>(os.str().c_str(), allocator_);
583
+ }
584
+
585
template<typename T, typename C, typename A>
uint16_t quantiles_sketch<T, C, A>::get_k() const {
  // Configured parameter k (accuracy/size trade-off), fixed at construction.
  return k_;
}
589
+
590
template<typename T, typename C, typename A>
uint64_t quantiles_sketch<T, C, A>::get_n() const {
  // Total number of items presented to the sketch (not the number retained).
  return n_;
}
594
+
595
template<typename T, typename C, typename A>
bool quantiles_sketch<T, C, A>::is_empty() const {
  // Empty iff no items have ever been presented.
  return n_ == 0;
}
599
+
600
template<typename T, typename C, typename A>
bool quantiles_sketch<T, C, A>::is_estimation_mode() const {
  // A non-zero bit pattern means at least one level is occupied, i.e. the
  // sketch has compacted and results are estimates rather than exact.
  return bit_pattern_ != 0;
}
604
+
605
template<typename T, typename C, typename A>
uint32_t quantiles_sketch<T, C, A>::get_num_retained() const {
  // Derived from (k, n) rather than stored: base-buffer items plus k items
  // per occupied level.
  return compute_retained_items(k_, n_);
}
609
+
610
+ template<typename T, typename C, typename A>
611
+ const T& quantiles_sketch<T, C, A>::get_min_value() const {
612
+ if (is_empty()) return get_invalid_value();
613
+ return *min_value_;
614
+ }
615
+
616
+ template<typename T, typename C, typename A>
617
+ const T& quantiles_sketch<T, C, A>::get_max_value() const {
618
+ if (is_empty()) return get_invalid_value();
619
+ return *max_value_;
620
+ }
621
+
622
+ template<typename T, typename C, typename A>
623
+ C quantiles_sketch<T, C, A>::get_comparator() const {
624
+ return C();
625
+ }
626
+
627
+ // implementation for fixed-size arithmetic types (integral and floating point)
628
+ template<typename T, typename C, typename A>
629
+ template<typename SerDe, typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
630
+ size_t quantiles_sketch<T, C, A>::get_serialized_size_bytes(const SerDe&) const {
631
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
632
+ return DATA_START + ((get_num_retained() + 2) * sizeof(TT));
633
+ }
634
+
635
+ // implementation for all other types
636
+ template<typename T, typename C, typename A>
637
+ template<typename SerDe, typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
638
+ size_t quantiles_sketch<T, C, A>::get_serialized_size_bytes(const SerDe& serde) const {
639
+ if (is_empty()) { return EMPTY_SIZE_BYTES; }
640
+ size_t size = DATA_START;
641
+ size += serde.size_of_item(*min_value_);
642
+ size += serde.size_of_item(*max_value_);
643
+ for (auto it: *this) size += serde.size_of_item(it.first);
644
+ return size;
645
+ }
646
+
647
template<typename T, typename C, typename A>
double quantiles_sketch<T, C, A>::get_normalized_rank_error(bool is_pmf) const {
  // Instance convenience wrapper around the static overload, using this
  // sketch's configured k.
  return get_normalized_rank_error(k_, is_pmf);
}
651
+
652
+ template<typename T, typename C, typename A>
653
+ double quantiles_sketch<T, C, A>::get_normalized_rank_error(uint16_t k, bool is_pmf) {
654
+ return is_pmf
655
+ ? 1.854 / std::pow(k, 0.9657)
656
+ : 1.576 / std::pow(k, 0.9726);
657
+ }
658
+
659
template<typename T, typename C, typename A>
template<bool inclusive>
quantile_sketch_sorted_view<T, C, A> quantiles_sketch<T, C, A>::get_sorted_view(bool cumulative) const {
  // Builds a merged, weight-annotated view of all retained items, optionally
  // converting the weights to a cumulative distribution.
  // allow side-effect of sorting the base buffer; can't set the flag since
  // this is a const method
  if (!is_sorted_) {
    std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
  }
  quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);

  // base-buffer items carry weight 1; level L carries weight 2^(L+1)
  uint64_t weight = 1;
  view.add(base_buffer_.begin(), base_buffer_.end(), weight);
  for (auto& level : levels_) {
    weight <<= 1;
    if (level.empty()) { continue; }
    view.add(level.begin(), level.end(), weight);
  }

  if (cumulative) view.template convert_to_cummulative<inclusive>();
  return view;
}
680
+
681
+ template<typename T, typename C, typename A>
682
+ template<bool inclusive>
683
+ auto quantiles_sketch<T, C, A>::get_quantile(double rank) const -> quantile_return_type {
684
+ if (is_empty()) return get_invalid_value();
685
+ if (rank == 0.0) return *min_value_;
686
+ if (rank == 1.0) return *max_value_;
687
+ if ((rank < 0.0) || (rank > 1.0)) {
688
+ throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
689
+ }
690
+ // possible side-effect: sorting base buffer
691
+ return get_sorted_view<inclusive>(true).get_quantile(rank);
692
+ }
693
+
694
+ template<typename T, typename C, typename A>
695
+ template<bool inclusive>
696
+ std::vector<T, A> quantiles_sketch<T, C, A>::get_quantiles(const double* ranks, uint32_t size) const {
697
+ std::vector<T, A> quantiles(allocator_);
698
+ if (is_empty()) return quantiles;
699
+ quantiles.reserve(size);
700
+
701
+ // possible side-effect: sorting base buffer
702
+ auto view = get_sorted_view<inclusive>(true);
703
+
704
+ for (uint32_t i = 0; i < size; ++i) {
705
+ const double rank = ranks[i];
706
+ if ((rank < 0.0) || (rank > 1.0)) {
707
+ throw std::invalid_argument("rank cannot be less than zero or greater than 1.0");
708
+ }
709
+ if (rank == 0.0) quantiles.push_back(*min_value_);
710
+ else if (rank == 1.0) quantiles.push_back(*max_value_);
711
+ else {
712
+ quantiles.push_back(view.get_quantile(rank));
713
+ }
714
+ }
715
+ return quantiles;
716
+ }
717
+
718
+ template<typename T, typename C, typename A>
719
+ template<bool inclusive>
720
+ std::vector<T, A> quantiles_sketch<T, C, A>::get_quantiles(uint32_t num) const {
721
+ if (is_empty()) return std::vector<T, A>(allocator_);
722
+ if (num == 0) {
723
+ throw std::invalid_argument("num must be > 0");
724
+ }
725
+ vector_double fractions(num, 0, allocator_);
726
+ fractions[0] = 0.0;
727
+ for (size_t i = 1; i < num; i++) {
728
+ fractions[i] = static_cast<double>(i) / (num - 1);
729
+ }
730
+ if (num > 1) {
731
+ fractions[num - 1] = 1.0;
732
+ }
733
+ return get_quantiles<inclusive>(fractions.data(), num);
734
+ }
735
+
736
+ template<typename T, typename C, typename A>
737
+ template<bool inclusive>
738
+ double quantiles_sketch<T, C, A>::get_rank(const T& value) const {
739
+ if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
740
+ uint64_t weight = 1;
741
+ uint64_t total = 0;
742
+ for (const T &item: base_buffer_) {
743
+ if (inclusive ? !C()(value, item) : C()(item, value))
744
+ total += weight;
745
+ }
746
+
747
+ weight *= 2;
748
+ for (uint8_t level = 0; level < levels_.size(); ++level, weight *= 2) {
749
+ if (levels_[level].empty()) { continue; }
750
+ const T* data = levels_[level].data();
751
+ for (uint16_t i = 0; i < k_; ++i) {
752
+ if (inclusive ? !C()(value, data[i]) : C()(data[i], value))
753
+ total += weight;
754
+ else
755
+ break; // levels are sorted, no point comparing further
756
+ }
757
+ }
758
+ return (double) total / n_;
759
+ }
760
+
761
+ template<typename T, typename C, typename A>
762
+ template<bool inclusive>
763
+ auto quantiles_sketch<T, C, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
764
+ auto buckets = get_CDF<inclusive>(split_points, size);
765
+ if (is_empty()) return buckets;
766
+ for (uint32_t i = size; i > 0; --i) {
767
+ buckets[i] -= buckets[i - 1];
768
+ }
769
+ return buckets;
770
+ }
771
+
772
+ template<typename T, typename C, typename A>
773
+ template<bool inclusive>
774
+ auto quantiles_sketch<T, C, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
775
+ vector_double buckets(allocator_);
776
+ if (is_empty()) return buckets;
777
+ check_split_points(split_points, size);
778
+ buckets.reserve(size + 1);
779
+ for (uint32_t i = 0; i < size; ++i) buckets.push_back(get_rank<inclusive>(split_points[i]));
780
+ buckets.push_back(1);
781
+ return buckets;
782
+ }
783
+
784
+ template<typename T, typename C, typename A>
785
+ uint32_t quantiles_sketch<T, C, A>::compute_retained_items(const uint16_t k, const uint64_t n) {
786
+ uint32_t bb_count = compute_base_buffer_items(k, n);
787
+ uint64_t bit_pattern = compute_bit_pattern(k, n);
788
+ uint32_t valid_levels = compute_valid_levels(bit_pattern);
789
+ return bb_count + (k * valid_levels);
790
+ }
791
+
792
template<typename T, typename C, typename A>
uint32_t quantiles_sketch<T, C, A>::compute_base_buffer_items(const uint16_t k, const uint64_t n) {
  // Items beyond complete 2k-sized blocks remain in the base buffer.
  return n % (static_cast<uint64_t>(2) * k);
}
796
+
797
template<typename T, typename C, typename A>
uint64_t quantiles_sketch<T, C, A>::compute_bit_pattern(const uint16_t k, const uint64_t n) {
  // Each full block of 2k items contributes one carry; bit L set means
  // level L is occupied.
  return n / (static_cast<uint64_t>(2) * k);
}
801
+
802
+ template<typename T, typename C, typename A>
803
+ uint32_t quantiles_sketch<T, C, A>::compute_valid_levels(const uint64_t bit_pattern) {
804
+ // TODO: Java's Long.bitCount() probably uses a better method
805
+ uint64_t bp = bit_pattern;
806
+ uint32_t count = 0;
807
+ while (bp > 0) {
808
+ if ((bp & 0x01) == 1) ++count;
809
+ bp >>= 1;
810
+ }
811
+ return count;
812
+ }
813
+
814
template<typename T, typename C, typename A>
uint8_t quantiles_sketch<T, C, A>::compute_levels_needed(const uint16_t k, const uint64_t n) {
  // Position of the highest set bit in the bit pattern n/(2k) = number of
  // levels required to hold it.
  // NOTE(review): assumes the project helper count_leading_zeros_in_u64
  // returns 64 for input 0, yielding 0 levels — confirm against its definition.
  return static_cast<uint8_t>(64U) - count_leading_zeros_in_u64(n / (2 * k));
}
818
+
819
+ template<typename T, typename C, typename A>
820
+ void quantiles_sketch<T, C, A>::check_k(uint16_t k) {
821
+ if (k < quantiles_constants::MIN_K || k > quantiles_constants::MAX_K || (k & (k - 1)) != 0) {
822
+ throw std::invalid_argument("k must be a power of 2 that is >= "
823
+ + std::to_string(quantiles_constants::MIN_K) + " and <= "
824
+ + std::to_string(quantiles_constants::MAX_K) + ". Found: " + std::to_string(k));
825
+ }
826
+ }
827
+
828
+ template<typename T, typename C, typename A>
829
+ void quantiles_sketch<T, C, A>::check_serial_version(uint8_t serial_version) {
830
+ if (serial_version == SERIAL_VERSION || serial_version == SERIAL_VERSION_1 || serial_version == SERIAL_VERSION_2)
831
+ return;
832
+ else
833
+ throw std::invalid_argument("Possible corruption. Unrecognized serialization version: " + std::to_string(serial_version));
834
+ }
835
+
836
+ template<typename T, typename C, typename A>
837
+ void quantiles_sketch<T, C, A>::check_family_id(uint8_t family_id) {
838
+ if (family_id == FAMILY)
839
+ return;
840
+ else
841
+ throw std::invalid_argument("Possible corruption. Family id does not indicate quantiles sketch: " + std::to_string(family_id));
842
+ }
843
+
844
+ template<typename T, typename C, typename A>
845
+ void quantiles_sketch<T, C, A>::check_header_validity(uint8_t preamble_longs, uint8_t flags_byte, uint8_t serial_version) {
846
+ bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
847
+ bool compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
848
+
849
+ uint8_t sw = (compact ? 1 : 0) + (2 * (empty ? 1 : 0))
850
+ + (4 * (serial_version & 0xF)) + (32 * (preamble_longs & 0x3F));
851
+ bool valid = true;
852
+
853
+ switch (sw) { // exhaustive list and description of all valid cases
854
+ case 38 : break; //!compact, empty, serVer = 1, preLongs = 1; always stored as not compact
855
+ case 164 : break; //!compact, !empty, serVer = 1, preLongs = 5; always stored as not compact
856
+ case 42 : break; //!compact, empty, serVer = 2, preLongs = 1; always stored as compact
857
+ case 72 : break; //!compact, !empty, serVer = 2, preLongs = 2; always stored as compact
858
+ case 47 : break; // compact, empty, serVer = 3, preLongs = 1;
859
+ case 46 : break; //!compact, empty, serVer = 3, preLongs = 1;
860
+ case 79 : break; // compact, empty, serVer = 3, preLongs = 2;
861
+ case 78 : break; //!compact, empty, serVer = 3, preLongs = 2;
862
+ case 77 : break; // compact, !empty, serVer = 3, preLongs = 2;
863
+ case 76 : break; //!compact, !empty, serVer = 3, preLongs = 2;
864
+ default : //all other case values are invalid
865
+ valid = false;
866
+ }
867
+
868
+ if (!valid) {
869
+ std::ostringstream os;
870
+ os << "Possible sketch corruption. Inconsistent state: "
871
+ << "preamble_longs = " << preamble_longs
872
+ << ", empty = " << (empty ? "true" : "false")
873
+ << ", serialization_version = " << serial_version
874
+ << ", compact = " << (compact ? "true" : "false");
875
+ throw std::invalid_argument(os.str());
876
+ }
877
+ }
878
+
879
template <typename T, typename C, typename A>
typename quantiles_sketch<T, C, A>::const_iterator quantiles_sketch<T, C, A>::begin() const {
  // Iterator over all retained items with their weights, starting at the
  // base buffer (or the first occupied level if the base buffer is empty).
  return quantiles_sketch<T, C, A>::const_iterator(base_buffer_, levels_, k_, n_, false);
}
883
+
884
template <typename T, typename C, typename A>
typename quantiles_sketch<T, C, A>::const_iterator quantiles_sketch<T, C, A>::end() const {
  // Past-the-end position (is_end = true selects the terminal state).
  return quantiles_sketch<T, C, A>::const_iterator(base_buffer_, levels_, k_, n_, true);
}
888
+
889
template<typename T, typename C, typename A>
void quantiles_sketch<T, C, A>::grow_base_buffer() {
  // Double the base buffer capacity (starting from 1), capped at the full
  // size of 2k.
  size_t new_size = std::max(std::min(static_cast<size_t>(2 * k_), 2 * base_buffer_.size()), static_cast<size_t>(1));
  base_buffer_.reserve(new_size);
}
894
+
895
+ template<typename T, typename C, typename A>
896
+ void quantiles_sketch<T, C, A>::process_full_base_buffer() {
897
+ // make sure there will be enough levels for the propagation
898
+ grow_levels_if_needed(); // note: n_ was already incremented by update() before this
899
+
900
+ std::sort(base_buffer_.begin(), base_buffer_.end(), C());
901
+ in_place_propagate_carry(0,
902
+ levels_[0], // unused here, but 0 is guaranteed to exist
903
+ base_buffer_,
904
+ true, *this);
905
+ base_buffer_.clear();
906
+ is_sorted_ = true;
907
+ if (n_ / (2 * k_) != bit_pattern_) {
908
+ throw std::logic_error("Internal error: n / 2k (" + std::to_string(n_ / 2 * k_)
909
+ + " != bit_pattern " + std::to_string(bit_pattern_));
910
+ }
911
+ }
912
+
913
+ template<typename T, typename C, typename A>
914
+ bool quantiles_sketch<T, C, A>::grow_levels_if_needed() {
915
+ uint8_t levels_needed = compute_levels_needed(k_, n_);
916
+ if (levels_needed == 0)
917
+ return false; // don't need levels and might have small base buffer. Possible during merges.
918
+
919
+ // from here on, assume full size base buffer (2k) and at least one additional level
920
+ if (levels_needed <= levels_.size())
921
+ return false;
922
+
923
+ Level empty_level(allocator_);
924
+ empty_level.reserve(k_);
925
+ levels_.push_back(std::move(empty_level));
926
+ return true;
927
+ }
928
+
929
template<typename T, typename C, typename A>
template<typename FwdV>
void quantiles_sketch<T, C, A>::in_place_propagate_carry(uint8_t starting_level,
                                                         FwdV&& buf_size_k, Level& buf_size_2k,
                                                         bool apply_as_update,
                                                         quantiles_sketch& sketch) {
  // Propagates one carry (a new block of weight 2^starting_level) up through
  // the levels, merging occupied levels along the way — the structural analog
  // of binary addition with ripple carry. In "update" mode the 2k items of
  // buf_size_2k are zipped directly into the first free level; in "merge"
  // mode the k items of buf_size_k are inserted first.
  const uint64_t bit_pattern = sketch.bit_pattern_;
  const int k = sketch.k_;

  // first unoccupied level at or above starting_level receives the result
  uint8_t ending_level = lowest_zero_bit_starting_at(bit_pattern, starting_level);

  if (apply_as_update) {
    // update version of computation
    // its is okay for buf_size_k to be null in this case
    zip_buffer(buf_size_2k, sketch.levels_[ending_level]);
  } else {
    // merge_into version of computation
    for (uint16_t i = 0; i < k; ++i) {
      sketch.levels_[ending_level].push_back(conditional_forward<FwdV>(buf_size_k[i]));
    }
  }

  // every level between starting_level and ending_level must be occupied;
  // each is merged with the accumulator and re-zipped back down to k items
  for (uint64_t lvl = starting_level; lvl < ending_level; lvl++) {
    if ((bit_pattern & (static_cast<uint64_t>(1) << lvl)) == 0) {
      throw std::logic_error("unexpected empty level in bit_pattern");
    }
    merge_two_size_k_buffers(
        sketch.levels_[lvl],
        sketch.levels_[ending_level],
        buf_size_2k);
    sketch.levels_[lvl].clear();
    sketch.levels_[ending_level].clear();
    zip_buffer(buf_size_2k, sketch.levels_[ending_level]);
  } // end of loop over lower levels

  // update bit pattern with binary-arithmetic ripple carry
  sketch.bit_pattern_ = bit_pattern + (static_cast<uint64_t>(1) << starting_level);
}
967
+
968
template<typename T, typename C, typename A>
void quantiles_sketch<T, C, A>::zip_buffer(Level& buf_in, Level& buf_out) {
  // Compacts a 2k-item buffer into k items by keeping every other item,
  // starting at a random offset (0 or 1); randomizing the offset keeps the
  // sampling unbiased. buf_in is consumed (items moved out, then cleared).
#ifdef QUANTILES_VALIDATION
  // deterministic alternating offset for reproducible validation runs
  static uint32_t next_offset = 0;
  uint32_t rand_offset = next_offset;
  next_offset = 1 - next_offset;
#else
  uint32_t rand_offset = random_bit();
#endif
  if ((buf_in.size() != 2 * buf_out.capacity())
      || (buf_out.size() > 0)) {
    throw std::logic_error("zip_buffer requires buf_in.size() == "
      "2*buf_out.capacity() and empty buf_out");
  }

  size_t k = buf_out.capacity();
  for (uint32_t i = rand_offset, o = 0; o < k; i += 2, ++o) {
    buf_out.push_back(std::move(buf_in[i]));
  }
  buf_in.clear();
}
989
+
990
template<typename T, typename C, typename A>
template<typename FwdV>
void quantiles_sketch<T, C, A>::zip_buffer_with_stride(FwdV&& buf_in, Level& buf_out, uint16_t stride) {
  // Downsamples a (stride * k)-item buffer into k items by keeping every
  // stride-th item from a random starting offset; used by downsampling_merge.
  // Unlike zip_buffer, the input buffer is left intact (src sketch may be
  // const/copied-from).
  // Random offset in range [0, stride)
  std::uniform_int_distribution<uint16_t> dist(0, stride - 1);
  uint16_t rand_offset = dist(random_utils::rand);

  if ((buf_in.size() != stride * buf_out.capacity())
      || (buf_out.size() > 0)) {
    throw std::logic_error("zip_buffer_with_stride requires buf_in.size() == "
      "stride*buf_out.capacity() and empty buf_out");
  }

  size_t k = buf_out.capacity();
  for (uint16_t i = rand_offset, o = 0; o < k; i += stride, ++o) {
    buf_out.push_back(conditional_forward<FwdV>(buf_in[i]));
  }
  // do not clear input buffer
}
1009
+
1010
+
1011
+ template<typename T, typename C, typename A>
1012
+ void quantiles_sketch<T, C, A>::merge_two_size_k_buffers(Level& src_1, Level& src_2, Level& dst) {
1013
+ if (src_1.size() != src_2.size()
1014
+ || src_1.size() * 2 != dst.capacity()
1015
+ || dst.size() != 0) {
1016
+ throw std::logic_error("Input invariants violated in merge_two_size_k_buffers()");
1017
+ }
1018
+
1019
+ auto end1 = src_1.end(), end2 = src_2.end();
1020
+ auto it1 = src_1.begin(), it2 = src_2.begin();
1021
+
1022
+ // TODO: probably actually doing copies given Level&?
1023
+ while (it1 != end1 && it2 != end2) {
1024
+ if (C()(*it1, *it2)) {
1025
+ dst.push_back(std::move(*it1++));
1026
+ } else {
1027
+ dst.push_back(std::move(*it2++));
1028
+ }
1029
+ }
1030
+
1031
+ if (it1 != end1) {
1032
+ dst.insert(dst.end(), it1, end1);
1033
+ } else {
1034
+ if (it2 == end2) { throw std::logic_error("it2 unexpectedly already at end of range"); }
1035
+ dst.insert(dst.end(), it2, end2);
1036
+ }
1037
+ }
1038
+
1039
+
1040
template<typename T, typename C, typename A>
template<typename FwdSk>
void quantiles_sketch<T, C, A>::standard_merge(quantiles_sketch& tgt, FwdSk&& src) {
  // Merges src into tgt when both sketches share the same k: base-buffer
  // items go through the normal update path, and each occupied src level is
  // carried into tgt at the same level via in_place_propagate_carry.
  if (src.get_k() != tgt.get_k()) {
    throw std::invalid_argument("src.get_k() != tgt.get_k()");
  }
  if (src.is_empty()) {
    return;
  }

  uint64_t new_n = src.get_n() + tgt.get_n();

  // move items from src's base buffer
  for (uint16_t i = 0; i < src.base_buffer_.size(); ++i) {
    tgt.update(conditional_forward<FwdSk>(src.base_buffer_[i]));
  }

  // check (after moving raw items) if we need to extend levels array
  uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
  if (levels_needed > tgt.levels_.size()) {
    tgt.levels_.reserve(levels_needed);
    while (tgt.levels_.size() < levels_needed) {
      Level empty_level(tgt.allocator_);
      empty_level.reserve(tgt.get_k());
      tgt.levels_.push_back(std::move(empty_level));
    }
  }

  // shared 2k scratch buffer for the carry propagation
  Level scratch_buf(tgt.allocator_);
  scratch_buf.reserve(2 * tgt.get_k());

  uint64_t src_pattern = src.bit_pattern_;
  for (uint8_t src_lvl = 0; src_pattern != 0; ++src_lvl, src_pattern >>= 1) {
    if ((src_pattern & 1) > 0) {
      scratch_buf.clear();

      // propagate-carry
      in_place_propagate_carry(src_lvl,
                               src.levels_[src_lvl], scratch_buf,
                               false, tgt);
      // update n_ at the end
    }
  }
  tgt.n_ = new_n;
  if ((tgt.get_n() / (2 * tgt.get_k())) != tgt.bit_pattern_) {
    throw std::logic_error("Failed internal consistency check after standard_merge()");
  }

  // update min and max values
  // can't just check is_empty() since min and max might not have been set if
  // there were no base buffer items added via update()
  if (tgt.min_value_ == nullptr) {
    tgt.min_value_ = new (tgt.allocator_.allocate(1)) T(*src.min_value_);
  } else {
    if (C()(*src.min_value_, *tgt.min_value_))
      *tgt.min_value_ = conditional_forward<FwdSk>(*src.min_value_);
  }

  if (tgt.max_value_ == nullptr) {
    tgt.max_value_ = new (tgt.allocator_.allocate(1)) T(*src.max_value_);
  } else {
    if (C()(*tgt.max_value_, *src.max_value_))
      *tgt.max_value_ = conditional_forward<FwdSk>(*src.max_value_);
  }
}
1105
+
1106
+
1107
template<typename T, typename C, typename A>
template<typename FwdSk>
void quantiles_sketch<T, C, A>::downsampling_merge(quantiles_sketch& tgt, FwdSk&& src) {
  // Merges a higher-resolution src (src.k a multiple of tgt.k) into tgt:
  // each occupied src level is first downsampled by the k-ratio (stride
  // zip), then carried into tgt shifted up by log2 of that ratio so total
  // item weight is preserved.
  if (src.get_k() % tgt.get_k() != 0) {
    throw std::invalid_argument("src.get_k() is not a multiple of tgt.get_k()");
  }
  if (src.is_empty()) {
    return;
  }

  const uint16_t downsample_factor = src.get_k() / tgt.get_k();
  const uint8_t lg_sample_factor = count_trailing_zeros_in_u32(downsample_factor);

  uint64_t new_n = src.get_n() + tgt.get_n();

  // move items from src's base buffer
  for (uint16_t i = 0; i < src.base_buffer_.size(); ++i) {
    tgt.update(conditional_forward<FwdSk>(src.base_buffer_[i]));
  }

  // check (after moving raw items) if we need to extend levels array
  uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
  if (levels_needed > tgt.levels_.size()) {
    tgt.levels_.reserve(levels_needed);
    while (tgt.levels_.size() < levels_needed) {
      Level empty_level(tgt.allocator_);
      empty_level.reserve(tgt.get_k());
      tgt.levels_.push_back(std::move(empty_level));
    }
  }

  // down_buf holds one downsampled level (k items); scratch_buf is the 2k
  // working buffer for carry propagation
  Level down_buf(tgt.allocator_);
  down_buf.reserve(tgt.get_k());

  Level scratch_buf(tgt.allocator_);
  scratch_buf.reserve(2 * tgt.get_k());

  uint64_t src_pattern = src.bit_pattern_;
  for (uint8_t src_lvl = 0; src_pattern != 0; ++src_lvl, src_pattern >>= 1) {
    if ((src_pattern & 1) > 0) {
      down_buf.clear();
      scratch_buf.clear();

      // zip with stride, leaving input buffer intact
      zip_buffer_with_stride(src.levels_[src_lvl], down_buf, downsample_factor);

      // propagate-carry
      in_place_propagate_carry(src_lvl + lg_sample_factor,
                               down_buf, scratch_buf,
                               false, tgt);
      // update n_ at the end
    }
  }
  tgt.n_ = new_n;
  if ((tgt.get_n() / (2 * tgt.get_k())) != tgt.bit_pattern_) {
    throw std::logic_error("Failed internal consistency check after downsampling_merge()");
  }

  // update min and max values
  // can't just check is_empty() since min and max might not have been set if
  // there were no base buffer items added via update()
  if (tgt.min_value_ == nullptr) {
    tgt.min_value_ = new (tgt.allocator_.allocate(1)) T(*src.min_value_);
  } else {
    if (C()(*src.min_value_, *tgt.min_value_))
      *tgt.min_value_ = conditional_forward<FwdSk>(*src.min_value_);
  }

  if (tgt.max_value_ == nullptr) {
    tgt.max_value_ = new (tgt.allocator_.allocate(1)) T(*src.max_value_);
  } else {
    if (C()(*tgt.max_value_, *src.max_value_))
      *tgt.max_value_ = conditional_forward<FwdSk>(*src.max_value_);
  }
}
1182
+
1183
+
1184
+ template<typename T, typename C, typename A>
1185
+ uint8_t quantiles_sketch<T, C, A>::lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit) {
1186
+ uint8_t pos = starting_bit & 0X3F;
1187
+ uint64_t my_bits = bits >> pos;
1188
+
1189
+ while ((my_bits & static_cast<uint64_t>(1)) != 0) {
1190
+ my_bits >>= 1;
1191
+ pos++;
1192
+ }
1193
+ return pos;
1194
+ }
1195
+
1196
// Deleter for a single T obtained from allocator A: destroys the object and
// returns its storage. Used for the min/max value holders.
template<typename T, typename C, typename A>
class quantiles_sketch<T, C, A>::item_deleter {
  public:
  item_deleter(const A& allocator): allocator_(allocator) {}
  void operator() (T* ptr) {
    if (ptr != nullptr) {
      ptr->~T();
      allocator_.deallocate(ptr, 1);
    }
  }
  private:
  A allocator_;
};
1209
+
1210
// Deleter for an array of num T's from allocator A. The destroy flag allows
// releasing raw storage whose elements were never constructed (or were moved
// out): set_destroy(false) skips the destructor calls.
template<typename T, typename C, typename A>
class quantiles_sketch<T, C, A>::items_deleter {
  public:
  items_deleter(const A& allocator, bool destroy, size_t num): allocator_(allocator), destroy_(destroy), num_(num) {}
  void operator() (T* ptr) {
    if (ptr != nullptr) {
      if (destroy_) {
        for (size_t i = 0; i < num_; ++i) {
          ptr[i].~T();
        }
      }
      allocator_.deallocate(ptr, num_);
    }
  }
  void set_destroy(bool destroy) { destroy_ = destroy; }
  private:
  A allocator_;
  bool destroy_;
  size_t num_;
};
1230
+
1231
+
1232
+ // quantiles_sketch::const_iterator implementation
1233
+
1234
// Iterates the base buffer first (level_ == -1, weight 1), then each occupied
// level in ascending order with weight doubling per level.
template<typename T, typename C, typename A>
quantiles_sketch<T, C, A>::const_iterator::const_iterator(const Level& base_buffer,
                                                          const std::vector<Level, AllocLevel>& levels,
                                                          uint16_t k,
                                                          uint64_t n,
                                                          bool is_end):
base_buffer_(base_buffer),
levels_(levels),
level_(-1), // -1 denotes "currently in the base buffer"
index_(0),
bb_count_(compute_base_buffer_items(k, n)),
bit_pattern_(compute_bit_pattern(k, n)),
weight_(1),
k_(k)
{
  if (is_end) {
    // if exact mode: index_ = n is end
    // if sampling, level_ = max_level + 1 and index_ = 0 is end
    if (bit_pattern_ == 0) // only a valid check for exact mode in constructor
      index_ = static_cast<uint32_t>(n);
    else
      level_ = static_cast<int>(levels_.size());
  } else { // find first non-empty item
    // base buffer empty but levels occupied: skip ahead to the first
    // occupied level, doubling the weight for each level passed
    if (bb_count_ == 0 && bit_pattern_ > 0) {
      level_ = 0;
      weight_ = 2;
      while ((bit_pattern_ & 0x01) == 0) {
        weight_ *= 2;
        ++level_;
        bit_pattern_ >>= 1;
      }
    }
  }
}
1268
+
1269
template<typename T, typename C, typename A>
typename quantiles_sketch<T, C, A>::const_iterator& quantiles_sketch<T, C, A>::const_iterator::operator++() {
  // Advances within the current buffer/level; once exhausted, consults the
  // bit pattern to skip to the next occupied level (doubling the weight for
  // each level passed) or to reach the end state when the pattern runs out.
  ++index_;

  if ((level_ == -1 && index_ == base_buffer_.size() && levels_.size() > 0) || (level_ >= 0 && index_ == k_)) { // go to the next non-empty level
    index_ = 0;
    do {
      ++level_;
      if (level_ > 0) bit_pattern_ = bit_pattern_ >> 1;
      if (bit_pattern_ == 0) return *this; // no more occupied levels: end state
      weight_ *= 2;
    } while ((bit_pattern_ & static_cast<uint64_t>(1)) == 0);
  }
  return *this;
}
1284
+
1285
template<typename T, typename C, typename A>
typename quantiles_sketch<T, C, A>::const_iterator& quantiles_sketch<T, C, A>::const_iterator::operator++(int) {
  // NOTE(review): BUG — post-increment is declared to return a reference but
  // returns the local copy 'tmp', which dangles as soon as this function
  // returns. The correct signature returns const_iterator by value; fixing
  // it requires changing the in-class declaration as well, which is outside
  // this translation unit's visible scope.
  const_iterator tmp(*this);
  operator++();
  return tmp;
}
1291
+
1292
+ template<typename T, typename C, typename A>
1293
+ bool quantiles_sketch<T, C, A>::const_iterator::operator==(const const_iterator& other) const {
1294
+ return level_ == other.level_ && index_ == other.index_;
1295
+ }
1296
+
1297
+ template<typename T, typename C, typename A>
1298
+ bool quantiles_sketch<T, C, A>::const_iterator::operator!=(const const_iterator& other) const {
1299
+ return !operator==(other);
1300
+ }
1301
+
1302
template<typename T, typename C, typename A>
std::pair<const T&, const uint64_t> quantiles_sketch<T, C, A>::const_iterator::operator*() const {
  // Yields (item, weight): the item comes from the base buffer while
  // level_ == -1, otherwise from levels_[level_]; weight_ is maintained by
  // the constructor and operator++.
  return std::pair<const T&, const uint64_t>(level_ == -1 ? base_buffer_[index_] : levels_[level_][index_], weight_);
}
1306
+
1307
+ } /* namespace datasketches */
1308
+
1309
+ #endif // _QUANTILES_SKETCH_IMPL_HPP_