datasketches 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +1 -1
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +7 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
- metadata +25 -9
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
|
@@ -0,0 +1,1309 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef _QUANTILES_SKETCH_IMPL_HPP_
|
|
21
|
+
#define _QUANTILES_SKETCH_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <cmath>
|
|
24
|
+
#include <algorithm>
|
|
25
|
+
#include <stdexcept>
|
|
26
|
+
#include <iomanip>
|
|
27
|
+
#include <sstream>
|
|
28
|
+
|
|
29
|
+
#include "common_defs.hpp"
|
|
30
|
+
#include "count_zeros.hpp"
|
|
31
|
+
#include "conditional_forward.hpp"
|
|
32
|
+
#include "quantiles_sketch.hpp"
|
|
33
|
+
|
|
34
|
+
namespace datasketches {
|
|
35
|
+
|
|
36
|
+
template<typename T, typename C, typename A>
// Constructs an empty sketch with accuracy/size parameter k and the given allocator.
// k is validated via check_k(); the base buffer is pre-sized to hold 2*min(MIN_K, k)
// items so the first few updates do not trigger an allocation.
// min_value_/max_value_ stay null until the first update; an empty buffer is trivially sorted.
quantiles_sketch<T, C, A>::quantiles_sketch(uint16_t k, const A& allocator):
allocator_(allocator),
k_(k),
n_(0),
bit_pattern_(0),
base_buffer_(allocator_),
levels_(allocator_),
min_value_(nullptr),
max_value_(nullptr),
is_sorted_(true)
{
  check_k(k_); // throws if k is out of the legal range
  base_buffer_.reserve(2 * std::min(quantiles_constants::MIN_K, k));
}
|
|
51
|
+
|
|
52
|
+
template<typename T, typename C, typename A>
// Copy constructor. Deep-copies min/max via allocator + placement new (the member
// init list leaves them null so a throwing T copy ctor does not leak). Also restores
// each level's reserved capacity, which vector's copy ctor does not preserve.
quantiles_sketch<T, C, A>::quantiles_sketch(const quantiles_sketch& other):
allocator_(other.allocator_),
k_(other.k_),
n_(other.n_),
bit_pattern_(other.bit_pattern_),
base_buffer_(other.base_buffer_),
levels_(other.levels_),
min_value_(nullptr),
max_value_(nullptr),
is_sorted_(other.is_sorted_)
{
  if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
  if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
  // vector copies only size, not capacity: re-reserve so future growth behaves like the source
  for (size_t i = 0; i < levels_.size(); ++i) {
    if (levels_[i].capacity() != other.levels_[i].capacity()) {
      levels_[i].reserve(other.levels_[i].capacity());
    }
  }
}
|
|
72
|
+
|
|
73
|
+
template<typename T, typename C, typename A>
// Move constructor. Steals the buffers and the raw min/max pointers, then nulls
// the source's pointers so its destructor does not double-free them.
quantiles_sketch<T, C, A>::quantiles_sketch(quantiles_sketch&& other) noexcept:
allocator_(other.allocator_),
k_(other.k_),
n_(other.n_),
bit_pattern_(other.bit_pattern_),
base_buffer_(std::move(other.base_buffer_)),
levels_(std::move(other.levels_)),
min_value_(other.min_value_),
max_value_(other.max_value_),
is_sorted_(other.is_sorted_)
{
  // transfer ownership of min/max: source must not deallocate them
  other.min_value_ = nullptr;
  other.max_value_ = nullptr;
}
|
|
88
|
+
|
|
89
|
+
template<typename T, typename C, typename A>
|
|
90
|
+
quantiles_sketch<T, C, A>& quantiles_sketch<T, C, A>::operator=(const quantiles_sketch& other) {
|
|
91
|
+
quantiles_sketch<T, C, A> copy(other);
|
|
92
|
+
std::swap(allocator_, copy.allocator_);
|
|
93
|
+
std::swap(k_, copy.k_);
|
|
94
|
+
std::swap(n_, copy.n_);
|
|
95
|
+
std::swap(bit_pattern_, copy.bit_pattern_);
|
|
96
|
+
std::swap(base_buffer_, copy.base_buffer_);
|
|
97
|
+
std::swap(levels_, copy.levels_);
|
|
98
|
+
std::swap(min_value_, copy.min_value_);
|
|
99
|
+
std::swap(max_value_, copy.max_value_);
|
|
100
|
+
std::swap(is_sorted_, copy.is_sorted_);
|
|
101
|
+
return *this;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
template<typename T, typename C, typename A>
|
|
105
|
+
quantiles_sketch<T, C, A>& quantiles_sketch<T, C, A>::operator=(quantiles_sketch&& other) noexcept {
|
|
106
|
+
std::swap(allocator_, other.allocator_);
|
|
107
|
+
std::swap(k_, other.k_);
|
|
108
|
+
std::swap(n_, other.n_);
|
|
109
|
+
std::swap(bit_pattern_, other.bit_pattern_);
|
|
110
|
+
std::swap(base_buffer_, other.base_buffer_);
|
|
111
|
+
std::swap(levels_, other.levels_);
|
|
112
|
+
std::swap(min_value_, other.min_value_);
|
|
113
|
+
std::swap(max_value_, other.max_value_);
|
|
114
|
+
std::swap(is_sorted_, other.is_sorted_);
|
|
115
|
+
return *this;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
template<typename T, typename C, typename A>
// Internal constructor used by deserialize(): adopts pre-built buffers and
// pre-computed state. Ownership of min/max is transferred by releasing the
// unique_ptrs into the raw member pointers.
quantiles_sketch<T, C, A>::quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
    Level&& base_buffer, VectorLevels&& levels,
    std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value,
    bool is_sorted, const A& allocator) :
allocator_(allocator),
k_(k),
n_(n),
bit_pattern_(bit_pattern),
base_buffer_(std::move(base_buffer)),
levels_(std::move(levels)),
min_value_(min_value.release()),
max_value_(max_value.release()),
is_sorted_(is_sorted)
{
  // sanity check: total retained items must agree with the count implied by (k, n)
  uint32_t item_count = base_buffer_.size();
  for (Level& lvl : levels_) {
    item_count += lvl.size();
  }
  if (item_count != compute_retained_items(k_, n_))
    throw std::logic_error("Item count does not match value computed from k, n");
}
|
|
140
|
+
|
|
141
|
+
template<typename T, typename C, typename A>
// Destructor. min/max were created with placement new on allocator storage,
// so they must be destroyed explicitly and their storage returned to the allocator.
quantiles_sketch<T, C, A>::~quantiles_sketch() {
  if (min_value_ != nullptr) {
    min_value_->~T();
    allocator_.deallocate(min_value_, 1);
  }
  if (max_value_ != nullptr) {
    max_value_->~T();
    allocator_.deallocate(max_value_, 1);
  }
}
|
|
152
|
+
|
|
153
|
+
template<typename T, typename C, typename A>
template<typename FwdT>
// Inserts one item (lvalue or rvalue via forwarding reference).
// Maintains min/max, appends to the base buffer (growing it lazily up to 2k),
// and compacts into the levels structure once the base buffer is full.
void quantiles_sketch<T, C, A>::update(FwdT&& item) {
  if (!check_update_value(item)) { return; } // e.g. reject values the sketch cannot accept
  if (is_empty()) {
    // first item: materialize min/max with placement new on allocator storage
    min_value_ = new (allocator_.allocate(1)) T(item);
    max_value_ = new (allocator_.allocate(1)) T(item);
  } else {
    if (C()(item, *min_value_)) *min_value_ = item;
    if (C()(*max_value_, item)) *max_value_ = item;
  }

  // if exceed capacity, grow until size 2k -- assumes eager processing
  if (base_buffer_.size() + 1 > base_buffer_.capacity())
    grow_base_buffer();

  // forward only here, after all reads of item above, so a move happens at most once
  base_buffer_.push_back(std::forward<FwdT>(item));
  ++n_;

  if (base_buffer_.size() > 1)
    is_sorted_ = false;

  if (base_buffer_.size() == 2 * k_)
    process_full_base_buffer(); // base buffer full: propagate items into the levels
}
|
|
178
|
+
|
|
179
|
+
template<typename T, typename C, typename A>
|
|
180
|
+
template<typename FwdSk>
|
|
181
|
+
void quantiles_sketch<T, C, A>::merge(FwdSk&& other) {
|
|
182
|
+
if (other.is_empty()) {
|
|
183
|
+
return; // nothing to do
|
|
184
|
+
} else if (!other.is_estimation_mode()) {
|
|
185
|
+
// other is exact, stream in regardless of k
|
|
186
|
+
for (auto item : other.base_buffer_) {
|
|
187
|
+
update(conditional_forward<FwdSk>(item));
|
|
188
|
+
}
|
|
189
|
+
return; // we're done
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
// we know other has data and is in estimation mode
|
|
193
|
+
if (is_estimation_mode()) {
|
|
194
|
+
if (k_ == other.get_k()) {
|
|
195
|
+
standard_merge(*this, other);
|
|
196
|
+
} else if (k_ > other.get_k()) {
|
|
197
|
+
quantiles_sketch sk_copy(other);
|
|
198
|
+
downsampling_merge(sk_copy, *this);
|
|
199
|
+
*this = sk_copy;
|
|
200
|
+
} else { // k_ < other.get_k()
|
|
201
|
+
downsampling_merge(*this, other);
|
|
202
|
+
}
|
|
203
|
+
} else {
|
|
204
|
+
// exact or empty
|
|
205
|
+
quantiles_sketch sk_copy(other);
|
|
206
|
+
if (k_ <= other.get_k()) {
|
|
207
|
+
if (!is_empty()) {
|
|
208
|
+
for (uint16_t i = 0; i < base_buffer_.size(); ++i) {
|
|
209
|
+
sk_copy.update(std::move(base_buffer_[i]));
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
} else { // k_ > other.get_k()
|
|
213
|
+
downsampling_merge(sk_copy, *this);
|
|
214
|
+
}
|
|
215
|
+
*this = sk_copy;
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
template<typename T, typename C, typename A>
|
|
220
|
+
template<typename SerDe>
|
|
221
|
+
void quantiles_sketch<T, C, A>::serialize(std::ostream& os, const SerDe& serde) const {
|
|
222
|
+
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
|
|
223
|
+
write(os, preamble_longs);
|
|
224
|
+
const uint8_t ser_ver = SERIAL_VERSION;
|
|
225
|
+
write(os, ser_ver);
|
|
226
|
+
const uint8_t family = FAMILY;
|
|
227
|
+
write(os, family);
|
|
228
|
+
|
|
229
|
+
// side-effect: sort base buffer since always compact
|
|
230
|
+
// can't set is_sorted_ since const method
|
|
231
|
+
std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
|
|
232
|
+
|
|
233
|
+
// empty, ordered, compact are valid flags
|
|
234
|
+
const uint8_t flags_byte(
|
|
235
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
236
|
+
| (1 << flags::IS_SORTED) // always sorted as side effect noted above
|
|
237
|
+
| (1 << flags::IS_COMPACT) // always compact -- could be optional for numeric types?
|
|
238
|
+
);
|
|
239
|
+
write(os, flags_byte);
|
|
240
|
+
write(os, k_);
|
|
241
|
+
uint16_t unused = 0;
|
|
242
|
+
write(os, unused);
|
|
243
|
+
|
|
244
|
+
if (!is_empty()) {
|
|
245
|
+
write(os, n_);
|
|
246
|
+
|
|
247
|
+
// min and max
|
|
248
|
+
serde.serialize(os, min_value_, 1);
|
|
249
|
+
serde.serialize(os, max_value_, 1);
|
|
250
|
+
|
|
251
|
+
// base buffer items
|
|
252
|
+
serde.serialize(os, base_buffer_.data(), static_cast<unsigned>(base_buffer_.size()));
|
|
253
|
+
|
|
254
|
+
// levels, only when data is present
|
|
255
|
+
for (Level lvl : levels_) {
|
|
256
|
+
if (lvl.size() > 0)
|
|
257
|
+
serde.serialize(os, lvl.data(), static_cast<unsigned>(lvl.size()));
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
template<typename T, typename C, typename A>
|
|
263
|
+
template<typename SerDe>
|
|
264
|
+
auto quantiles_sketch<T, C, A>::serialize(unsigned header_size_bytes, const SerDe& serde) const -> vector_bytes {
|
|
265
|
+
const size_t size = get_serialized_size_bytes(serde) + header_size_bytes;
|
|
266
|
+
vector_bytes bytes(size, 0, allocator_);
|
|
267
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
268
|
+
const uint8_t* end_ptr = ptr + size;
|
|
269
|
+
|
|
270
|
+
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
|
|
271
|
+
ptr += copy_to_mem(preamble_longs, ptr);
|
|
272
|
+
const uint8_t ser_ver = SERIAL_VERSION;
|
|
273
|
+
ptr += copy_to_mem(ser_ver, ptr);
|
|
274
|
+
const uint8_t family = FAMILY;
|
|
275
|
+
ptr += copy_to_mem(family, ptr);
|
|
276
|
+
|
|
277
|
+
// side-effect: sort base buffer since always compact
|
|
278
|
+
// can't set is_sorted_ since const method
|
|
279
|
+
std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
|
|
280
|
+
|
|
281
|
+
// empty, ordered, compact are valid flags
|
|
282
|
+
const uint8_t flags_byte(
|
|
283
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
284
|
+
| (1 << flags::IS_SORTED) // always sorted as side effect noted above
|
|
285
|
+
| (1 << flags::IS_COMPACT) // always compact
|
|
286
|
+
);
|
|
287
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
288
|
+
ptr += copy_to_mem(k_, ptr);
|
|
289
|
+
ptr += sizeof(uint16_t); // 2 unused bytes
|
|
290
|
+
|
|
291
|
+
if (!is_empty()) {
|
|
292
|
+
|
|
293
|
+
ptr += copy_to_mem(n_, ptr);
|
|
294
|
+
|
|
295
|
+
// min and max
|
|
296
|
+
ptr += serde.serialize(ptr, end_ptr - ptr, min_value_, 1);
|
|
297
|
+
ptr += serde.serialize(ptr, end_ptr - ptr, max_value_, 1);
|
|
298
|
+
|
|
299
|
+
// base buffer items
|
|
300
|
+
if (base_buffer_.size() > 0)
|
|
301
|
+
ptr += serde.serialize(ptr, end_ptr - ptr, base_buffer_.data(), static_cast<unsigned>(base_buffer_.size()));
|
|
302
|
+
|
|
303
|
+
// levels, only when data is present
|
|
304
|
+
for (Level lvl : levels_) {
|
|
305
|
+
if (lvl.size() > 0)
|
|
306
|
+
ptr += serde.serialize(ptr, end_ptr - ptr, lvl.data(), static_cast<unsigned>(lvl.size()));
|
|
307
|
+
}
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
return bytes;
|
|
311
|
+
}
|
|
312
|
+
|
|
313
|
+
template<typename T, typename C, typename A>
template<typename SerDe>
// Reconstructs a sketch from a stream. Validates the header, then reads n,
// min/max, the base buffer, and each level indicated by the bit pattern.
// Handles both compact and full (Java-style) layouts, and serial versions 1-3.
auto quantiles_sketch<T, C, A>::deserialize(std::istream &is, const SerDe& serde, const A &allocator) -> quantiles_sketch {
  const auto preamble_longs = read<uint8_t>(is);
  const auto serial_version = read<uint8_t>(is);
  const auto family_id = read<uint8_t>(is);
  const auto flags_byte = read<uint8_t>(is);
  const auto k = read<uint16_t>(is);
  read<uint16_t>(is); // unused

  check_k(k);
  check_serial_version(serial_version); // a little redundant with the header check
  check_family_id(family_id);
  check_header_validity(preamble_longs, flags_byte, serial_version);

  if (!is.good()) throw std::runtime_error("error reading from std::istream");
  const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
  if (is_empty) {
    return quantiles_sketch(k, allocator);
  }

  const auto items_seen = read<uint64_t>(is);

  // serial version 2 predates the compact flag but was always compact
  const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
  const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;

  A alloc(allocator);
  // two-phase ownership for min/max: first a deleter that only frees raw storage,
  // upgraded to one that also destroys the object once deserialization succeeds
  auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
  std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
  std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
  std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
  std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));

  serde.deserialize(is, min_value_buffer.get(), 1);
  // serde call did not throw, repackage with destructor
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
  serde.deserialize(is, max_value_buffer.get(), 1);
  // serde call did not throw, repackage with destructor
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));

  if (serial_version == 1) {
    read<uint64_t>(is); // no longer used
  }

  // allocate buffers as needed
  const uint8_t levels_needed = compute_levels_needed(k, items_seen);
  const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);

  // Java provides a compact storage layout for a sketch of primitive doubles. The C++ version
  // does not currently operate sketches in compact mode, but will only serialize as compact
  // to avoid complications around serialization of empty values for generic type T. We also need
  // to be able to ingest either serialized format from Java.

  // load base buffer
  const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
  uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
  Level base_buffer = deserialize_array(is, bb_items, 2 * k, serde, allocator);
  if (items_to_read > bb_items) { // either equal or greater, never read fewer items
    // read remaining items, but don't store them (full layout pads the base buffer to 2k)
    deserialize_array(is, items_to_read - bb_items, items_to_read - bb_items, serde, allocator);
  }

  // populate vector of Levels directly; each set bit in the pattern marks a populated level
  VectorLevels levels(allocator);
  levels.reserve(levels_needed);
  if (levels_needed > 0) {
    uint64_t working_pattern = bit_pattern;
    for (size_t i = 0; i < levels_needed; ++i, working_pattern >>= 1) {
      if ((working_pattern & 0x01) == 1) {
        Level level = deserialize_array(is, k, k, serde, allocator);
        levels.push_back(std::move(level));
      } else {
        // empty level placeholder, capacity reserved for future use
        Level level(allocator);
        level.reserve(k);
        levels.push_back(std::move(level));
      }
    }
  }

  return quantiles_sketch(k, items_seen, bit_pattern,
      std::move(base_buffer), std::move(levels), std::move(min_value), std::move(max_value), is_sorted, allocator);
}
|
|
395
|
+
|
|
396
|
+
template<typename T, typename C, typename A>
template<typename SerDe>
// Reads num_items items from the stream into a Level reserved to `capacity`.
// Uses a two-phase deleter: until serde succeeds, only raw storage is freed;
// afterwards item destructors are enabled as well.
auto quantiles_sketch<T, C, A>::deserialize_array(std::istream& is, uint32_t num_items, uint32_t capacity, const SerDe& serde, const A& allocator) -> Level {
  A alloc(allocator);
  std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(allocator, false, num_items));
  serde.deserialize(is, items.get(), num_items);
  // serde did not throw, enable destructors
  items.get_deleter().set_destroy(true);
  if (!is.good()) throw std::runtime_error("error reading from std::istream");

  // successfully read, now put into a Level
  Level level(allocator);
  level.reserve(capacity);
  level.insert(level.begin(),
      std::make_move_iterator(items.get()),
      std::make_move_iterator(items.get() + num_items));
  return level;
}
|
|
414
|
+
|
|
415
|
+
template<typename T, typename C, typename A>
template<typename SerDe>
// Reconstructs a sketch from a raw byte buffer. Mirrors the stream overload:
// header validation, then n, min/max, base buffer, and bit-pattern-driven levels.
// Memory bounds are enforced via ensure_minimum_memory and end_ptr capacity checks.
auto quantiles_sketch<T, C, A>::deserialize(const void* bytes, size_t size, const SerDe& serde, const A &allocator) -> quantiles_sketch {
  ensure_minimum_memory(size, 8); // fixed 8-byte preamble
  const char* ptr = static_cast<const char*>(bytes);
  const char* end_ptr = static_cast<const char*>(bytes) + size;

  uint8_t preamble_longs;
  ptr += copy_from_mem(ptr, preamble_longs);
  uint8_t serial_version;
  ptr += copy_from_mem(ptr, serial_version);
  uint8_t family_id;
  ptr += copy_from_mem(ptr, family_id);
  uint8_t flags_byte;
  ptr += copy_from_mem(ptr, flags_byte);
  uint16_t k;
  ptr += copy_from_mem(ptr, k);
  uint16_t unused;
  ptr += copy_from_mem(ptr, unused);

  check_k(k);
  check_serial_version(serial_version); // a little redundant with the header check
  check_family_id(family_id);
  check_header_validity(preamble_longs, flags_byte, serial_version);

  const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
  if (is_empty) {
    return quantiles_sketch(k, allocator);
  }

  ensure_minimum_memory(size, 16); // non-empty sketch also carries the 8-byte n
  uint64_t items_seen;
  ptr += copy_from_mem(ptr, items_seen);

  // serial version 2 predates the compact flag but was always compact
  const bool is_compact = (serial_version == 2) | ((flags_byte & (1 << flags::IS_COMPACT)) > 0);
  const bool is_sorted = (flags_byte & (1 << flags::IS_SORTED)) > 0;

  A alloc(allocator);
  // two-phase ownership for min/max: first a deleter that only frees raw storage,
  // upgraded to one that also destroys the object once deserialization succeeds
  auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
  std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
  std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
  std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
  std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));

  ptr += serde.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
  // serde call did not throw, repackage with destructor
  min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
  ptr += serde.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
  // serde call did not throw, repackage with destructor
  max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));

  if (serial_version == 1) {
    uint64_t unused_long;
    ptr += copy_from_mem(ptr, unused_long); // no longer used
  }

  // allocate buffers as needed
  const uint8_t levels_needed = compute_levels_needed(k, items_seen);
  const uint64_t bit_pattern = compute_bit_pattern(k, items_seen);

  // Java provides a compact storage layout for a sketch of primitive doubles. The C++ version
  // does not currently operate sketches in compact mode, but will only serialize as compact
  // to avoid complications around serialization of empty values for generic type T. We also need
  // to be able to ingest either serialized format from Java.

  // load base buffer
  const uint32_t bb_items = compute_base_buffer_items(k, items_seen);
  uint32_t items_to_read = (levels_needed == 0 || is_compact) ? bb_items : 2 * k;
  auto base_buffer_pair = deserialize_array(ptr, end_ptr - ptr, bb_items, 2 * k, serde, allocator);
  ptr += base_buffer_pair.second;
  if (items_to_read > bb_items) { // either equal or greater, never read fewer items
    // read remaining items, only use to advance the pointer (full layout pads to 2k)
    auto extras = deserialize_array(ptr, end_ptr - ptr, items_to_read - bb_items, items_to_read - bb_items, serde, allocator);
    ptr += extras.second;
  }

  // populate vector of Levels directly; each set bit in the pattern marks a populated level
  VectorLevels levels(allocator);
  levels.reserve(levels_needed);
  if (levels_needed > 0) {
    uint64_t working_pattern = bit_pattern;
    for (size_t i = 0; i < levels_needed; ++i, working_pattern >>= 1) {

      if ((working_pattern & 0x01) == 1) {
        auto pair = deserialize_array(ptr, end_ptr - ptr, k, k, serde, allocator);
        ptr += pair.second;
        levels.push_back(std::move(pair.first));
      } else {
        // empty level placeholder, capacity reserved for future use
        Level level(allocator);
        level.reserve(k);
        levels.push_back(std::move(level));
      }
    }
  }

  return quantiles_sketch(k, items_seen, bit_pattern,
      std::move(base_buffer_pair.first), std::move(levels), std::move(min_value), std::move(max_value), is_sorted, allocator);
}
|
|
513
|
+
|
|
514
|
+
template<typename T, typename C, typename A>
template<typename SerDe>
// Reads num_items items from a byte buffer into a Level reserved to `capacity`.
// Returns the Level together with the number of bytes consumed so the caller
// can advance its pointer. Uses the same two-phase deleter as the stream overload.
auto quantiles_sketch<T, C, A>::deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capacity, const SerDe& serde, const A& allocator)
-> std::pair<Level, size_t> {
  const char* ptr = static_cast<const char*>(bytes);
  const char* end_ptr = static_cast<const char*>(bytes) + size;
  A alloc(allocator);
  std::unique_ptr<T, items_deleter> items(alloc.allocate(num_items), items_deleter(allocator, false, num_items));
  ptr += serde.deserialize(ptr, end_ptr - ptr, items.get(), num_items);
  // serde did not throw, enable destructors
  items.get_deleter().set_destroy(true);

  // successfully read, now put into a Level
  Level level(allocator);
  level.reserve(capacity);
  level.insert(level.begin(),
      std::make_move_iterator(items.get()),
      std::make_move_iterator(items.get() + num_items));

  return std::pair<Level, size_t>(std::move(level), ptr - static_cast<const char*>(bytes));
}
|
|
535
|
+
|
|
536
|
+
template<typename T, typename C, typename A>
|
|
537
|
+
string<A> quantiles_sketch<T, C, A>::to_string(bool print_levels, bool print_items) const {
|
|
538
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
|
539
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
|
540
|
+
std::ostringstream os;
|
|
541
|
+
os << "### Quantiles Sketch summary:" << std::endl;
|
|
542
|
+
os << " K : " << k_ << std::endl;
|
|
543
|
+
os << " N : " << n_ << std::endl;
|
|
544
|
+
os << " Epsilon : " << std::setprecision(3) << get_normalized_rank_error(false) * 100 << "%" << std::endl;
|
|
545
|
+
os << " Epsilon PMF : " << get_normalized_rank_error(true) * 100 << "%" << std::endl;
|
|
546
|
+
os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
|
|
547
|
+
os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
|
|
548
|
+
os << " Levels (w/o BB): " << levels_.size() << std::endl;
|
|
549
|
+
os << " Used Levels : " << compute_valid_levels(bit_pattern_) << std::endl;
|
|
550
|
+
os << " Retained items : " << get_num_retained() << std::endl;
|
|
551
|
+
if (!is_empty()) {
|
|
552
|
+
os << " Min value : " << *min_value_ << std::endl;
|
|
553
|
+
os << " Max value : " << *max_value_ << std::endl;
|
|
554
|
+
}
|
|
555
|
+
os << "### End sketch summary" << std::endl;
|
|
556
|
+
|
|
557
|
+
if (print_levels) {
|
|
558
|
+
os << "### Quantiles Sketch levels:" << std::endl;
|
|
559
|
+
os << " index: items in use" << std::endl;
|
|
560
|
+
os << " BB: " << base_buffer_.size() << std::endl;
|
|
561
|
+
for (uint8_t i = 0; i < levels_.size(); i++) {
|
|
562
|
+
os << " " << static_cast<unsigned int>(i) << ": " << levels_[i].size() << std::endl;
|
|
563
|
+
}
|
|
564
|
+
os << "### End sketch levels" << std::endl;
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
if (print_items) {
|
|
568
|
+
os << "### Quantiles Sketch data:" << std::endl;
|
|
569
|
+
uint8_t level = 0;
|
|
570
|
+
os << " BB:" << std::endl;
|
|
571
|
+
for (const T& item : base_buffer_) {
|
|
572
|
+
os << " " << std::to_string(item) << std::endl;
|
|
573
|
+
}
|
|
574
|
+
for (uint8_t i = 0; i < levels_.size(); ++i) {
|
|
575
|
+
os << " level " << static_cast<unsigned int>(level) << ":" << std::endl;
|
|
576
|
+
for (const T& item : levels_[i]) {
|
|
577
|
+
os << " " << std::to_string(item) << std::endl;
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
os << "### End sketch data" << std::endl;
|
|
581
|
+
}
|
|
582
|
+
return string<A>(os.str().c_str(), allocator_);
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
template<typename T, typename C, typename A>
|
|
586
|
+
uint16_t quantiles_sketch<T, C, A>::get_k() const {
|
|
587
|
+
return k_;
|
|
588
|
+
}
|
|
589
|
+
|
|
590
|
+
template<typename T, typename C, typename A>
|
|
591
|
+
uint64_t quantiles_sketch<T, C, A>::get_n() const {
|
|
592
|
+
return n_;
|
|
593
|
+
}
|
|
594
|
+
|
|
595
|
+
template<typename T, typename C, typename A>
|
|
596
|
+
bool quantiles_sketch<T, C, A>::is_empty() const {
|
|
597
|
+
return n_ == 0;
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
template<typename T, typename C, typename A>
|
|
601
|
+
bool quantiles_sketch<T, C, A>::is_estimation_mode() const {
|
|
602
|
+
return bit_pattern_ != 0;
|
|
603
|
+
}
|
|
604
|
+
|
|
605
|
+
template<typename T, typename C, typename A>
|
|
606
|
+
uint32_t quantiles_sketch<T, C, A>::get_num_retained() const {
|
|
607
|
+
return compute_retained_items(k_, n_);
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
template<typename T, typename C, typename A>
|
|
611
|
+
const T& quantiles_sketch<T, C, A>::get_min_value() const {
|
|
612
|
+
if (is_empty()) return get_invalid_value();
|
|
613
|
+
return *min_value_;
|
|
614
|
+
}
|
|
615
|
+
|
|
616
|
+
template<typename T, typename C, typename A>
|
|
617
|
+
const T& quantiles_sketch<T, C, A>::get_max_value() const {
|
|
618
|
+
if (is_empty()) return get_invalid_value();
|
|
619
|
+
return *max_value_;
|
|
620
|
+
}
|
|
621
|
+
|
|
622
|
+
template<typename T, typename C, typename A>
|
|
623
|
+
C quantiles_sketch<T, C, A>::get_comparator() const {
|
|
624
|
+
return C();
|
|
625
|
+
}
|
|
626
|
+
|
|
627
|
+
// implementation for fixed-size arithmetic types (integral and floating point)
|
|
628
|
+
template<typename T, typename C, typename A>
|
|
629
|
+
template<typename SerDe, typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
|
630
|
+
size_t quantiles_sketch<T, C, A>::get_serialized_size_bytes(const SerDe&) const {
|
|
631
|
+
if (is_empty()) { return EMPTY_SIZE_BYTES; }
|
|
632
|
+
return DATA_START + ((get_num_retained() + 2) * sizeof(TT));
|
|
633
|
+
}
|
|
634
|
+
|
|
635
|
+
// implementation for all other types
|
|
636
|
+
template<typename T, typename C, typename A>
|
|
637
|
+
template<typename SerDe, typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
|
638
|
+
size_t quantiles_sketch<T, C, A>::get_serialized_size_bytes(const SerDe& serde) const {
|
|
639
|
+
if (is_empty()) { return EMPTY_SIZE_BYTES; }
|
|
640
|
+
size_t size = DATA_START;
|
|
641
|
+
size += serde.size_of_item(*min_value_);
|
|
642
|
+
size += serde.size_of_item(*max_value_);
|
|
643
|
+
for (auto it: *this) size += serde.size_of_item(it.first);
|
|
644
|
+
return size;
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
template<typename T, typename C, typename A>
|
|
648
|
+
double quantiles_sketch<T, C, A>::get_normalized_rank_error(bool is_pmf) const {
|
|
649
|
+
return get_normalized_rank_error(k_, is_pmf);
|
|
650
|
+
}
|
|
651
|
+
|
|
652
|
+
template<typename T, typename C, typename A>
|
|
653
|
+
double quantiles_sketch<T, C, A>::get_normalized_rank_error(uint16_t k, bool is_pmf) {
|
|
654
|
+
return is_pmf
|
|
655
|
+
? 1.854 / std::pow(k, 0.9657)
|
|
656
|
+
: 1.576 / std::pow(k, 0.9726);
|
|
657
|
+
}
|
|
658
|
+
|
|
659
|
+
template<typename T, typename C, typename A>
template<bool inclusive>
quantile_sketch_sorted_view<T, C, A> quantiles_sketch<T, C, A>::get_sorted_view(bool cumulative) const {
  // Builds a weighted, sorted view of all retained items: base-buffer items at
  // weight 1, level i items at weight 2^(i+1).
  // allow side-effect of sorting the base buffer; can't set the flag since
  // this is a const method
  if (!is_sorted_) {
    std::sort(const_cast<Level&>(base_buffer_).begin(), const_cast<Level&>(base_buffer_).end(), C());
  }
  quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);

  uint64_t weight = 1;
  view.add(base_buffer_.begin(), base_buffer_.end(), weight);
  for (auto& level : levels_) {
    // weight doubles each level; empty levels are skipped after doubling so
    // the next occupied level still gets the correct weight
    weight <<= 1;
    if (level.empty()) { continue; }
    view.add(level.begin(), level.end(), weight);
  }

  if (cumulative) view.template convert_to_cummulative<inclusive>();
  return view;
}
|
|
680
|
+
|
|
681
|
+
template<typename T, typename C, typename A>
|
|
682
|
+
template<bool inclusive>
|
|
683
|
+
auto quantiles_sketch<T, C, A>::get_quantile(double rank) const -> quantile_return_type {
|
|
684
|
+
if (is_empty()) return get_invalid_value();
|
|
685
|
+
if (rank == 0.0) return *min_value_;
|
|
686
|
+
if (rank == 1.0) return *max_value_;
|
|
687
|
+
if ((rank < 0.0) || (rank > 1.0)) {
|
|
688
|
+
throw std::invalid_argument("Rank cannot be less than zero or greater than 1.0");
|
|
689
|
+
}
|
|
690
|
+
// possible side-effect: sorting base buffer
|
|
691
|
+
return get_sorted_view<inclusive>(true).get_quantile(rank);
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
template<typename T, typename C, typename A>
|
|
695
|
+
template<bool inclusive>
|
|
696
|
+
std::vector<T, A> quantiles_sketch<T, C, A>::get_quantiles(const double* ranks, uint32_t size) const {
|
|
697
|
+
std::vector<T, A> quantiles(allocator_);
|
|
698
|
+
if (is_empty()) return quantiles;
|
|
699
|
+
quantiles.reserve(size);
|
|
700
|
+
|
|
701
|
+
// possible side-effect: sorting base buffer
|
|
702
|
+
auto view = get_sorted_view<inclusive>(true);
|
|
703
|
+
|
|
704
|
+
for (uint32_t i = 0; i < size; ++i) {
|
|
705
|
+
const double rank = ranks[i];
|
|
706
|
+
if ((rank < 0.0) || (rank > 1.0)) {
|
|
707
|
+
throw std::invalid_argument("rank cannot be less than zero or greater than 1.0");
|
|
708
|
+
}
|
|
709
|
+
if (rank == 0.0) quantiles.push_back(*min_value_);
|
|
710
|
+
else if (rank == 1.0) quantiles.push_back(*max_value_);
|
|
711
|
+
else {
|
|
712
|
+
quantiles.push_back(view.get_quantile(rank));
|
|
713
|
+
}
|
|
714
|
+
}
|
|
715
|
+
return quantiles;
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
template<typename T, typename C, typename A>
|
|
719
|
+
template<bool inclusive>
|
|
720
|
+
std::vector<T, A> quantiles_sketch<T, C, A>::get_quantiles(uint32_t num) const {
|
|
721
|
+
if (is_empty()) return std::vector<T, A>(allocator_);
|
|
722
|
+
if (num == 0) {
|
|
723
|
+
throw std::invalid_argument("num must be > 0");
|
|
724
|
+
}
|
|
725
|
+
vector_double fractions(num, 0, allocator_);
|
|
726
|
+
fractions[0] = 0.0;
|
|
727
|
+
for (size_t i = 1; i < num; i++) {
|
|
728
|
+
fractions[i] = static_cast<double>(i) / (num - 1);
|
|
729
|
+
}
|
|
730
|
+
if (num > 1) {
|
|
731
|
+
fractions[num - 1] = 1.0;
|
|
732
|
+
}
|
|
733
|
+
return get_quantiles<inclusive>(fractions.data(), num);
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
template<typename T, typename C, typename A>
template<bool inclusive>
double quantiles_sketch<T, C, A>::get_rank(const T& value) const {
  // Returns the normalized rank of `value`: the weighted fraction of retained
  // items below it (exclusive) or at-or-below it (inclusive), as a proxy for
  // the fraction of all n_ input items.
  if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
  uint64_t weight = 1;
  uint64_t total = 0;
  // Base-buffer items each represent one input item (weight 1). The buffer may
  // be unsorted, so every item is compared.
  for (const T &item: base_buffer_) {
    if (inclusive ? !C()(value, item) : C()(item, value))
      total += weight;
  }

  // Level i items each represent 2^(i+1) input items.
  weight *= 2;
  for (uint8_t level = 0; level < levels_.size(); ++level, weight *= 2) {
    if (levels_[level].empty()) { continue; }
    // Occupied levels always hold exactly k_ items.
    const T* data = levels_[level].data();
    for (uint16_t i = 0; i < k_; ++i) {
      if (inclusive ? !C()(value, data[i]) : C()(data[i], value))
        total += weight;
      else
        break; // levels are sorted, no point comparing further
    }
  }
  return (double) total / n_;
}
|
|
760
|
+
|
|
761
|
+
template<typename T, typename C, typename A>
|
|
762
|
+
template<bool inclusive>
|
|
763
|
+
auto quantiles_sketch<T, C, A>::get_PMF(const T* split_points, uint32_t size) const -> vector_double {
|
|
764
|
+
auto buckets = get_CDF<inclusive>(split_points, size);
|
|
765
|
+
if (is_empty()) return buckets;
|
|
766
|
+
for (uint32_t i = size; i > 0; --i) {
|
|
767
|
+
buckets[i] -= buckets[i - 1];
|
|
768
|
+
}
|
|
769
|
+
return buckets;
|
|
770
|
+
}
|
|
771
|
+
|
|
772
|
+
template<typename T, typename C, typename A>
|
|
773
|
+
template<bool inclusive>
|
|
774
|
+
auto quantiles_sketch<T, C, A>::get_CDF(const T* split_points, uint32_t size) const -> vector_double {
|
|
775
|
+
vector_double buckets(allocator_);
|
|
776
|
+
if (is_empty()) return buckets;
|
|
777
|
+
check_split_points(split_points, size);
|
|
778
|
+
buckets.reserve(size + 1);
|
|
779
|
+
for (uint32_t i = 0; i < size; ++i) buckets.push_back(get_rank<inclusive>(split_points[i]));
|
|
780
|
+
buckets.push_back(1);
|
|
781
|
+
return buckets;
|
|
782
|
+
}
|
|
783
|
+
|
|
784
|
+
template<typename T, typename C, typename A>
|
|
785
|
+
uint32_t quantiles_sketch<T, C, A>::compute_retained_items(const uint16_t k, const uint64_t n) {
|
|
786
|
+
uint32_t bb_count = compute_base_buffer_items(k, n);
|
|
787
|
+
uint64_t bit_pattern = compute_bit_pattern(k, n);
|
|
788
|
+
uint32_t valid_levels = compute_valid_levels(bit_pattern);
|
|
789
|
+
return bb_count + (k * valid_levels);
|
|
790
|
+
}
|
|
791
|
+
|
|
792
|
+
template<typename T, typename C, typename A>
|
|
793
|
+
uint32_t quantiles_sketch<T, C, A>::compute_base_buffer_items(const uint16_t k, const uint64_t n) {
|
|
794
|
+
return n % (static_cast<uint64_t>(2) * k);
|
|
795
|
+
}
|
|
796
|
+
|
|
797
|
+
template<typename T, typename C, typename A>
|
|
798
|
+
uint64_t quantiles_sketch<T, C, A>::compute_bit_pattern(const uint16_t k, const uint64_t n) {
|
|
799
|
+
return n / (static_cast<uint64_t>(2) * k);
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
template<typename T, typename C, typename A>
|
|
803
|
+
uint32_t quantiles_sketch<T, C, A>::compute_valid_levels(const uint64_t bit_pattern) {
|
|
804
|
+
// TODO: Java's Long.bitCount() probably uses a better method
|
|
805
|
+
uint64_t bp = bit_pattern;
|
|
806
|
+
uint32_t count = 0;
|
|
807
|
+
while (bp > 0) {
|
|
808
|
+
if ((bp & 0x01) == 1) ++count;
|
|
809
|
+
bp >>= 1;
|
|
810
|
+
}
|
|
811
|
+
return count;
|
|
812
|
+
}
|
|
813
|
+
|
|
814
|
+
template<typename T, typename C, typename A>
|
|
815
|
+
uint8_t quantiles_sketch<T, C, A>::compute_levels_needed(const uint16_t k, const uint64_t n) {
|
|
816
|
+
return static_cast<uint8_t>(64U) - count_leading_zeros_in_u64(n / (2 * k));
|
|
817
|
+
}
|
|
818
|
+
|
|
819
|
+
template<typename T, typename C, typename A>
|
|
820
|
+
void quantiles_sketch<T, C, A>::check_k(uint16_t k) {
|
|
821
|
+
if (k < quantiles_constants::MIN_K || k > quantiles_constants::MAX_K || (k & (k - 1)) != 0) {
|
|
822
|
+
throw std::invalid_argument("k must be a power of 2 that is >= "
|
|
823
|
+
+ std::to_string(quantiles_constants::MIN_K) + " and <= "
|
|
824
|
+
+ std::to_string(quantiles_constants::MAX_K) + ". Found: " + std::to_string(k));
|
|
825
|
+
}
|
|
826
|
+
}
|
|
827
|
+
|
|
828
|
+
template<typename T, typename C, typename A>
|
|
829
|
+
void quantiles_sketch<T, C, A>::check_serial_version(uint8_t serial_version) {
|
|
830
|
+
if (serial_version == SERIAL_VERSION || serial_version == SERIAL_VERSION_1 || serial_version == SERIAL_VERSION_2)
|
|
831
|
+
return;
|
|
832
|
+
else
|
|
833
|
+
throw std::invalid_argument("Possible corruption. Unrecognized serialization version: " + std::to_string(serial_version));
|
|
834
|
+
}
|
|
835
|
+
|
|
836
|
+
template<typename T, typename C, typename A>
|
|
837
|
+
void quantiles_sketch<T, C, A>::check_family_id(uint8_t family_id) {
|
|
838
|
+
if (family_id == FAMILY)
|
|
839
|
+
return;
|
|
840
|
+
else
|
|
841
|
+
throw std::invalid_argument("Possible corruption. Family id does not indicate quantiles sketch: " + std::to_string(family_id));
|
|
842
|
+
}
|
|
843
|
+
|
|
844
|
+
template<typename T, typename C, typename A>
|
|
845
|
+
void quantiles_sketch<T, C, A>::check_header_validity(uint8_t preamble_longs, uint8_t flags_byte, uint8_t serial_version) {
|
|
846
|
+
bool empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
|
|
847
|
+
bool compact = (flags_byte & (1 << flags::IS_COMPACT)) > 0;
|
|
848
|
+
|
|
849
|
+
uint8_t sw = (compact ? 1 : 0) + (2 * (empty ? 1 : 0))
|
|
850
|
+
+ (4 * (serial_version & 0xF)) + (32 * (preamble_longs & 0x3F));
|
|
851
|
+
bool valid = true;
|
|
852
|
+
|
|
853
|
+
switch (sw) { // exhaustive list and description of all valid cases
|
|
854
|
+
case 38 : break; //!compact, empty, serVer = 1, preLongs = 1; always stored as not compact
|
|
855
|
+
case 164 : break; //!compact, !empty, serVer = 1, preLongs = 5; always stored as not compact
|
|
856
|
+
case 42 : break; //!compact, empty, serVer = 2, preLongs = 1; always stored as compact
|
|
857
|
+
case 72 : break; //!compact, !empty, serVer = 2, preLongs = 2; always stored as compact
|
|
858
|
+
case 47 : break; // compact, empty, serVer = 3, preLongs = 1;
|
|
859
|
+
case 46 : break; //!compact, empty, serVer = 3, preLongs = 1;
|
|
860
|
+
case 79 : break; // compact, empty, serVer = 3, preLongs = 2;
|
|
861
|
+
case 78 : break; //!compact, empty, serVer = 3, preLongs = 2;
|
|
862
|
+
case 77 : break; // compact, !empty, serVer = 3, preLongs = 2;
|
|
863
|
+
case 76 : break; //!compact, !empty, serVer = 3, preLongs = 2;
|
|
864
|
+
default : //all other case values are invalid
|
|
865
|
+
valid = false;
|
|
866
|
+
}
|
|
867
|
+
|
|
868
|
+
if (!valid) {
|
|
869
|
+
std::ostringstream os;
|
|
870
|
+
os << "Possible sketch corruption. Inconsistent state: "
|
|
871
|
+
<< "preamble_longs = " << preamble_longs
|
|
872
|
+
<< ", empty = " << (empty ? "true" : "false")
|
|
873
|
+
<< ", serialization_version = " << serial_version
|
|
874
|
+
<< ", compact = " << (compact ? "true" : "false");
|
|
875
|
+
throw std::invalid_argument(os.str());
|
|
876
|
+
}
|
|
877
|
+
}
|
|
878
|
+
|
|
879
|
+
template <typename T, typename C, typename A>
|
|
880
|
+
typename quantiles_sketch<T, C, A>::const_iterator quantiles_sketch<T, C, A>::begin() const {
|
|
881
|
+
return quantiles_sketch<T, C, A>::const_iterator(base_buffer_, levels_, k_, n_, false);
|
|
882
|
+
}
|
|
883
|
+
|
|
884
|
+
template <typename T, typename C, typename A>
|
|
885
|
+
typename quantiles_sketch<T, C, A>::const_iterator quantiles_sketch<T, C, A>::end() const {
|
|
886
|
+
return quantiles_sketch<T, C, A>::const_iterator(base_buffer_, levels_, k_, n_, true);
|
|
887
|
+
}
|
|
888
|
+
|
|
889
|
+
template<typename T, typename C, typename A>
|
|
890
|
+
void quantiles_sketch<T, C, A>::grow_base_buffer() {
|
|
891
|
+
size_t new_size = std::max(std::min(static_cast<size_t>(2 * k_), 2 * base_buffer_.size()), static_cast<size_t>(1));
|
|
892
|
+
base_buffer_.reserve(new_size);
|
|
893
|
+
}
|
|
894
|
+
|
|
895
|
+
template<typename T, typename C, typename A>
|
|
896
|
+
void quantiles_sketch<T, C, A>::process_full_base_buffer() {
|
|
897
|
+
// make sure there will be enough levels for the propagation
|
|
898
|
+
grow_levels_if_needed(); // note: n_ was already incremented by update() before this
|
|
899
|
+
|
|
900
|
+
std::sort(base_buffer_.begin(), base_buffer_.end(), C());
|
|
901
|
+
in_place_propagate_carry(0,
|
|
902
|
+
levels_[0], // unused here, but 0 is guaranteed to exist
|
|
903
|
+
base_buffer_,
|
|
904
|
+
true, *this);
|
|
905
|
+
base_buffer_.clear();
|
|
906
|
+
is_sorted_ = true;
|
|
907
|
+
if (n_ / (2 * k_) != bit_pattern_) {
|
|
908
|
+
throw std::logic_error("Internal error: n / 2k (" + std::to_string(n_ / 2 * k_)
|
|
909
|
+
+ " != bit_pattern " + std::to_string(bit_pattern_));
|
|
910
|
+
}
|
|
911
|
+
}
|
|
912
|
+
|
|
913
|
+
template<typename T, typename C, typename A>
|
|
914
|
+
bool quantiles_sketch<T, C, A>::grow_levels_if_needed() {
|
|
915
|
+
uint8_t levels_needed = compute_levels_needed(k_, n_);
|
|
916
|
+
if (levels_needed == 0)
|
|
917
|
+
return false; // don't need levels and might have small base buffer. Possible during merges.
|
|
918
|
+
|
|
919
|
+
// from here on, assume full size base buffer (2k) and at least one additional level
|
|
920
|
+
if (levels_needed <= levels_.size())
|
|
921
|
+
return false;
|
|
922
|
+
|
|
923
|
+
Level empty_level(allocator_);
|
|
924
|
+
empty_level.reserve(k_);
|
|
925
|
+
levels_.push_back(std::move(empty_level));
|
|
926
|
+
return true;
|
|
927
|
+
}
|
|
928
|
+
|
|
929
|
+
template<typename T, typename C, typename A>
template<typename FwdV>
void quantiles_sketch<T, C, A>::in_place_propagate_carry(uint8_t starting_level,
                                                         FwdV&& buf_size_k, Level& buf_size_2k,
                                                         bool apply_as_update,
                                                         quantiles_sketch& sketch) {
  // Ripple-carry propagation: inserting a batch at starting_level may collide
  // with already-occupied levels; colliding levels are pairwise merged and
  // halved upward until the first unoccupied level absorbs the result.
  const uint64_t bit_pattern = sketch.bit_pattern_;
  const int k = sketch.k_;

  // First unoccupied level at or above starting_level; this is where the
  // carried batch finally lands.
  uint8_t ending_level = lowest_zero_bit_starting_at(bit_pattern, starting_level);

  if (apply_as_update) {
    // update version of computation:
    // halve the 2k-item buffer directly into the target level
    // (buf_size_k is unused and may be null in this case)
    zip_buffer(buf_size_2k, sketch.levels_[ending_level]);
  } else {
    // merge_into version of computation: copy/move k items into the target level
    for (uint16_t i = 0; i < k; ++i) {
      sketch.levels_[ending_level].push_back(conditional_forward<FwdV>(buf_size_k[i]));
    }
  }

  // Merge each occupied level below ending_level with the accumulating batch,
  // halving after every merge so the batch stays at k items.
  for (uint64_t lvl = starting_level; lvl < ending_level; lvl++) {
    if ((bit_pattern & (static_cast<uint64_t>(1) << lvl)) == 0) {
      throw std::logic_error("unexpected empty level in bit_pattern");
    }
    merge_two_size_k_buffers(
        sketch.levels_[lvl],
        sketch.levels_[ending_level],
        buf_size_2k);
    sketch.levels_[lvl].clear();
    sketch.levels_[ending_level].clear();
    zip_buffer(buf_size_2k, sketch.levels_[ending_level]);
  } // end of loop over lower levels

  // update bit pattern with binary-arithmetic ripple carry: adding 1 at
  // starting_level clears all carried levels and sets ending_level.
  sketch.bit_pattern_ = bit_pattern + (static_cast<uint64_t>(1) << starting_level);
}
|
|
967
|
+
|
|
968
|
+
template<typename T, typename C, typename A>
void quantiles_sketch<T, C, A>::zip_buffer(Level& buf_in, Level& buf_out) {
  // Downsamples a sorted 2k-item buffer into an empty k-capacity buffer by
  // moving every other item, starting at a random offset (0 or 1) to keep the
  // sampling unbiased. buf_in is cleared afterwards.
#ifdef QUANTILES_VALIDATION
  // Validation builds alternate the offset deterministically so results are
  // reproducible across runs.
  static uint32_t next_offset = 0;
  uint32_t rand_offset = next_offset;
  next_offset = 1 - next_offset;
#else
  uint32_t rand_offset = random_bit();
#endif
  if ((buf_in.size() != 2 * buf_out.capacity())
      || (buf_out.size() > 0)) {
    throw std::logic_error("zip_buffer requires buf_in.size() == "
        "2*buf_out.capacity() and empty buf_out");
  }

  size_t k = buf_out.capacity();
  for (uint32_t i = rand_offset, o = 0; o < k; i += 2, ++o) {
    buf_out.push_back(std::move(buf_in[i]));
  }
  buf_in.clear();
}
|
|
989
|
+
|
|
990
|
+
template<typename T, typename C, typename A>
template<typename FwdV>
void quantiles_sketch<T, C, A>::zip_buffer_with_stride(FwdV&& buf_in, Level& buf_out, uint16_t stride) {
  // Downsamples buf_in into an empty k-capacity buffer by taking every
  // stride-th item from a random starting offset. Unlike zip_buffer, the input
  // is left intact (items are copied or moved per FwdV's value category).
  // Random offset in range [0, stride)
  std::uniform_int_distribution<uint16_t> dist(0, stride - 1);
  uint16_t rand_offset = dist(random_utils::rand);

  if ((buf_in.size() != stride * buf_out.capacity())
      || (buf_out.size() > 0)) {
    throw std::logic_error("zip_buffer_with_stride requires buf_in.size() == "
        "stride*buf_out.capacity() and empty buf_out");
  }

  size_t k = buf_out.capacity();
  for (uint16_t i = rand_offset, o = 0; o < k; i += stride, ++o) {
    buf_out.push_back(conditional_forward<FwdV>(buf_in[i]));
  }
  // do not clear input buffer
}
|
|
1009
|
+
|
|
1010
|
+
|
|
1011
|
+
template<typename T, typename C, typename A>
void quantiles_sketch<T, C, A>::merge_two_size_k_buffers(Level& src_1, Level& src_2, Level& dst) {
  // Standard two-way merge of two equal-size sorted buffers into an empty
  // destination with capacity for both.
  if (src_1.size() != src_2.size()
      || src_1.size() * 2 != dst.capacity()
      || dst.size() != 0) {
    throw std::logic_error("Input invariants violated in merge_two_size_k_buffers()");
  }

  auto end1 = src_1.end(), end2 = src_2.end();
  auto it1 = src_1.begin(), it2 = src_2.begin();

  // TODO: probably actually doing copies given Level&?
  while (it1 != end1 && it2 != end2) {
    if (C()(*it1, *it2)) {
      dst.push_back(std::move(*it1++));
    } else {
      dst.push_back(std::move(*it2++));
    }
  }

  // Append whichever source still has items; exactly one of them must.
  if (it1 != end1) {
    dst.insert(dst.end(), it1, end1);
  } else {
    if (it2 == end2) { throw std::logic_error("it2 unexpectedly already at end of range"); }
    dst.insert(dst.end(), it2, end2);
  }
}
|
|
1038
|
+
|
|
1039
|
+
|
|
1040
|
+
template<typename T, typename C, typename A>
template<typename FwdSk>
void quantiles_sketch<T, C, A>::standard_merge(quantiles_sketch& tgt, FwdSk&& src) {
  // Merges src into tgt when both sketches share the same K: src's base-buffer
  // items are replayed through update(), then each occupied src level is
  // carry-propagated into tgt at the same level.
  if (src.get_k() != tgt.get_k()) {
    throw std::invalid_argument("src.get_k() != tgt.get_k()");
  }
  if (src.is_empty()) {
    return;
  }

  uint64_t new_n = src.get_n() + tgt.get_n();

  // move items from src's base buffer
  for (uint16_t i = 0; i < src.base_buffer_.size(); ++i) {
    tgt.update(conditional_forward<FwdSk>(src.base_buffer_[i]));
  }

  // check (after moving raw items) if we need to extend levels array
  uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
  if (levels_needed > tgt.levels_.size()) {
    tgt.levels_.reserve(levels_needed);
    while (tgt.levels_.size() < levels_needed) {
      Level empty_level(tgt.allocator_);
      empty_level.reserve(tgt.get_k());
      tgt.levels_.push_back(std::move(empty_level));
    }
  }

  // Scratch space reused by every carry propagation (2k items).
  Level scratch_buf(tgt.allocator_);
  scratch_buf.reserve(2 * tgt.get_k());

  uint64_t src_pattern = src.bit_pattern_;
  for (uint8_t src_lvl = 0; src_pattern != 0; ++src_lvl, src_pattern >>= 1) {
    if ((src_pattern & 1) > 0) {
      scratch_buf.clear();

      // propagate-carry
      in_place_propagate_carry(src_lvl,
                               src.levels_[src_lvl], scratch_buf,
                               false, tgt);
      // update n_ at the end
    }
  }
  tgt.n_ = new_n;
  if ((tgt.get_n() / (2 * tgt.get_k())) != tgt.bit_pattern_) {
    throw std::logic_error("Failed internal consistency check after standard_merge()");
  }

  // update min and max values
  // can't just check is_empty() since min and max might not have been set if
  // there were no base buffer items added via update()
  if (tgt.min_value_ == nullptr) {
    tgt.min_value_ = new (tgt.allocator_.allocate(1)) T(*src.min_value_);
  } else {
    if (C()(*src.min_value_, *tgt.min_value_))
      *tgt.min_value_ = conditional_forward<FwdSk>(*src.min_value_);
  }

  if (tgt.max_value_ == nullptr) {
    tgt.max_value_ = new (tgt.allocator_.allocate(1)) T(*src.max_value_);
  } else {
    if (C()(*tgt.max_value_, *src.max_value_))
      *tgt.max_value_ = conditional_forward<FwdSk>(*src.max_value_);
  }
}
|
|
1105
|
+
|
|
1106
|
+
|
|
1107
|
+
template<typename T, typename C, typename A>
template<typename FwdSk>
void quantiles_sketch<T, C, A>::downsampling_merge(quantiles_sketch& tgt, FwdSk&& src) {
  // Merges src into tgt when src's K is a larger multiple of tgt's K: each
  // occupied src level is first downsampled by the K ratio, then inserted at a
  // correspondingly higher tgt level (shifted by log2 of the ratio) so total
  // item weight is preserved.
  if (src.get_k() % tgt.get_k() != 0) {
    throw std::invalid_argument("src.get_k() is not a multiple of tgt.get_k()");
  }
  if (src.is_empty()) {
    return;
  }

  const uint16_t downsample_factor = src.get_k() / tgt.get_k();
  const uint8_t lg_sample_factor = count_trailing_zeros_in_u32(downsample_factor);

  uint64_t new_n = src.get_n() + tgt.get_n();

  // move items from src's base buffer
  for (uint16_t i = 0; i < src.base_buffer_.size(); ++i) {
    tgt.update(conditional_forward<FwdSk>(src.base_buffer_[i]));
  }

  // check (after moving raw items) if we need to extend levels array
  uint8_t levels_needed = compute_levels_needed(tgt.get_k(), new_n);
  if (levels_needed > tgt.levels_.size()) {
    tgt.levels_.reserve(levels_needed);
    while (tgt.levels_.size() < levels_needed) {
      Level empty_level(tgt.allocator_);
      empty_level.reserve(tgt.get_k());
      tgt.levels_.push_back(std::move(empty_level));
    }
  }

  // Holds each src level downsampled to tgt's k items.
  Level down_buf(tgt.allocator_);
  down_buf.reserve(tgt.get_k());

  // Scratch space reused by every carry propagation (2k items).
  Level scratch_buf(tgt.allocator_);
  scratch_buf.reserve(2 * tgt.get_k());

  uint64_t src_pattern = src.bit_pattern_;
  for (uint8_t src_lvl = 0; src_pattern != 0; ++src_lvl, src_pattern >>= 1) {
    if ((src_pattern & 1) > 0) {
      down_buf.clear();
      scratch_buf.clear();

      // zip with stride, leaving input buffer intact
      zip_buffer_with_stride(src.levels_[src_lvl], down_buf, downsample_factor);

      // propagate-carry
      in_place_propagate_carry(src_lvl + lg_sample_factor,
                               down_buf, scratch_buf,
                               false, tgt);
      // update n_ at the end
    }
  }
  tgt.n_ = new_n;
  if ((tgt.get_n() / (2 * tgt.get_k())) != tgt.bit_pattern_) {
    throw std::logic_error("Failed internal consistency check after downsampling_merge()");
  }

  // update min and max values
  // can't just check is_empty() since min and max might not have been set if
  // there were no base buffer items added via update()
  if (tgt.min_value_ == nullptr) {
    tgt.min_value_ = new (tgt.allocator_.allocate(1)) T(*src.min_value_);
  } else {
    if (C()(*src.min_value_, *tgt.min_value_))
      *tgt.min_value_ = conditional_forward<FwdSk>(*src.min_value_);
  }

  if (tgt.max_value_ == nullptr) {
    tgt.max_value_ = new (tgt.allocator_.allocate(1)) T(*src.max_value_);
  } else {
    if (C()(*tgt.max_value_, *src.max_value_))
      *tgt.max_value_ = conditional_forward<FwdSk>(*src.max_value_);
  }
}
|
|
1182
|
+
|
|
1183
|
+
|
|
1184
|
+
template<typename T, typename C, typename A>
|
|
1185
|
+
uint8_t quantiles_sketch<T, C, A>::lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit) {
|
|
1186
|
+
uint8_t pos = starting_bit & 0X3F;
|
|
1187
|
+
uint64_t my_bits = bits >> pos;
|
|
1188
|
+
|
|
1189
|
+
while ((my_bits & static_cast<uint64_t>(1)) != 0) {
|
|
1190
|
+
my_bits >>= 1;
|
|
1191
|
+
pos++;
|
|
1192
|
+
}
|
|
1193
|
+
return pos;
|
|
1194
|
+
}
|
|
1195
|
+
|
|
1196
|
+
template<typename T, typename C, typename A>
|
|
1197
|
+
class quantiles_sketch<T, C, A>::item_deleter {
|
|
1198
|
+
public:
|
|
1199
|
+
item_deleter(const A& allocator): allocator_(allocator) {}
|
|
1200
|
+
void operator() (T* ptr) {
|
|
1201
|
+
if (ptr != nullptr) {
|
|
1202
|
+
ptr->~T();
|
|
1203
|
+
allocator_.deallocate(ptr, 1);
|
|
1204
|
+
}
|
|
1205
|
+
}
|
|
1206
|
+
private:
|
|
1207
|
+
A allocator_;
|
|
1208
|
+
};
|
|
1209
|
+
|
|
1210
|
+
template<typename T, typename C, typename A>
|
|
1211
|
+
class quantiles_sketch<T, C, A>::items_deleter {
|
|
1212
|
+
public:
|
|
1213
|
+
items_deleter(const A& allocator, bool destroy, size_t num): allocator_(allocator), destroy_(destroy), num_(num) {}
|
|
1214
|
+
void operator() (T* ptr) {
|
|
1215
|
+
if (ptr != nullptr) {
|
|
1216
|
+
if (destroy_) {
|
|
1217
|
+
for (size_t i = 0; i < num_; ++i) {
|
|
1218
|
+
ptr[i].~T();
|
|
1219
|
+
}
|
|
1220
|
+
}
|
|
1221
|
+
allocator_.deallocate(ptr, num_);
|
|
1222
|
+
}
|
|
1223
|
+
}
|
|
1224
|
+
void set_destroy(bool destroy) { destroy_ = destroy; }
|
|
1225
|
+
private:
|
|
1226
|
+
A allocator_;
|
|
1227
|
+
bool destroy_;
|
|
1228
|
+
size_t num_;
|
|
1229
|
+
};
|
|
1230
|
+
|
|
1231
|
+
|
|
1232
|
+
// quantiles_sketch::const_iterator implementation
|
|
1233
|
+
|
|
1234
|
+
template<typename T, typename C, typename A>
|
|
1235
|
+
quantiles_sketch<T, C, A>::const_iterator::const_iterator(const Level& base_buffer,
|
|
1236
|
+
const std::vector<Level, AllocLevel>& levels,
|
|
1237
|
+
uint16_t k,
|
|
1238
|
+
uint64_t n,
|
|
1239
|
+
bool is_end):
|
|
1240
|
+
base_buffer_(base_buffer),
|
|
1241
|
+
levels_(levels),
|
|
1242
|
+
level_(-1),
|
|
1243
|
+
index_(0),
|
|
1244
|
+
bb_count_(compute_base_buffer_items(k, n)),
|
|
1245
|
+
bit_pattern_(compute_bit_pattern(k, n)),
|
|
1246
|
+
weight_(1),
|
|
1247
|
+
k_(k)
|
|
1248
|
+
{
|
|
1249
|
+
if (is_end) {
|
|
1250
|
+
// if exact mode: index_ = n is end
|
|
1251
|
+
// if sampling, level_ = max_level + 1 and index_ = 0 is end
|
|
1252
|
+
if (bit_pattern_ == 0) // only a valid check for exact mode in constructor
|
|
1253
|
+
index_ = static_cast<uint32_t>(n);
|
|
1254
|
+
else
|
|
1255
|
+
level_ = static_cast<int>(levels_.size());
|
|
1256
|
+
} else { // find first non-empty item
|
|
1257
|
+
if (bb_count_ == 0 && bit_pattern_ > 0) {
|
|
1258
|
+
level_ = 0;
|
|
1259
|
+
weight_ = 2;
|
|
1260
|
+
while ((bit_pattern_ & 0x01) == 0) {
|
|
1261
|
+
weight_ *= 2;
|
|
1262
|
+
++level_;
|
|
1263
|
+
bit_pattern_ >>= 1;
|
|
1264
|
+
}
|
|
1265
|
+
}
|
|
1266
|
+
}
|
|
1267
|
+
}
|
|
1268
|
+
|
|
1269
|
+
template<typename T, typename C, typename A>
typename quantiles_sketch<T, C, A>::const_iterator& quantiles_sketch<T, C, A>::const_iterator::operator++() {
  // Advance to the next stored item. level_ == -1 means the base buffer;
  // level_ >= 0 means a compacted level of capacity k_.
  ++index_;

  // If we just walked off the end of the current buffer/level, reset the
  // offset and scan bit_pattern_ for the next occupied level, doubling the
  // per-item weight for each level climbed. If bit_pattern_ reaches zero
  // there are no more occupied levels and the iterator is left at its
  // end position.
  if ((level_ == -1 && index_ == base_buffer_.size() && levels_.size() > 0) || (level_ >= 0 && index_ == k_)) { // go to the next non-empty level
    index_ = 0;
    do {
      ++level_;
      if (level_ > 0) bit_pattern_ = bit_pattern_ >> 1; // keep bit 0 aligned with the current level
      if (bit_pattern_ == 0) return *this; // no more occupied levels
      weight_ *= 2;
    } while ((bit_pattern_ & static_cast<uint64_t>(1)) == 0);
  }
  return *this;
}
|
|
1284
|
+
|
|
1285
|
+
template<typename T, typename C, typename A>
typename quantiles_sketch<T, C, A>::const_iterator& quantiles_sketch<T, C, A>::const_iterator::operator++(int) {
  // FIXME(review): this returns a reference to the local copy 'tmp', which
  // dangles as soon as the function returns — undefined behavior for any
  // caller that uses the result. Canonical post-increment returns by value;
  // fixing this requires changing the declaration in the class definition
  // (outside this file's view) as well.
  const_iterator tmp(*this);
  operator++();
  return tmp;
}
|
|
1291
|
+
|
|
1292
|
+
template<typename T, typename C, typename A>
|
|
1293
|
+
bool quantiles_sketch<T, C, A>::const_iterator::operator==(const const_iterator& other) const {
|
|
1294
|
+
return level_ == other.level_ && index_ == other.index_;
|
|
1295
|
+
}
|
|
1296
|
+
|
|
1297
|
+
template<typename T, typename C, typename A>
|
|
1298
|
+
bool quantiles_sketch<T, C, A>::const_iterator::operator!=(const const_iterator& other) const {
|
|
1299
|
+
return !operator==(other);
|
|
1300
|
+
}
|
|
1301
|
+
|
|
1302
|
+
template<typename T, typename C, typename A>
|
|
1303
|
+
std::pair<const T&, const uint64_t> quantiles_sketch<T, C, A>::const_iterator::operator*() const {
|
|
1304
|
+
return std::pair<const T&, const uint64_t>(level_ == -1 ? base_buffer_[index_] : levels_[level_][index_], weight_);
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
} /* namespace datasketches */
|
|
1308
|
+
|
|
1309
|
+
#endif // _QUANTILES_SKETCH_IMPL_HPP_
|