datasketches 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +1 -1
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +7 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
- metadata +25 -9
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
|
@@ -23,7 +23,9 @@
|
|
|
23
23
|
#include <iostream>
|
|
24
24
|
#include <iomanip>
|
|
25
25
|
#include <sstream>
|
|
26
|
+
#include <stdexcept>
|
|
26
27
|
|
|
28
|
+
#include "conditional_forward.hpp"
|
|
27
29
|
#include "memory_operations.hpp"
|
|
28
30
|
#include "kll_helper.hpp"
|
|
29
31
|
|
|
@@ -146,19 +148,12 @@ kll_sketch<T, C, S, A>::~kll_sketch() {
|
|
|
146
148
|
}
|
|
147
149
|
|
|
148
150
|
template<typename T, typename C, typename S, typename A>
|
|
149
|
-
|
|
151
|
+
template<typename FwdT>
|
|
152
|
+
void kll_sketch<T, C, S, A>::update(FwdT&& value) {
|
|
150
153
|
if (!check_update_value(value)) { return; }
|
|
151
154
|
update_min_max(value);
|
|
152
155
|
const uint32_t index = internal_update();
|
|
153
|
-
new (&items_[index]) T(value);
|
|
154
|
-
}
|
|
155
|
-
|
|
156
|
-
template<typename T, typename C, typename S, typename A>
|
|
157
|
-
void kll_sketch<T, C, S, A>::update(T&& value) {
|
|
158
|
-
if (!check_update_value(value)) { return; }
|
|
159
|
-
update_min_max(value);
|
|
160
|
-
const uint32_t index = internal_update();
|
|
161
|
-
new (&items_[index]) T(std::move(value));
|
|
156
|
+
new (&items_[index]) T(std::forward<FwdT>(value));
|
|
162
157
|
}
|
|
163
158
|
|
|
164
159
|
template<typename T, typename C, typename S, typename A>
|
|
@@ -181,22 +176,23 @@ uint32_t kll_sketch<T, C, S, A>::internal_update() {
|
|
|
181
176
|
}
|
|
182
177
|
|
|
183
178
|
template<typename T, typename C, typename S, typename A>
|
|
184
|
-
|
|
179
|
+
template<typename FwdSk>
|
|
180
|
+
void kll_sketch<T, C, S, A>::merge(FwdSk&& other) {
|
|
185
181
|
if (other.is_empty()) return;
|
|
186
182
|
if (m_ != other.m_) {
|
|
187
183
|
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
|
|
188
184
|
}
|
|
189
185
|
if (is_empty()) {
|
|
190
|
-
min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
|
|
191
|
-
max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
|
|
186
|
+
min_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.min_value_));
|
|
187
|
+
max_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.max_value_));
|
|
192
188
|
} else {
|
|
193
|
-
if (C()(*other.min_value_, *min_value_)) *min_value_ = *other.min_value_;
|
|
194
|
-
if (C()(*max_value_, *other.max_value_)) *max_value_ = *other.max_value_;
|
|
189
|
+
if (C()(*other.min_value_, *min_value_)) *min_value_ = conditional_forward<FwdSk>(*other.min_value_);
|
|
190
|
+
if (C()(*max_value_, *other.max_value_)) *max_value_ = conditional_forward<FwdSk>(*other.max_value_);
|
|
195
191
|
}
|
|
196
192
|
const uint64_t final_n = n_ + other.n_;
|
|
197
193
|
for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
|
|
198
194
|
const uint32_t index = internal_update();
|
|
199
|
-
new (&items_[index]) T(other.items_[i]);
|
|
195
|
+
new (&items_[index]) T(conditional_forward<FwdSk>(other.items_[i]));
|
|
200
196
|
}
|
|
201
197
|
if (other.num_levels_ >= 2) merge_higher_levels(other, final_n);
|
|
202
198
|
n_ = final_n;
|
|
@@ -204,30 +200,6 @@ void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
|
|
|
204
200
|
assert_correct_total_weight();
|
|
205
201
|
}
|
|
206
202
|
|
|
207
|
-
template<typename T, typename C, typename S, typename A>
|
|
208
|
-
void kll_sketch<T, C, S, A>::merge(kll_sketch&& other) {
|
|
209
|
-
if (other.is_empty()) return;
|
|
210
|
-
if (m_ != other.m_) {
|
|
211
|
-
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
|
|
212
|
-
}
|
|
213
|
-
if (is_empty()) {
|
|
214
|
-
min_value_ = new (allocator_.allocate(1)) T(std::move(*other.min_value_));
|
|
215
|
-
max_value_ = new (allocator_.allocate(1)) T(std::move(*other.max_value_));
|
|
216
|
-
} else {
|
|
217
|
-
if (C()(*other.min_value_, *min_value_)) *min_value_ = std::move(*other.min_value_);
|
|
218
|
-
if (C()(*max_value_, *other.max_value_)) *max_value_ = std::move(*other.max_value_);
|
|
219
|
-
}
|
|
220
|
-
const uint64_t final_n = n_ + other.n_;
|
|
221
|
-
for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
|
|
222
|
-
const uint32_t index = internal_update();
|
|
223
|
-
new (&items_[index]) T(std::move(other.items_[i]));
|
|
224
|
-
}
|
|
225
|
-
if (other.num_levels_ >= 2) merge_higher_levels(std::forward<kll_sketch>(other), final_n);
|
|
226
|
-
n_ = final_n;
|
|
227
|
-
if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
|
|
228
|
-
assert_correct_total_weight();
|
|
229
|
-
}
|
|
230
|
-
|
|
231
203
|
template<typename T, typename C, typename S, typename A>
|
|
232
204
|
bool kll_sketch<T, C, S, A>::is_empty() const {
|
|
233
205
|
return n_ == 0;
|
|
@@ -266,43 +238,49 @@ T kll_sketch<T, C, S, A>::get_max_value() const {
|
|
|
266
238
|
}
|
|
267
239
|
|
|
268
240
|
template<typename T, typename C, typename S, typename A>
|
|
269
|
-
|
|
241
|
+
C kll_sketch<T, C, S, A>::get_comparator() const {
|
|
242
|
+
return C();
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
template<typename T, typename C, typename S, typename A>
|
|
246
|
+
template<bool inclusive>
|
|
247
|
+
auto kll_sketch<T, C, S, A>::get_quantile(double rank) const -> quantile_return_type {
|
|
270
248
|
if (is_empty()) return get_invalid_value();
|
|
271
|
-
if (
|
|
272
|
-
if (
|
|
273
|
-
if ((
|
|
249
|
+
if (rank == 0.0) return *min_value_;
|
|
250
|
+
if (rank == 1.0) return *max_value_;
|
|
251
|
+
if ((rank < 0.0) || (rank > 1.0)) {
|
|
274
252
|
throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
|
|
275
253
|
}
|
|
276
|
-
//
|
|
277
|
-
|
|
278
|
-
return quantile_calculator->get_quantile(fraction);
|
|
254
|
+
// may have a side effect of sorting level zero if needed
|
|
255
|
+
return get_sorted_view<inclusive>(true).get_quantile(rank);
|
|
279
256
|
}
|
|
280
257
|
|
|
281
258
|
template<typename T, typename C, typename S, typename A>
|
|
282
|
-
|
|
259
|
+
template<bool inclusive>
|
|
260
|
+
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* ranks, uint32_t size) const {
|
|
283
261
|
std::vector<T, A> quantiles(allocator_);
|
|
284
262
|
if (is_empty()) return quantiles;
|
|
285
|
-
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
|
|
286
263
|
quantiles.reserve(size);
|
|
264
|
+
|
|
265
|
+
// may have a side effect of sorting level zero if needed
|
|
266
|
+
auto view = get_sorted_view<inclusive>(true);
|
|
267
|
+
|
|
287
268
|
for (uint32_t i = 0; i < size; i++) {
|
|
288
|
-
const double
|
|
289
|
-
if ((
|
|
269
|
+
const double rank = ranks[i];
|
|
270
|
+
if ((rank < 0.0) || (rank > 1.0)) {
|
|
290
271
|
throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
|
|
291
272
|
}
|
|
292
|
-
if
|
|
293
|
-
else if (
|
|
273
|
+
else if (rank == 0.0) quantiles.push_back(*min_value_);
|
|
274
|
+
else if (rank == 1.0) quantiles.push_back(*max_value_);
|
|
294
275
|
else {
|
|
295
|
-
|
|
296
|
-
// has side effect of sorting level zero if needed
|
|
297
|
-
quantile_calculator = const_cast<kll_sketch*>(this)->get_quantile_calculator();
|
|
298
|
-
}
|
|
299
|
-
quantiles.push_back(quantile_calculator->get_quantile(fraction));
|
|
276
|
+
quantiles.push_back(view.get_quantile(rank));
|
|
300
277
|
}
|
|
301
278
|
}
|
|
302
279
|
return quantiles;
|
|
303
280
|
}
|
|
304
281
|
|
|
305
282
|
template<typename T, typename C, typename S, typename A>
|
|
283
|
+
template<bool inclusive>
|
|
306
284
|
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
|
|
307
285
|
if (is_empty()) return std::vector<T, A>(allocator_);
|
|
308
286
|
if (num == 0) {
|
|
@@ -316,10 +294,11 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
|
|
|
316
294
|
if (num > 1) {
|
|
317
295
|
fractions[num - 1] = 1.0;
|
|
318
296
|
}
|
|
319
|
-
return get_quantiles(fractions.data(), num);
|
|
297
|
+
return get_quantiles<inclusive>(fractions.data(), num);
|
|
320
298
|
}
|
|
321
299
|
|
|
322
300
|
template<typename T, typename C, typename S, typename A>
|
|
301
|
+
template<bool inclusive>
|
|
323
302
|
double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
|
|
324
303
|
if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
|
|
325
304
|
uint8_t level = 0;
|
|
@@ -329,7 +308,7 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
|
|
|
329
308
|
const auto from_index(levels_[level]);
|
|
330
309
|
const auto to_index(levels_[level + 1]); // exclusive
|
|
331
310
|
for (uint32_t i = from_index; i < to_index; i++) {
|
|
332
|
-
if (C()(items_[i], value)) {
|
|
311
|
+
if (inclusive ? !C()(value, items_[i]) : C()(items_[i], value)) {
|
|
333
312
|
total += weight;
|
|
334
313
|
} else if ((level > 0) || is_level_zero_sorted_) {
|
|
335
314
|
break; // levels above 0 are sorted, no point comparing further
|
|
@@ -342,13 +321,15 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
|
|
|
342
321
|
}
|
|
343
322
|
|
|
344
323
|
template<typename T, typename C, typename S, typename A>
|
|
324
|
+
template<bool inclusive>
|
|
345
325
|
vector_d<A> kll_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const {
|
|
346
|
-
return get_PMF_or_CDF(split_points, size, false);
|
|
326
|
+
return get_PMF_or_CDF<inclusive>(split_points, size, false);
|
|
347
327
|
}
|
|
348
328
|
|
|
349
329
|
template<typename T, typename C, typename S, typename A>
|
|
330
|
+
template<bool inclusive>
|
|
350
331
|
vector_d<A> kll_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const {
|
|
351
|
-
return get_PMF_or_CDF(split_points, size, true);
|
|
332
|
+
return get_PMF_or_CDF<inclusive>(split_points, size, true);
|
|
352
333
|
}
|
|
353
334
|
|
|
354
335
|
template<typename T, typename C, typename S, typename A>
|
|
@@ -358,8 +339,8 @@ double kll_sketch<T, C, S, A>::get_normalized_rank_error(bool pmf) const {
|
|
|
358
339
|
|
|
359
340
|
// implementation for fixed-size arithmetic types (integral and floating point)
|
|
360
341
|
template<typename T, typename C, typename S, typename A>
|
|
361
|
-
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
|
362
|
-
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
|
342
|
+
template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
|
343
|
+
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe&) const {
|
|
363
344
|
if (is_empty()) { return EMPTY_SIZE_BYTES; }
|
|
364
345
|
if (num_levels_ == 1 && get_num_retained() == 1) {
|
|
365
346
|
return DATA_START_SINGLE_ITEM + sizeof(TT);
|
|
@@ -370,17 +351,17 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
|
|
370
351
|
|
|
371
352
|
// implementation for all other types
|
|
372
353
|
template<typename T, typename C, typename S, typename A>
|
|
373
|
-
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
|
374
|
-
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
|
354
|
+
template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
|
355
|
+
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
|
|
375
356
|
if (is_empty()) { return EMPTY_SIZE_BYTES; }
|
|
376
357
|
if (num_levels_ == 1 && get_num_retained() == 1) {
|
|
377
|
-
return DATA_START_SINGLE_ITEM +
|
|
358
|
+
return DATA_START_SINGLE_ITEM + sd.size_of_item(items_[levels_[0]]);
|
|
378
359
|
}
|
|
379
360
|
// the last integer in the levels_ array is not serialized because it can be derived
|
|
380
361
|
size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
|
|
381
|
-
size +=
|
|
382
|
-
size +=
|
|
383
|
-
for (auto it: *this) size +=
|
|
362
|
+
size += sd.size_of_item(*min_value_);
|
|
363
|
+
size += sd.size_of_item(*max_value_);
|
|
364
|
+
for (auto it: *this) size += sd.size_of_item(it.first);
|
|
384
365
|
return size;
|
|
385
366
|
}
|
|
386
367
|
|
|
@@ -405,7 +386,8 @@ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_
|
|
|
405
386
|
}
|
|
406
387
|
|
|
407
388
|
template<typename T, typename C, typename S, typename A>
|
|
408
|
-
|
|
389
|
+
template<typename SerDe>
|
|
390
|
+
void kll_sketch<T, C, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
|
|
409
391
|
const bool is_single_item = n_ == 1;
|
|
410
392
|
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
|
|
411
393
|
write(os, preamble_ints);
|
|
@@ -430,16 +412,17 @@ void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
|
|
|
430
412
|
write(os, num_levels_);
|
|
431
413
|
write(os, unused);
|
|
432
414
|
write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
|
|
433
|
-
|
|
434
|
-
|
|
415
|
+
sd.serialize(os, min_value_, 1);
|
|
416
|
+
sd.serialize(os, max_value_, 1);
|
|
435
417
|
}
|
|
436
|
-
|
|
418
|
+
sd.serialize(os, &items_[levels_[0]], get_num_retained());
|
|
437
419
|
}
|
|
438
420
|
|
|
439
421
|
template<typename T, typename C, typename S, typename A>
|
|
440
|
-
|
|
422
|
+
template<typename SerDe>
|
|
423
|
+
vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
|
|
441
424
|
const bool is_single_item = n_ == 1;
|
|
442
|
-
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
|
425
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
|
|
443
426
|
vector_u8<A> bytes(size, 0, allocator_);
|
|
444
427
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
445
428
|
const uint8_t* end_ptr = ptr + size;
|
|
@@ -465,11 +448,11 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
|
|
|
465
448
|
ptr += copy_to_mem(num_levels_, ptr);
|
|
466
449
|
ptr += sizeof(uint8_t); // unused
|
|
467
450
|
ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
|
|
468
|
-
ptr +=
|
|
469
|
-
ptr +=
|
|
451
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, min_value_, 1);
|
|
452
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, max_value_, 1);
|
|
470
453
|
}
|
|
471
454
|
const size_t bytes_remaining = end_ptr - ptr;
|
|
472
|
-
ptr +=
|
|
455
|
+
ptr += sd.serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
|
|
473
456
|
}
|
|
474
457
|
const size_t delta = ptr - bytes.data();
|
|
475
458
|
if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
|
|
@@ -478,6 +461,12 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
|
|
|
478
461
|
|
|
479
462
|
template<typename T, typename C, typename S, typename A>
|
|
480
463
|
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
|
|
464
|
+
return deserialize(is, S(), allocator);
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
template<typename T, typename C, typename S, typename A>
|
|
468
|
+
template<typename SerDe>
|
|
469
|
+
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
|
|
481
470
|
const auto preamble_ints = read<uint8_t>(is);
|
|
482
471
|
const auto serial_version = read<uint8_t>(is);
|
|
483
472
|
const auto family_id = read<uint8_t>(is);
|
|
@@ -525,17 +514,17 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
|
|
|
525
514
|
std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
|
|
526
515
|
std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
|
|
527
516
|
if (!is_single_item) {
|
|
528
|
-
|
|
517
|
+
sd.deserialize(is, min_value_buffer.get(), 1);
|
|
529
518
|
// serde call did not throw, repackage with destrtuctor
|
|
530
519
|
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
|
|
531
|
-
|
|
520
|
+
sd.deserialize(is, max_value_buffer.get(), 1);
|
|
532
521
|
// serde call did not throw, repackage with destrtuctor
|
|
533
522
|
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
|
|
534
523
|
}
|
|
535
524
|
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
|
|
536
525
|
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
|
|
537
526
|
const auto num_items = levels[num_levels] - levels[0];
|
|
538
|
-
|
|
527
|
+
sd.deserialize(is, &items_buffer.get()[levels[0]], num_items);
|
|
539
528
|
// serde call did not throw, repackage with destrtuctors
|
|
540
529
|
std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
|
|
541
530
|
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
|
|
@@ -555,6 +544,12 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
|
|
|
555
544
|
|
|
556
545
|
template<typename T, typename C, typename S, typename A>
|
|
557
546
|
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
|
547
|
+
return deserialize(bytes, size, S(), allocator);
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
template<typename T, typename C, typename S, typename A>
|
|
551
|
+
template<typename SerDe>
|
|
552
|
+
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
|
|
558
553
|
ensure_minimum_memory(size, 8);
|
|
559
554
|
const char* ptr = static_cast<const char*>(bytes);
|
|
560
555
|
uint8_t preamble_ints;
|
|
@@ -611,17 +606,17 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
|
|
|
611
606
|
std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
|
|
612
607
|
std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
|
|
613
608
|
if (!is_single_item) {
|
|
614
|
-
ptr +=
|
|
609
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
|
|
615
610
|
// serde call did not throw, repackage with destrtuctor
|
|
616
611
|
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
|
|
617
|
-
ptr +=
|
|
612
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
|
|
618
613
|
// serde call did not throw, repackage with destrtuctor
|
|
619
614
|
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
|
|
620
615
|
}
|
|
621
616
|
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
|
|
622
617
|
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
|
|
623
618
|
const auto num_items = levels[num_levels] - levels[0];
|
|
624
|
-
ptr +=
|
|
619
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
|
|
625
620
|
// serde call did not throw, repackage with destrtuctors
|
|
626
621
|
std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
|
|
627
622
|
const size_t delta = ptr - static_cast<const char*>(bytes);
|
|
@@ -786,18 +781,23 @@ void kll_sketch<T, C, S, A>::sort_level_zero() {
|
|
|
786
781
|
}
|
|
787
782
|
|
|
788
783
|
template<typename T, typename C, typename S, typename A>
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
784
|
+
template<bool inclusive>
|
|
785
|
+
quantile_sketch_sorted_view<T, C, A> kll_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const {
|
|
786
|
+
const_cast<kll_sketch*>(this)->sort_level_zero(); // allow this side effect
|
|
787
|
+
quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);
|
|
788
|
+
uint8_t level = 0;
|
|
789
|
+
while (level < num_levels_) {
|
|
790
|
+
const auto from = items_ + levels_[level];
|
|
791
|
+
const auto to = items_ + levels_[level + 1]; // exclusive
|
|
792
|
+
view.add(from, to, 1 << level);
|
|
793
|
+
++level;
|
|
794
|
+
}
|
|
795
|
+
if (cumulative) view.template convert_to_cummulative<inclusive>();
|
|
796
|
+
return view;
|
|
798
797
|
}
|
|
799
798
|
|
|
800
799
|
template<typename T, typename C, typename S, typename A>
|
|
800
|
+
template<bool inclusive>
|
|
801
801
|
vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
|
|
802
802
|
if (is_empty()) return vector_d<A>(allocator_);
|
|
803
803
|
kll_helper::validate_values<T, C>(split_points, size);
|
|
@@ -808,9 +808,9 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
|
|
|
808
808
|
const auto from_index = levels_[level];
|
|
809
809
|
const auto to_index = levels_[level + 1]; // exclusive
|
|
810
810
|
if ((level == 0) && !is_level_zero_sorted_) {
|
|
811
|
-
increment_buckets_unsorted_level(from_index, to_index, weight, split_points, size, buckets.data());
|
|
811
|
+
increment_buckets_unsorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
|
|
812
812
|
} else {
|
|
813
|
-
increment_buckets_sorted_level(from_index, to_index, weight, split_points, size, buckets.data());
|
|
813
|
+
increment_buckets_sorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
|
|
814
814
|
}
|
|
815
815
|
level++;
|
|
816
816
|
weight *= 2;
|
|
@@ -831,13 +831,14 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
|
|
|
831
831
|
}
|
|
832
832
|
|
|
833
833
|
template<typename T, typename C, typename S, typename A>
|
|
834
|
+
template<bool inclusive>
|
|
834
835
|
void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
|
|
835
836
|
const T* split_points, uint32_t size, double* buckets) const
|
|
836
837
|
{
|
|
837
838
|
for (uint32_t i = from_index; i < to_index; i++) {
|
|
838
839
|
uint32_t j;
|
|
839
840
|
for (j = 0; j < size; j++) {
|
|
840
|
-
if (C()(items_[i], split_points[j])) {
|
|
841
|
+
if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
|
|
841
842
|
break;
|
|
842
843
|
}
|
|
843
844
|
}
|
|
@@ -846,13 +847,14 @@ void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_inde
|
|
|
846
847
|
}
|
|
847
848
|
|
|
848
849
|
template<typename T, typename C, typename S, typename A>
|
|
850
|
+
template<bool inclusive>
|
|
849
851
|
void kll_sketch<T, C, S, A>::increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
|
|
850
852
|
const T* split_points, uint32_t size, double* buckets) const
|
|
851
853
|
{
|
|
852
854
|
uint32_t i = from_index;
|
|
853
855
|
uint32_t j = 0;
|
|
854
856
|
while ((i < to_index) && (j < size)) {
|
|
855
|
-
if (C()(items_[i], split_points[j])) {
|
|
857
|
+
if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
|
|
856
858
|
buckets[j] += weight; // this sample goes into this bucket
|
|
857
859
|
i++; // move on to next sample and see whether it also goes into this bucket
|
|
858
860
|
} else {
|
|
@@ -910,34 +912,9 @@ void kll_sketch<T, C, S, A>::merge_higher_levels(O&& other, uint64_t final_n) {
|
|
|
910
912
|
}
|
|
911
913
|
|
|
912
914
|
// this leaves items_ uninitialized (all objects moved out and destroyed)
|
|
913
|
-
// this version copies objects from the incoming sketch
|
|
914
|
-
template<typename T, typename C, typename S, typename A>
|
|
915
|
-
void kll_sketch<T, C, S, A>::populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
|
|
916
|
-
worklevels[0] = 0;
|
|
917
|
-
|
|
918
|
-
// the level zero data from "other" was already inserted into "this"
|
|
919
|
-
kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
|
|
920
|
-
worklevels[1] = safe_level_size(0);
|
|
921
|
-
|
|
922
|
-
for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
|
|
923
|
-
const uint32_t self_pop = safe_level_size(lvl);
|
|
924
|
-
const uint32_t other_pop = other.safe_level_size(lvl);
|
|
925
|
-
worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
|
|
926
|
-
|
|
927
|
-
if ((self_pop > 0) && (other_pop == 0)) {
|
|
928
|
-
kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
|
|
929
|
-
} else if ((self_pop == 0) && (other_pop > 0)) {
|
|
930
|
-
kll_helper::copy_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl]);
|
|
931
|
-
} else if ((self_pop > 0) && (other_pop > 0)) {
|
|
932
|
-
kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
|
|
933
|
-
}
|
|
934
|
-
}
|
|
935
|
-
}
|
|
936
|
-
|
|
937
|
-
// this leaves items_ uninitialized (all objects moved out and destroyed)
|
|
938
|
-
// this version moves objects from the incoming sketch
|
|
939
915
|
template<typename T, typename C, typename S, typename A>
|
|
940
|
-
|
|
916
|
+
template<typename FwdSk>
|
|
917
|
+
void kll_sketch<T, C, S, A>::populate_work_arrays(FwdSk&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
|
|
941
918
|
worklevels[0] = 0;
|
|
942
919
|
|
|
943
920
|
// the level zero data from "other" was already inserted into "this"
|
|
@@ -952,7 +929,9 @@ void kll_sketch<T, C, S, A>::populate_work_arrays(kll_sketch&& other, T* workbuf
|
|
|
952
929
|
if ((self_pop > 0) && (other_pop == 0)) {
|
|
953
930
|
kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
|
|
954
931
|
} else if ((self_pop == 0) && (other_pop > 0)) {
|
|
955
|
-
|
|
932
|
+
for (auto i = other.levels_[lvl], j = worklevels[lvl]; i < other.levels_[lvl] + other_pop; ++i, ++j) {
|
|
933
|
+
new (&workbuf[j]) T(conditional_forward<FwdSk>(other.items_[i]));
|
|
934
|
+
}
|
|
956
935
|
} else if ((self_pop > 0) && (other_pop > 0)) {
|
|
957
936
|
kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
|
|
958
937
|
}
|
|
@@ -1039,7 +1018,6 @@ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
|
|
|
1039
1018
|
os << " Sorted : " << (is_level_zero_sorted_ ? "true" : "false") << std::endl;
|
|
1040
1019
|
os << " Capacity items : " << items_size_ << std::endl;
|
|
1041
1020
|
os << " Retained items : " << get_num_retained() << std::endl;
|
|
1042
|
-
os << " Storage bytes : " << get_serialized_size_bytes() << std::endl;
|
|
1043
1021
|
if (!is_empty()) {
|
|
1044
1022
|
os << " Min value : " << *min_value_ << std::endl;
|
|
1045
1023
|
os << " Max value : " << *max_value_ << std::endl;
|