datasketches 0.2.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +1 -1
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +7 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
- metadata +25 -9
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -23,7 +23,9 @@
|
|
23
23
|
#include <iostream>
|
24
24
|
#include <iomanip>
|
25
25
|
#include <sstream>
|
26
|
+
#include <stdexcept>
|
26
27
|
|
28
|
+
#include "conditional_forward.hpp"
|
27
29
|
#include "memory_operations.hpp"
|
28
30
|
#include "kll_helper.hpp"
|
29
31
|
|
@@ -146,19 +148,12 @@ kll_sketch<T, C, S, A>::~kll_sketch() {
|
|
146
148
|
}
|
147
149
|
|
148
150
|
template<typename T, typename C, typename S, typename A>
|
149
|
-
|
151
|
+
template<typename FwdT>
|
152
|
+
void kll_sketch<T, C, S, A>::update(FwdT&& value) {
|
150
153
|
if (!check_update_value(value)) { return; }
|
151
154
|
update_min_max(value);
|
152
155
|
const uint32_t index = internal_update();
|
153
|
-
new (&items_[index]) T(value);
|
154
|
-
}
|
155
|
-
|
156
|
-
template<typename T, typename C, typename S, typename A>
|
157
|
-
void kll_sketch<T, C, S, A>::update(T&& value) {
|
158
|
-
if (!check_update_value(value)) { return; }
|
159
|
-
update_min_max(value);
|
160
|
-
const uint32_t index = internal_update();
|
161
|
-
new (&items_[index]) T(std::move(value));
|
156
|
+
new (&items_[index]) T(std::forward<FwdT>(value));
|
162
157
|
}
|
163
158
|
|
164
159
|
template<typename T, typename C, typename S, typename A>
|
@@ -181,22 +176,23 @@ uint32_t kll_sketch<T, C, S, A>::internal_update() {
|
|
181
176
|
}
|
182
177
|
|
183
178
|
template<typename T, typename C, typename S, typename A>
|
184
|
-
|
179
|
+
template<typename FwdSk>
|
180
|
+
void kll_sketch<T, C, S, A>::merge(FwdSk&& other) {
|
185
181
|
if (other.is_empty()) return;
|
186
182
|
if (m_ != other.m_) {
|
187
183
|
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
|
188
184
|
}
|
189
185
|
if (is_empty()) {
|
190
|
-
min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
|
191
|
-
max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
|
186
|
+
min_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.min_value_));
|
187
|
+
max_value_ = new (allocator_.allocate(1)) T(conditional_forward<FwdSk>(*other.max_value_));
|
192
188
|
} else {
|
193
|
-
if (C()(*other.min_value_, *min_value_)) *min_value_ = *other.min_value_;
|
194
|
-
if (C()(*max_value_, *other.max_value_)) *max_value_ = *other.max_value_;
|
189
|
+
if (C()(*other.min_value_, *min_value_)) *min_value_ = conditional_forward<FwdSk>(*other.min_value_);
|
190
|
+
if (C()(*max_value_, *other.max_value_)) *max_value_ = conditional_forward<FwdSk>(*other.max_value_);
|
195
191
|
}
|
196
192
|
const uint64_t final_n = n_ + other.n_;
|
197
193
|
for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
|
198
194
|
const uint32_t index = internal_update();
|
199
|
-
new (&items_[index]) T(other.items_[i]);
|
195
|
+
new (&items_[index]) T(conditional_forward<FwdSk>(other.items_[i]));
|
200
196
|
}
|
201
197
|
if (other.num_levels_ >= 2) merge_higher_levels(other, final_n);
|
202
198
|
n_ = final_n;
|
@@ -204,30 +200,6 @@ void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
|
|
204
200
|
assert_correct_total_weight();
|
205
201
|
}
|
206
202
|
|
207
|
-
template<typename T, typename C, typename S, typename A>
|
208
|
-
void kll_sketch<T, C, S, A>::merge(kll_sketch&& other) {
|
209
|
-
if (other.is_empty()) return;
|
210
|
-
if (m_ != other.m_) {
|
211
|
-
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
|
212
|
-
}
|
213
|
-
if (is_empty()) {
|
214
|
-
min_value_ = new (allocator_.allocate(1)) T(std::move(*other.min_value_));
|
215
|
-
max_value_ = new (allocator_.allocate(1)) T(std::move(*other.max_value_));
|
216
|
-
} else {
|
217
|
-
if (C()(*other.min_value_, *min_value_)) *min_value_ = std::move(*other.min_value_);
|
218
|
-
if (C()(*max_value_, *other.max_value_)) *max_value_ = std::move(*other.max_value_);
|
219
|
-
}
|
220
|
-
const uint64_t final_n = n_ + other.n_;
|
221
|
-
for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
|
222
|
-
const uint32_t index = internal_update();
|
223
|
-
new (&items_[index]) T(std::move(other.items_[i]));
|
224
|
-
}
|
225
|
-
if (other.num_levels_ >= 2) merge_higher_levels(std::forward<kll_sketch>(other), final_n);
|
226
|
-
n_ = final_n;
|
227
|
-
if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
|
228
|
-
assert_correct_total_weight();
|
229
|
-
}
|
230
|
-
|
231
203
|
template<typename T, typename C, typename S, typename A>
|
232
204
|
bool kll_sketch<T, C, S, A>::is_empty() const {
|
233
205
|
return n_ == 0;
|
@@ -266,43 +238,49 @@ T kll_sketch<T, C, S, A>::get_max_value() const {
|
|
266
238
|
}
|
267
239
|
|
268
240
|
template<typename T, typename C, typename S, typename A>
|
269
|
-
|
241
|
+
C kll_sketch<T, C, S, A>::get_comparator() const {
|
242
|
+
return C();
|
243
|
+
}
|
244
|
+
|
245
|
+
template<typename T, typename C, typename S, typename A>
|
246
|
+
template<bool inclusive>
|
247
|
+
auto kll_sketch<T, C, S, A>::get_quantile(double rank) const -> quantile_return_type {
|
270
248
|
if (is_empty()) return get_invalid_value();
|
271
|
-
if (
|
272
|
-
if (
|
273
|
-
if ((
|
249
|
+
if (rank == 0.0) return *min_value_;
|
250
|
+
if (rank == 1.0) return *max_value_;
|
251
|
+
if ((rank < 0.0) || (rank > 1.0)) {
|
274
252
|
throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
|
275
253
|
}
|
276
|
-
//
|
277
|
-
|
278
|
-
return quantile_calculator->get_quantile(fraction);
|
254
|
+
// may have a side effect of sorting level zero if needed
|
255
|
+
return get_sorted_view<inclusive>(true).get_quantile(rank);
|
279
256
|
}
|
280
257
|
|
281
258
|
template<typename T, typename C, typename S, typename A>
|
282
|
-
|
259
|
+
template<bool inclusive>
|
260
|
+
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* ranks, uint32_t size) const {
|
283
261
|
std::vector<T, A> quantiles(allocator_);
|
284
262
|
if (is_empty()) return quantiles;
|
285
|
-
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
|
286
263
|
quantiles.reserve(size);
|
264
|
+
|
265
|
+
// may have a side effect of sorting level zero if needed
|
266
|
+
auto view = get_sorted_view<inclusive>(true);
|
267
|
+
|
287
268
|
for (uint32_t i = 0; i < size; i++) {
|
288
|
-
const double
|
289
|
-
if ((
|
269
|
+
const double rank = ranks[i];
|
270
|
+
if ((rank < 0.0) || (rank > 1.0)) {
|
290
271
|
throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
|
291
272
|
}
|
292
|
-
if
|
293
|
-
else if (
|
273
|
+
else if (rank == 0.0) quantiles.push_back(*min_value_);
|
274
|
+
else if (rank == 1.0) quantiles.push_back(*max_value_);
|
294
275
|
else {
|
295
|
-
|
296
|
-
// has side effect of sorting level zero if needed
|
297
|
-
quantile_calculator = const_cast<kll_sketch*>(this)->get_quantile_calculator();
|
298
|
-
}
|
299
|
-
quantiles.push_back(quantile_calculator->get_quantile(fraction));
|
276
|
+
quantiles.push_back(view.get_quantile(rank));
|
300
277
|
}
|
301
278
|
}
|
302
279
|
return quantiles;
|
303
280
|
}
|
304
281
|
|
305
282
|
template<typename T, typename C, typename S, typename A>
|
283
|
+
template<bool inclusive>
|
306
284
|
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
|
307
285
|
if (is_empty()) return std::vector<T, A>(allocator_);
|
308
286
|
if (num == 0) {
|
@@ -316,10 +294,11 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
|
|
316
294
|
if (num > 1) {
|
317
295
|
fractions[num - 1] = 1.0;
|
318
296
|
}
|
319
|
-
return get_quantiles(fractions.data(), num);
|
297
|
+
return get_quantiles<inclusive>(fractions.data(), num);
|
320
298
|
}
|
321
299
|
|
322
300
|
template<typename T, typename C, typename S, typename A>
|
301
|
+
template<bool inclusive>
|
323
302
|
double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
|
324
303
|
if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
|
325
304
|
uint8_t level = 0;
|
@@ -329,7 +308,7 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
|
|
329
308
|
const auto from_index(levels_[level]);
|
330
309
|
const auto to_index(levels_[level + 1]); // exclusive
|
331
310
|
for (uint32_t i = from_index; i < to_index; i++) {
|
332
|
-
if (C()(items_[i], value)) {
|
311
|
+
if (inclusive ? !C()(value, items_[i]) : C()(items_[i], value)) {
|
333
312
|
total += weight;
|
334
313
|
} else if ((level > 0) || is_level_zero_sorted_) {
|
335
314
|
break; // levels above 0 are sorted, no point comparing further
|
@@ -342,13 +321,15 @@ double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
|
|
342
321
|
}
|
343
322
|
|
344
323
|
template<typename T, typename C, typename S, typename A>
|
324
|
+
template<bool inclusive>
|
345
325
|
vector_d<A> kll_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const {
|
346
|
-
return get_PMF_or_CDF(split_points, size, false);
|
326
|
+
return get_PMF_or_CDF<inclusive>(split_points, size, false);
|
347
327
|
}
|
348
328
|
|
349
329
|
template<typename T, typename C, typename S, typename A>
|
330
|
+
template<bool inclusive>
|
350
331
|
vector_d<A> kll_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const {
|
351
|
-
return get_PMF_or_CDF(split_points, size, true);
|
332
|
+
return get_PMF_or_CDF<inclusive>(split_points, size, true);
|
352
333
|
}
|
353
334
|
|
354
335
|
template<typename T, typename C, typename S, typename A>
|
@@ -358,8 +339,8 @@ double kll_sketch<T, C, S, A>::get_normalized_rank_error(bool pmf) const {
|
|
358
339
|
|
359
340
|
// implementation for fixed-size arithmetic types (integral and floating point)
|
360
341
|
template<typename T, typename C, typename S, typename A>
|
361
|
-
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
362
|
-
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
342
|
+
template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
343
|
+
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe&) const {
|
363
344
|
if (is_empty()) { return EMPTY_SIZE_BYTES; }
|
364
345
|
if (num_levels_ == 1 && get_num_retained() == 1) {
|
365
346
|
return DATA_START_SINGLE_ITEM + sizeof(TT);
|
@@ -370,17 +351,17 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
|
370
351
|
|
371
352
|
// implementation for all other types
|
372
353
|
template<typename T, typename C, typename S, typename A>
|
373
|
-
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
374
|
-
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
354
|
+
template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
355
|
+
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes(const SerDe& sd) const {
|
375
356
|
if (is_empty()) { return EMPTY_SIZE_BYTES; }
|
376
357
|
if (num_levels_ == 1 && get_num_retained() == 1) {
|
377
|
-
return DATA_START_SINGLE_ITEM +
|
358
|
+
return DATA_START_SINGLE_ITEM + sd.size_of_item(items_[levels_[0]]);
|
378
359
|
}
|
379
360
|
// the last integer in the levels_ array is not serialized because it can be derived
|
380
361
|
size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
|
381
|
-
size +=
|
382
|
-
size +=
|
383
|
-
for (auto it: *this) size +=
|
362
|
+
size += sd.size_of_item(*min_value_);
|
363
|
+
size += sd.size_of_item(*max_value_);
|
364
|
+
for (auto it: *this) size += sd.size_of_item(it.first);
|
384
365
|
return size;
|
385
366
|
}
|
386
367
|
|
@@ -405,7 +386,8 @@ size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_
|
|
405
386
|
}
|
406
387
|
|
407
388
|
template<typename T, typename C, typename S, typename A>
|
408
|
-
|
389
|
+
template<typename SerDe>
|
390
|
+
void kll_sketch<T, C, S, A>::serialize(std::ostream& os, const SerDe& sd) const {
|
409
391
|
const bool is_single_item = n_ == 1;
|
410
392
|
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
|
411
393
|
write(os, preamble_ints);
|
@@ -430,16 +412,17 @@ void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
|
|
430
412
|
write(os, num_levels_);
|
431
413
|
write(os, unused);
|
432
414
|
write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
|
433
|
-
|
434
|
-
|
415
|
+
sd.serialize(os, min_value_, 1);
|
416
|
+
sd.serialize(os, max_value_, 1);
|
435
417
|
}
|
436
|
-
|
418
|
+
sd.serialize(os, &items_[levels_[0]], get_num_retained());
|
437
419
|
}
|
438
420
|
|
439
421
|
template<typename T, typename C, typename S, typename A>
|
440
|
-
|
422
|
+
template<typename SerDe>
|
423
|
+
vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
|
441
424
|
const bool is_single_item = n_ == 1;
|
442
|
-
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
425
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
|
443
426
|
vector_u8<A> bytes(size, 0, allocator_);
|
444
427
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
445
428
|
const uint8_t* end_ptr = ptr + size;
|
@@ -465,11 +448,11 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
|
|
465
448
|
ptr += copy_to_mem(num_levels_, ptr);
|
466
449
|
ptr += sizeof(uint8_t); // unused
|
467
450
|
ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
|
468
|
-
ptr +=
|
469
|
-
ptr +=
|
451
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, min_value_, 1);
|
452
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, max_value_, 1);
|
470
453
|
}
|
471
454
|
const size_t bytes_remaining = end_ptr - ptr;
|
472
|
-
ptr +=
|
455
|
+
ptr += sd.serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
|
473
456
|
}
|
474
457
|
const size_t delta = ptr - bytes.data();
|
475
458
|
if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
|
@@ -478,6 +461,12 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
|
|
478
461
|
|
479
462
|
template<typename T, typename C, typename S, typename A>
|
480
463
|
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
|
464
|
+
return deserialize(is, S(), allocator);
|
465
|
+
}
|
466
|
+
|
467
|
+
template<typename T, typename C, typename S, typename A>
|
468
|
+
template<typename SerDe>
|
469
|
+
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
|
481
470
|
const auto preamble_ints = read<uint8_t>(is);
|
482
471
|
const auto serial_version = read<uint8_t>(is);
|
483
472
|
const auto family_id = read<uint8_t>(is);
|
@@ -525,17 +514,17 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
|
|
525
514
|
std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
|
526
515
|
std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
|
527
516
|
if (!is_single_item) {
|
528
|
-
|
517
|
+
sd.deserialize(is, min_value_buffer.get(), 1);
|
529
518
|
// serde call did not throw, repackage with destructor
|
530
519
|
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
|
531
|
-
|
520
|
+
sd.deserialize(is, max_value_buffer.get(), 1);
|
532
521
|
// serde call did not throw, repackage with destructor
|
533
522
|
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
|
534
523
|
}
|
535
524
|
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
|
536
525
|
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
|
537
526
|
const auto num_items = levels[num_levels] - levels[0];
|
538
|
-
|
527
|
+
sd.deserialize(is, &items_buffer.get()[levels[0]], num_items);
|
539
528
|
// serde call did not throw, repackage with destructors
|
540
529
|
std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
|
541
530
|
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
|
@@ -555,6 +544,12 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
|
|
555
544
|
|
556
545
|
template<typename T, typename C, typename S, typename A>
|
557
546
|
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
547
|
+
return deserialize(bytes, size, S(), allocator);
|
548
|
+
}
|
549
|
+
|
550
|
+
template<typename T, typename C, typename S, typename A>
|
551
|
+
template<typename SerDe>
|
552
|
+
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
|
558
553
|
ensure_minimum_memory(size, 8);
|
559
554
|
const char* ptr = static_cast<const char*>(bytes);
|
560
555
|
uint8_t preamble_ints;
|
@@ -611,17 +606,17 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
|
|
611
606
|
std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
|
612
607
|
std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
|
613
608
|
if (!is_single_item) {
|
614
|
-
ptr +=
|
609
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
|
615
610
|
// serde call did not throw, repackage with destructor
|
616
611
|
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
|
617
|
-
ptr +=
|
612
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
|
618
613
|
// serde call did not throw, repackage with destructor
|
619
614
|
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
|
620
615
|
}
|
621
616
|
auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
|
622
617
|
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
|
623
618
|
const auto num_items = levels[num_levels] - levels[0];
|
624
|
-
ptr +=
|
619
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
|
625
620
|
// serde call did not throw, repackage with destructors
|
626
621
|
std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
|
627
622
|
const size_t delta = ptr - static_cast<const char*>(bytes);
|
@@ -786,18 +781,23 @@ void kll_sketch<T, C, S, A>::sort_level_zero() {
|
|
786
781
|
}
|
787
782
|
|
788
783
|
template<typename T, typename C, typename S, typename A>
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
784
|
+
template<bool inclusive>
|
785
|
+
quantile_sketch_sorted_view<T, C, A> kll_sketch<T, C, S, A>::get_sorted_view(bool cumulative) const {
|
786
|
+
const_cast<kll_sketch*>(this)->sort_level_zero(); // allow this side effect
|
787
|
+
quantile_sketch_sorted_view<T, C, A> view(get_num_retained(), allocator_);
|
788
|
+
uint8_t level = 0;
|
789
|
+
while (level < num_levels_) {
|
790
|
+
const auto from = items_ + levels_[level];
|
791
|
+
const auto to = items_ + levels_[level + 1]; // exclusive
|
792
|
+
view.add(from, to, 1 << level);
|
793
|
+
++level;
|
794
|
+
}
|
795
|
+
if (cumulative) view.template convert_to_cummulative<inclusive>();
|
796
|
+
return view;
|
798
797
|
}
|
799
798
|
|
800
799
|
template<typename T, typename C, typename S, typename A>
|
800
|
+
template<bool inclusive>
|
801
801
|
vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
|
802
802
|
if (is_empty()) return vector_d<A>(allocator_);
|
803
803
|
kll_helper::validate_values<T, C>(split_points, size);
|
@@ -808,9 +808,9 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
|
|
808
808
|
const auto from_index = levels_[level];
|
809
809
|
const auto to_index = levels_[level + 1]; // exclusive
|
810
810
|
if ((level == 0) && !is_level_zero_sorted_) {
|
811
|
-
increment_buckets_unsorted_level(from_index, to_index, weight, split_points, size, buckets.data());
|
811
|
+
increment_buckets_unsorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
|
812
812
|
} else {
|
813
|
-
increment_buckets_sorted_level(from_index, to_index, weight, split_points, size, buckets.data());
|
813
|
+
increment_buckets_sorted_level<inclusive>(from_index, to_index, weight, split_points, size, buckets.data());
|
814
814
|
}
|
815
815
|
level++;
|
816
816
|
weight *= 2;
|
@@ -831,13 +831,14 @@ vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32
|
|
831
831
|
}
|
832
832
|
|
833
833
|
template<typename T, typename C, typename S, typename A>
|
834
|
+
template<bool inclusive>
|
834
835
|
void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
|
835
836
|
const T* split_points, uint32_t size, double* buckets) const
|
836
837
|
{
|
837
838
|
for (uint32_t i = from_index; i < to_index; i++) {
|
838
839
|
uint32_t j;
|
839
840
|
for (j = 0; j < size; j++) {
|
840
|
-
if (C()(items_[i], split_points[j])) {
|
841
|
+
if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
|
841
842
|
break;
|
842
843
|
}
|
843
844
|
}
|
@@ -846,13 +847,14 @@ void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_inde
|
|
846
847
|
}
|
847
848
|
|
848
849
|
template<typename T, typename C, typename S, typename A>
|
850
|
+
template<bool inclusive>
|
849
851
|
void kll_sketch<T, C, S, A>::increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
|
850
852
|
const T* split_points, uint32_t size, double* buckets) const
|
851
853
|
{
|
852
854
|
uint32_t i = from_index;
|
853
855
|
uint32_t j = 0;
|
854
856
|
while ((i < to_index) && (j < size)) {
|
855
|
-
if (C()(items_[i], split_points[j])) {
|
857
|
+
if (inclusive ? !C()(split_points[j], items_[i]) : C()(items_[i], split_points[j])) {
|
856
858
|
buckets[j] += weight; // this sample goes into this bucket
|
857
859
|
i++; // move on to next sample and see whether it also goes into this bucket
|
858
860
|
} else {
|
@@ -910,34 +912,9 @@ void kll_sketch<T, C, S, A>::merge_higher_levels(O&& other, uint64_t final_n) {
|
|
910
912
|
}
|
911
913
|
|
912
914
|
// this leaves items_ uninitialized (all objects moved out and destroyed)
|
913
|
-
// this version copies objects from the incoming sketch
|
914
|
-
template<typename T, typename C, typename S, typename A>
|
915
|
-
void kll_sketch<T, C, S, A>::populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
|
916
|
-
worklevels[0] = 0;
|
917
|
-
|
918
|
-
// the level zero data from "other" was already inserted into "this"
|
919
|
-
kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
|
920
|
-
worklevels[1] = safe_level_size(0);
|
921
|
-
|
922
|
-
for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
|
923
|
-
const uint32_t self_pop = safe_level_size(lvl);
|
924
|
-
const uint32_t other_pop = other.safe_level_size(lvl);
|
925
|
-
worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
|
926
|
-
|
927
|
-
if ((self_pop > 0) && (other_pop == 0)) {
|
928
|
-
kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
|
929
|
-
} else if ((self_pop == 0) && (other_pop > 0)) {
|
930
|
-
kll_helper::copy_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl]);
|
931
|
-
} else if ((self_pop > 0) && (other_pop > 0)) {
|
932
|
-
kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
|
933
|
-
}
|
934
|
-
}
|
935
|
-
}
|
936
|
-
|
937
|
-
// this leaves items_ uninitialized (all objects moved out and destroyed)
|
938
|
-
// this version moves objects from the incoming sketch
|
939
915
|
template<typename T, typename C, typename S, typename A>
|
940
|
-
|
916
|
+
template<typename FwdSk>
|
917
|
+
void kll_sketch<T, C, S, A>::populate_work_arrays(FwdSk&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
|
941
918
|
worklevels[0] = 0;
|
942
919
|
|
943
920
|
// the level zero data from "other" was already inserted into "this"
|
@@ -952,7 +929,9 @@ void kll_sketch<T, C, S, A>::populate_work_arrays(kll_sketch&& other, T* workbuf
|
|
952
929
|
if ((self_pop > 0) && (other_pop == 0)) {
|
953
930
|
kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
|
954
931
|
} else if ((self_pop == 0) && (other_pop > 0)) {
|
955
|
-
|
932
|
+
for (auto i = other.levels_[lvl], j = worklevels[lvl]; i < other.levels_[lvl] + other_pop; ++i, ++j) {
|
933
|
+
new (&workbuf[j]) T(conditional_forward<FwdSk>(other.items_[i]));
|
934
|
+
}
|
956
935
|
} else if ((self_pop > 0) && (other_pop > 0)) {
|
957
936
|
kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
|
958
937
|
}
|
@@ -1039,7 +1018,6 @@ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
|
|
1039
1018
|
os << " Sorted : " << (is_level_zero_sorted_ ? "true" : "false") << std::endl;
|
1040
1019
|
os << " Capacity items : " << items_size_ << std::endl;
|
1041
1020
|
os << " Retained items : " << get_num_retained() << std::endl;
|
1042
|
-
os << " Storage bytes : " << get_serialized_size_bytes() << std::endl;
|
1043
1021
|
if (!is_empty()) {
|
1044
1022
|
os << " Min value : " << *min_value_ << std::endl;
|
1045
1023
|
os << " Max value : " << *max_value_ << std::endl;
|