datasketches 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +1 -1
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +7 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
- metadata +25 -9
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
|
@@ -25,6 +25,7 @@
|
|
|
25
25
|
#include <cmath>
|
|
26
26
|
#include <random>
|
|
27
27
|
#include <algorithm>
|
|
28
|
+
#include <stdexcept>
|
|
28
29
|
|
|
29
30
|
#include "var_opt_sketch.hpp"
|
|
30
31
|
#include "serde.hpp"
|
|
@@ -311,8 +312,8 @@ var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(var_opt_sketch&& other)
|
|
|
311
312
|
|
|
312
313
|
// implementation for fixed-size arithmetic types (integral and floating point)
|
|
313
314
|
template<typename T, typename S, typename A>
|
|
314
|
-
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
|
315
|
-
size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
|
|
315
|
+
template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
|
316
|
+
size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes(const SerDe&) const {
|
|
316
317
|
if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
|
|
317
318
|
size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
|
|
318
319
|
num_bytes += h_ * sizeof(double); // weights
|
|
@@ -325,8 +326,8 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
|
|
|
325
326
|
|
|
326
327
|
// implementation for all other types
|
|
327
328
|
template<typename T, typename S, typename A>
|
|
328
|
-
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
|
329
|
-
size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
|
|
329
|
+
template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
|
330
|
+
size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes(const SerDe& sd) const {
|
|
330
331
|
if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
|
|
331
332
|
size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
|
|
332
333
|
num_bytes += h_ * sizeof(double); // weights
|
|
@@ -335,13 +336,14 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
|
|
|
335
336
|
}
|
|
336
337
|
// must iterate over the items
|
|
337
338
|
for (auto it: *this)
|
|
338
|
-
num_bytes +=
|
|
339
|
+
num_bytes += sd.size_of_item(it.first);
|
|
339
340
|
return num_bytes;
|
|
340
341
|
}
|
|
341
342
|
|
|
342
343
|
template<typename T, typename S, typename A>
|
|
343
|
-
|
|
344
|
-
|
|
344
|
+
template<typename SerDe>
|
|
345
|
+
std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
|
|
346
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
|
|
345
347
|
std::vector<uint8_t, AllocU8<A>> bytes(size, 0, allocator_);
|
|
346
348
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
347
349
|
uint8_t* end_ptr = ptr + size;
|
|
@@ -400,8 +402,8 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
|
|
|
400
402
|
}
|
|
401
403
|
|
|
402
404
|
// write the sample items, skipping the gap. Either h_ or r_ may be 0
|
|
403
|
-
ptr +=
|
|
404
|
-
ptr +=
|
|
405
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, data_, h_);
|
|
406
|
+
ptr += sd.serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
|
|
405
407
|
}
|
|
406
408
|
|
|
407
409
|
size_t bytes_written = ptr - bytes.data();
|
|
@@ -413,7 +415,8 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
|
|
|
413
415
|
}
|
|
414
416
|
|
|
415
417
|
template<typename T, typename S, typename A>
|
|
416
|
-
|
|
418
|
+
template<typename SerDe>
|
|
419
|
+
void var_opt_sketch<T,S,A>::serialize(std::ostream& os, const SerDe& sd) const {
|
|
417
420
|
const bool empty = (h_ == 0) && (r_ == 0);
|
|
418
421
|
|
|
419
422
|
const uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
|
|
@@ -469,13 +472,19 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
|
|
|
469
472
|
}
|
|
470
473
|
|
|
471
474
|
// write the sample items, skipping the gap. Either h_ or r_ may be 0
|
|
472
|
-
|
|
473
|
-
|
|
475
|
+
sd.serialize(os, data_, h_);
|
|
476
|
+
sd.serialize(os, &data_[h_ + 1], r_);
|
|
474
477
|
}
|
|
475
478
|
}
|
|
476
479
|
|
|
477
480
|
template<typename T, typename S, typename A>
|
|
478
481
|
var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
|
482
|
+
return deserialize(bytes, size, S(), allocator);
|
|
483
|
+
}
|
|
484
|
+
|
|
485
|
+
template<typename T, typename S, typename A>
|
|
486
|
+
template<typename SerDe>
|
|
487
|
+
var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
|
|
479
488
|
ensure_minimum_memory(size, 8);
|
|
480
489
|
const char* ptr = static_cast<const char*>(bytes);
|
|
481
490
|
const char* base = ptr;
|
|
@@ -559,10 +568,10 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
|
559
568
|
items_deleter deleter(array_size, allocator);
|
|
560
569
|
std::unique_ptr<T, items_deleter> items(A(allocator).allocate(array_size), deleter);
|
|
561
570
|
|
|
562
|
-
ptr +=
|
|
571
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, items.get(), h);
|
|
563
572
|
items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
|
|
564
573
|
|
|
565
|
-
ptr +=
|
|
574
|
+
ptr += sd.deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
|
|
566
575
|
items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
|
|
567
576
|
|
|
568
577
|
return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
|
|
@@ -571,6 +580,12 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
|
|
|
571
580
|
|
|
572
581
|
template<typename T, typename S, typename A>
|
|
573
582
|
var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
|
583
|
+
return deserialize(is, S(), allocator);
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
template<typename T, typename S, typename A>
|
|
587
|
+
template<typename SerDe>
|
|
588
|
+
var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
|
|
574
589
|
const auto first_byte = read<uint8_t>(is);
|
|
575
590
|
uint8_t preamble_longs = first_byte & 0x3f;
|
|
576
591
|
const resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
|
|
@@ -640,10 +655,10 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
|
|
|
640
655
|
items_deleter deleter(array_size, allocator);
|
|
641
656
|
std::unique_ptr<T, items_deleter> items(A(allocator).allocate(array_size), deleter);
|
|
642
657
|
|
|
643
|
-
|
|
658
|
+
sd.deserialize(is, items.get(), h); // aka &data_[0]
|
|
644
659
|
items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
|
|
645
660
|
|
|
646
|
-
|
|
661
|
+
sd.deserialize(is, &(items.get()[h + 1]), r);
|
|
647
662
|
items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
|
|
648
663
|
|
|
649
664
|
if (!is.good())
|
|
@@ -1683,16 +1698,6 @@ bool var_opt_sketch<T, S, A>::iterator::get_mark() const {
|
|
|
1683
1698
|
return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
|
|
1684
1699
|
}
|
|
1685
1700
|
|
|
1686
|
-
|
|
1687
|
-
|
|
1688
|
-
// ******************** MOVE TO COMMON UTILS AREA EVENTUALLY *********************
|
|
1689
|
-
|
|
1690
|
-
namespace random_utils {
|
|
1691
|
-
static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
|
|
1692
|
-
static std::mt19937_64 rand(rd());
|
|
1693
|
-
static std::uniform_real_distribution<> next_double(0.0, 1.0);
|
|
1694
|
-
}
|
|
1695
|
-
|
|
1696
1701
|
/**
|
|
1697
1702
|
* Checks if target sampling allocation is more than 50% of max sampling size.
|
|
1698
1703
|
* If so, returns max sampling size, otherwise passes through target size.
|
|
@@ -45,7 +45,11 @@ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template
|
|
|
45
45
|
* author Kevin Lang
|
|
46
46
|
* author Jon Malkin
|
|
47
47
|
*/
|
|
48
|
-
template
|
|
48
|
+
template<
|
|
49
|
+
typename T,
|
|
50
|
+
typename S = serde<T>, // deprecated, to be removed in the next major version
|
|
51
|
+
typename A = std::allocator<T>
|
|
52
|
+
>
|
|
49
53
|
class var_opt_union {
|
|
50
54
|
|
|
51
55
|
public:
|
|
@@ -88,14 +92,16 @@ public:
|
|
|
88
92
|
/**
|
|
89
93
|
* Computes size needed to serialize the current state of the union.
|
|
90
94
|
* This version is for all other types and can be expensive since every item needs to be looked at.
|
|
95
|
+
* @param instance of a SerDe
|
|
91
96
|
* @return size in bytes needed to serialize this sketch
|
|
92
97
|
*/
|
|
93
|
-
|
|
94
|
-
|
|
98
|
+
template<typename SerDe = S>
|
|
99
|
+
size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
|
|
100
|
+
|
|
95
101
|
// This is a convenience alias for users
|
|
96
102
|
// The type returned by the following serialize method
|
|
97
103
|
typedef vector_u8<A> vector_bytes;
|
|
98
|
-
|
|
104
|
+
|
|
99
105
|
/**
|
|
100
106
|
* NOTE: This method may be deprecated in a future version.
|
|
101
107
|
* This method serializes the sketch as a vector of bytes.
|
|
@@ -103,33 +109,62 @@ public:
|
|
|
103
109
|
* It is a blank space of a given size.
|
|
104
110
|
* This header is used in Datasketches PostgreSQL extension.
|
|
105
111
|
* @param header_size_bytes space to reserve in front of the sketch
|
|
112
|
+
* @param instance of a SerDe
|
|
106
113
|
*/
|
|
107
|
-
|
|
114
|
+
template<typename SerDe = S>
|
|
115
|
+
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
|
108
116
|
|
|
109
117
|
/**
|
|
110
118
|
* NOTE: This method may be deprecated in a future version.
|
|
111
119
|
* This method serializes the sketch into a given stream in a binary form
|
|
112
120
|
* @param os output stream
|
|
121
|
+
* @param instance of a SerDe
|
|
113
122
|
*/
|
|
114
|
-
|
|
123
|
+
template<typename SerDe = S>
|
|
124
|
+
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
|
115
125
|
|
|
116
126
|
/**
|
|
117
127
|
* NOTE: This method may be deprecated in a future version.
|
|
118
128
|
* This method deserializes a union from a given stream.
|
|
119
129
|
* @param is input stream
|
|
130
|
+
* @param instance of an Allocator
|
|
120
131
|
* @return an instance of a union
|
|
121
132
|
*/
|
|
122
133
|
static var_opt_union deserialize(std::istream& is, const A& allocator = A());
|
|
123
134
|
|
|
135
|
+
/**
|
|
136
|
+
* NOTE: This method may be deprecated in a future version.
|
|
137
|
+
* This method deserializes a union from a given stream.
|
|
138
|
+
* @param is input stream
|
|
139
|
+
* @param instance of a SerDe
|
|
140
|
+
* @param instance of an Allocator
|
|
141
|
+
* @return an instance of a union
|
|
142
|
+
*/
|
|
143
|
+
template<typename SerDe = S>
|
|
144
|
+
static var_opt_union deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
|
|
145
|
+
|
|
124
146
|
/**
|
|
125
147
|
* NOTE: This method may be deprecated in a future version.
|
|
126
148
|
* This method deserializes a union from a given array of bytes.
|
|
127
149
|
* @param bytes pointer to the array of bytes
|
|
128
150
|
* @param size the size of the array
|
|
151
|
+
* @param instance of an Allocator
|
|
129
152
|
* @return an instance of a union
|
|
130
153
|
*/
|
|
131
154
|
static var_opt_union deserialize(const void* bytes, size_t size, const A& allocator = A());
|
|
132
155
|
|
|
156
|
+
/**
|
|
157
|
+
* NOTE: This method may be deprecated in a future version.
|
|
158
|
+
* This method deserializes a union from a given array of bytes.
|
|
159
|
+
* @param bytes pointer to the array of bytes
|
|
160
|
+
* @param size the size of the array
|
|
161
|
+
* @param instance of a SerDe
|
|
162
|
+
* @param instance of an Allocator
|
|
163
|
+
* @return an instance of a union
|
|
164
|
+
*/
|
|
165
|
+
template<typename SerDe = S>
|
|
166
|
+
static var_opt_union deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
|
|
167
|
+
|
|
133
168
|
/**
|
|
134
169
|
* Prints a summary of the union as a string.
|
|
135
170
|
* @return the summary as a string
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
|
|
25
25
|
#include <cmath>
|
|
26
26
|
#include <sstream>
|
|
27
|
+
#include <stdexcept>
|
|
27
28
|
|
|
28
29
|
namespace datasketches {
|
|
29
30
|
|
|
@@ -129,6 +130,12 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
|
|
|
129
130
|
|
|
130
131
|
template<typename T, typename S, typename A>
|
|
131
132
|
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
|
|
133
|
+
return deserialize(is, S(), allocator);
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
template<typename T, typename S, typename A>
|
|
137
|
+
template<typename SerDe>
|
|
138
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
|
|
132
139
|
const auto preamble_longs = read<uint8_t>(is);
|
|
133
140
|
const auto serial_version = read<uint8_t>(is);
|
|
134
141
|
const auto family_id = read<uint8_t>(is);
|
|
@@ -155,7 +162,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
|
|
|
155
162
|
const auto outer_tau_numer = read<double>(is);
|
|
156
163
|
const auto outer_tau_denom = read<uint64_t>(is);
|
|
157
164
|
|
|
158
|
-
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
|
|
165
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, sd, allocator);
|
|
159
166
|
|
|
160
167
|
if (!is.good())
|
|
161
168
|
throw std::runtime_error("error reading from std::istream");
|
|
@@ -165,6 +172,12 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
|
|
|
165
172
|
|
|
166
173
|
template<typename T, typename S, typename A>
|
|
167
174
|
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
|
|
175
|
+
return deserialize(bytes, size, S(), allocator);
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
template<typename T, typename S, typename A>
|
|
179
|
+
template<typename SerDe>
|
|
180
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
|
|
168
181
|
ensure_minimum_memory(size, 8);
|
|
169
182
|
const char* ptr = static_cast<const char*>(bytes);
|
|
170
183
|
uint8_t preamble_longs;
|
|
@@ -199,22 +212,24 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
|
|
|
199
212
|
ptr += copy_from_mem(ptr, outer_tau_denom);
|
|
200
213
|
|
|
201
214
|
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
|
202
|
-
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
|
|
215
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, sd, allocator);
|
|
203
216
|
|
|
204
217
|
return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
|
205
218
|
}
|
|
206
219
|
|
|
207
220
|
template<typename T, typename S, typename A>
|
|
208
|
-
|
|
221
|
+
template<typename SerDe>
|
|
222
|
+
size_t var_opt_union<T,S,A>::get_serialized_size_bytes(const SerDe& sd) const {
|
|
209
223
|
if (n_ == 0) {
|
|
210
224
|
return PREAMBLE_LONGS_EMPTY << 3;
|
|
211
225
|
} else {
|
|
212
|
-
return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes();
|
|
226
|
+
return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes(sd);
|
|
213
227
|
}
|
|
214
228
|
}
|
|
215
229
|
|
|
216
230
|
template<typename T, typename S, typename A>
|
|
217
|
-
|
|
231
|
+
template<typename SerDe>
|
|
232
|
+
void var_opt_union<T,S,A>::serialize(std::ostream& os, const SerDe& sd) const {
|
|
218
233
|
bool empty = (n_ == 0);
|
|
219
234
|
|
|
220
235
|
const uint8_t serialization_version(SER_VER);
|
|
@@ -240,13 +255,14 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
|
|
|
240
255
|
write(os, n_);
|
|
241
256
|
write(os, outer_tau_numer_);
|
|
242
257
|
write(os, outer_tau_denom_);
|
|
243
|
-
gadget_.serialize(os);
|
|
258
|
+
gadget_.serialize(os, sd);
|
|
244
259
|
}
|
|
245
260
|
}
|
|
246
261
|
|
|
247
262
|
template<typename T, typename S, typename A>
|
|
248
|
-
|
|
249
|
-
|
|
263
|
+
template<typename SerDe>
|
|
264
|
+
std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
|
|
265
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
|
|
250
266
|
std::vector<uint8_t, AllocU8<A>> bytes(size, 0, gadget_.allocator_);
|
|
251
267
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
252
268
|
|
|
@@ -278,7 +294,7 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
|
|
|
278
294
|
ptr += copy_to_mem(outer_tau_numer_, ptr);
|
|
279
295
|
ptr += copy_to_mem(outer_tau_denom_, ptr);
|
|
280
296
|
|
|
281
|
-
auto gadget_bytes = gadget_.serialize();
|
|
297
|
+
auto gadget_bytes = gadget_.serialize(0, sd);
|
|
282
298
|
ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
|
|
283
299
|
}
|
|
284
300
|
|
|
@@ -39,7 +39,7 @@ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
|
|
|
39
39
|
var_opt_test_sketch sk1(10, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
|
|
40
40
|
for (int i = 0; i < 100; ++i) sk1.update(i);
|
|
41
41
|
auto bytes1 = sk1.serialize();
|
|
42
|
-
auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), 0);
|
|
42
|
+
auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), test_type_serde(), 0);
|
|
43
43
|
|
|
44
44
|
std::stringstream ss;
|
|
45
45
|
sk1.serialize(ss);
|
|
@@ -51,7 +51,7 @@ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
|
|
|
51
51
|
u1.update(sk3);
|
|
52
52
|
|
|
53
53
|
auto bytes2 = u1.serialize();
|
|
54
|
-
auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), 0);
|
|
54
|
+
auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), test_type_serde(), 0);
|
|
55
55
|
}
|
|
56
56
|
REQUIRE(test_allocator_total_bytes == 0);
|
|
57
57
|
REQUIRE(test_allocator_net_allocations == 0);
|
|
@@ -22,12 +22,13 @@
|
|
|
22
22
|
|
|
23
23
|
#include <iostream>
|
|
24
24
|
#include <iomanip>
|
|
25
|
+
#include <stdexcept>
|
|
25
26
|
|
|
26
27
|
namespace datasketches {
|
|
27
28
|
|
|
28
29
|
template<bool dummy>
|
|
29
30
|
auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
|
|
30
|
-
if (size < 8) throw std::
|
|
31
|
+
if (size < 8) throw std::out_of_range("at least 8 bytes expected, actual " + std::to_string(size)
|
|
31
32
|
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
32
33
|
|
|
33
34
|
uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
|
|
@@ -43,10 +44,11 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
|
|
|
43
44
|
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
44
45
|
const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
|
|
45
46
|
if (has_theta) {
|
|
46
|
-
if (size < 16) throw std::
|
|
47
|
+
if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
|
|
47
48
|
theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
48
49
|
}
|
|
49
50
|
if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
|
|
51
|
+
if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
|
|
50
52
|
return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
|
|
51
53
|
}
|
|
52
54
|
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
@@ -54,7 +56,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
|
|
|
54
56
|
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
|
|
55
57
|
const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
|
|
56
58
|
if (size < expected_size_bytes) {
|
|
57
|
-
throw std::
|
|
59
|
+
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
58
60
|
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
59
61
|
}
|
|
60
62
|
const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
|
|
@@ -72,7 +74,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
|
|
|
72
74
|
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
|
73
75
|
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
|
74
76
|
if (size < expected_size_bytes) {
|
|
75
|
-
throw std::
|
|
77
|
+
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
76
78
|
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
77
79
|
}
|
|
78
80
|
return {false, true, seed_hash, num_entries, theta, entries};
|
|
@@ -91,7 +93,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
|
|
|
91
93
|
} else {
|
|
92
94
|
const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
|
|
93
95
|
if (size < expected_size_bytes) {
|
|
94
|
-
throw std::
|
|
96
|
+
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
95
97
|
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
96
98
|
}
|
|
97
99
|
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
|
|
@@ -107,7 +109,7 @@ auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uin
|
|
|
107
109
|
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
|
108
110
|
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
|
109
111
|
if (size < expected_size_bytes) {
|
|
110
|
-
throw std::
|
|
112
|
+
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
111
113
|
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
112
114
|
}
|
|
113
115
|
return {false, true, seed_hash, num_entries, theta, entries};
|
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
|
|
23
23
|
#include <sstream>
|
|
24
24
|
#include <vector>
|
|
25
|
+
#include <stdexcept>
|
|
25
26
|
|
|
26
27
|
#include "serde.hpp"
|
|
27
28
|
#include "binomial_bounds.hpp"
|
|
@@ -453,7 +454,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
|
|
|
453
454
|
const auto num_entries = read<uint32_t>(is);
|
|
454
455
|
read<uint32_t>(is); //unused
|
|
455
456
|
const auto theta = read<uint64_t>(is);
|
|
456
|
-
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
|
457
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
|
457
458
|
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
458
459
|
if (!is_empty)
|
|
459
460
|
read(is, entries.data(), sizeof(uint64_t) * entries.size());
|
|
@@ -470,12 +471,12 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
|
|
|
470
471
|
if (preamble_longs == 1) {
|
|
471
472
|
if (!is.good())
|
|
472
473
|
throw std::runtime_error("error reading from std::istream");
|
|
473
|
-
std::vector<uint64_t> entries(0, 0, allocator);
|
|
474
|
+
std::vector<uint64_t, A> entries(0, 0, allocator);
|
|
474
475
|
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
|
475
476
|
} else if (preamble_longs == 2) {
|
|
476
477
|
const uint32_t num_entries = read<uint32_t>(is);
|
|
477
478
|
read<uint32_t>(is); // unused
|
|
478
|
-
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
|
479
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
|
479
480
|
if (num_entries == 0) {
|
|
480
481
|
return compact_theta_sketch_alloc(true, true, seed_hash, theta_constants::MAX_THETA, std::move(entries));
|
|
481
482
|
}
|
|
@@ -488,7 +489,7 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
|
|
|
488
489
|
read<uint32_t>(is); // unused
|
|
489
490
|
const auto theta = read<uint64_t>(is);
|
|
490
491
|
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
491
|
-
std::vector<uint64_t> entries(num_entries, 0, allocator);
|
|
492
|
+
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
|
492
493
|
if (is_empty) {
|
|
493
494
|
if (!is.good())
|
|
494
495
|
throw std::runtime_error("error reading from std::istream");
|
|
@@ -514,47 +515,8 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::is
|
|
|
514
515
|
|
|
515
516
|
template<typename A>
|
|
516
517
|
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
const char* base = ptr;
|
|
520
|
-
uint8_t preamble_longs;
|
|
521
|
-
ptr += copy_from_mem(ptr, preamble_longs);
|
|
522
|
-
uint8_t serial_version;
|
|
523
|
-
ptr += copy_from_mem(ptr, serial_version);
|
|
524
|
-
uint8_t type;
|
|
525
|
-
ptr += copy_from_mem(ptr, type);
|
|
526
|
-
ptr += sizeof(uint16_t); // unused
|
|
527
|
-
uint8_t flags_byte;
|
|
528
|
-
ptr += copy_from_mem(ptr, flags_byte);
|
|
529
|
-
uint16_t seed_hash;
|
|
530
|
-
ptr += copy_from_mem(ptr, seed_hash);
|
|
531
|
-
checker<true>::check_sketch_type(type, SKETCH_TYPE);
|
|
532
|
-
checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
|
|
533
|
-
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
534
|
-
if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
535
|
-
|
|
536
|
-
uint64_t theta = theta_constants::MAX_THETA;
|
|
537
|
-
uint32_t num_entries = 0;
|
|
538
|
-
if (!is_empty) {
|
|
539
|
-
if (preamble_longs == 1) {
|
|
540
|
-
num_entries = 1;
|
|
541
|
-
} else {
|
|
542
|
-
ensure_minimum_memory(size, 8); // read the first prelong before this method
|
|
543
|
-
ptr += copy_from_mem(ptr, num_entries);
|
|
544
|
-
ptr += sizeof(uint32_t); // unused
|
|
545
|
-
if (preamble_longs > 2) {
|
|
546
|
-
ensure_minimum_memory(size, (preamble_longs - 1) << 3);
|
|
547
|
-
ptr += copy_from_mem(ptr, theta);
|
|
548
|
-
}
|
|
549
|
-
}
|
|
550
|
-
}
|
|
551
|
-
const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
|
|
552
|
-
check_memory_size(ptr - base + entries_size_bytes, size);
|
|
553
|
-
std::vector<uint64_t, A> entries(num_entries, 0, allocator);
|
|
554
|
-
if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
|
|
555
|
-
|
|
556
|
-
const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
|
|
557
|
-
return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
|
|
518
|
+
auto data = compact_theta_sketch_parser<true>::parse(bytes, size, seed, false);
|
|
519
|
+
return compact_theta_sketch_alloc(data.is_empty, data.is_ordered, data.seed_hash, data.theta, std::vector<uint64_t, A>(data.entries, data.entries + data.num_entries, allocator));
|
|
558
520
|
}
|
|
559
521
|
|
|
560
522
|
// wrapped compact sketch
|