datasketches 0.2.1 → 0.2.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +7 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +11 -7
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +8 -8
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +12 -15
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +24 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +20 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +2 -2
- data/vendor/datasketches-cpp/common/include/serde.hpp +7 -7
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +19 -19
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +91 -89
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +14 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +121 -87
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +10 -10
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +4 -4
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +14 -14
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +10 -10
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +25 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +65 -80
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +10 -10
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +2 -2
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +60 -63
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +19 -19
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +74 -76
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +110 -113
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +2 -4
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +1 -1
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +80 -76
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +26 -26
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +33 -33
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +6 -6
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +205 -209
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +36 -36
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +28 -28
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +22 -22
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +13 -13
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +15 -15
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +61 -61
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +120 -127
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +9 -9
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +5 -5
- data/vendor/datasketches-cpp/hll/include/hll.hpp +21 -21
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +34 -34
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +25 -25
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +2 -2
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +35 -35
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +15 -15
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +10 -14
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +3 -3
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +4 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +5 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +14 -6
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +39 -24
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +34 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +72 -62
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kolmogorov_smirnov_impl.hpp +78 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +68 -45
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +4 -4
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +6 -6
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/tests/hll_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +3 -3
- data/vendor/datasketches-cpp/req/include/req_common.hpp +2 -1
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +4 -4
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +26 -39
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +9 -9
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +52 -52
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +47 -56
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +34 -42
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +13 -13
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser.hpp +67 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +70 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +18 -14
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +42 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +107 -58
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +4 -4
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +2 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -28
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +23 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +21 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +58 -2
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +37 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +22 -2
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +47 -60
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +51 -64
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +17 -17
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +12 -12
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +5 -5
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +20 -20
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +12 -12
- metadata +8 -3
|
@@ -24,19 +24,27 @@
|
|
|
24
24
|
|
|
25
25
|
namespace datasketches {
|
|
26
26
|
|
|
27
|
+
// forward declaration
|
|
28
|
+
template<typename T, typename C, typename S, typename A> class kll_sketch;
|
|
29
|
+
|
|
27
30
|
template <typename T, typename C, typename A>
|
|
28
31
|
class kll_quantile_calculator {
|
|
29
32
|
public:
|
|
30
|
-
|
|
31
|
-
|
|
33
|
+
using Entry = std::pair<T, uint64_t>;
|
|
34
|
+
using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
|
|
35
|
+
using Container = std::vector<Entry, AllocEntry>;
|
|
36
|
+
using const_iterator = typename Container::const_iterator;
|
|
37
|
+
|
|
38
|
+
template<typename S>
|
|
39
|
+
kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch);
|
|
40
|
+
|
|
32
41
|
T get_quantile(double fraction) const;
|
|
42
|
+
const_iterator begin() const;
|
|
43
|
+
const_iterator end() const;
|
|
33
44
|
|
|
34
45
|
private:
|
|
35
46
|
using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
|
|
36
47
|
using vector_u32 = std::vector<uint32_t, AllocU32>;
|
|
37
|
-
using Entry = std::pair<T, uint64_t>;
|
|
38
|
-
using AllocEntry = typename std::allocator_traits<A>::template rebind_alloc<Entry>;
|
|
39
|
-
using Container = std::vector<Entry, AllocEntry>;
|
|
40
48
|
uint64_t n_;
|
|
41
49
|
vector_u32 levels_;
|
|
42
50
|
Container entries_;
|
|
@@ -45,7 +53,7 @@ class kll_quantile_calculator {
|
|
|
45
53
|
T approximately_answer_positional_query(uint64_t pos) const;
|
|
46
54
|
void convert_to_preceding_cummulative();
|
|
47
55
|
uint32_t chunk_containing_pos(uint64_t pos) const;
|
|
48
|
-
uint32_t search_for_chunk_containing_pos(uint64_t pos,
|
|
56
|
+
uint32_t search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const;
|
|
49
57
|
static void merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items);
|
|
50
58
|
static void merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
|
|
51
59
|
static void merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels, uint8_t starting_level, uint8_t num_levels);
|
|
@@ -28,24 +28,38 @@
|
|
|
28
28
|
|
|
29
29
|
namespace datasketches {
|
|
30
30
|
|
|
31
|
-
template
|
|
32
|
-
|
|
33
|
-
|
|
31
|
+
template<typename T, typename C, typename A>
|
|
32
|
+
template<typename S>
|
|
33
|
+
kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const kll_sketch<T, C, S, A>& sketch):
|
|
34
|
+
n_(sketch.n_), levels_(sketch.num_levels_ + 1, 0, sketch.allocator_), entries_(sketch.allocator_)
|
|
34
35
|
{
|
|
35
|
-
const uint32_t num_items =
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
36
|
+
const uint32_t num_items = sketch.levels_[sketch.num_levels_] - sketch.levels_[0];
|
|
37
|
+
if (num_items > 0) {
|
|
38
|
+
entries_.reserve(num_items);
|
|
39
|
+
populate_from_sketch(sketch.items_, sketch.levels_.data(), sketch.num_levels_);
|
|
40
|
+
if (!sketch.is_level_zero_sorted_) std::sort(entries_.begin(), entries_.begin() + levels_[1], compare_pair_by_first<C>());
|
|
41
|
+
merge_sorted_blocks(entries_, levels_.data(), static_cast<uint8_t>(levels_.size()) - 1, num_items);
|
|
42
|
+
if (!is_sorted(entries_.begin(), entries_.end(), compare_pair_by_first<C>())) throw std::logic_error("entries must be sorted");
|
|
43
|
+
convert_to_preceding_cummulative();
|
|
44
|
+
}
|
|
41
45
|
}
|
|
42
46
|
|
|
43
|
-
template
|
|
47
|
+
template<typename T, typename C, typename A>
|
|
44
48
|
T kll_quantile_calculator<T, C, A>::get_quantile(double fraction) const {
|
|
45
49
|
return approximately_answer_positional_query(pos_of_phi(fraction, n_));
|
|
46
50
|
}
|
|
47
51
|
|
|
48
|
-
template
|
|
52
|
+
template<typename T, typename C, typename A>
|
|
53
|
+
auto kll_quantile_calculator<T, C, A>::begin() const -> const_iterator {
|
|
54
|
+
return entries_.begin();
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
template<typename T, typename C, typename A>
|
|
58
|
+
auto kll_quantile_calculator<T, C, A>::end() const -> const_iterator {
|
|
59
|
+
return entries_.end();
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
template<typename T, typename C, typename A>
|
|
49
63
|
void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, const uint32_t* levels, uint8_t num_levels) {
|
|
50
64
|
size_t src_level = 0;
|
|
51
65
|
size_t dst_level = 0;
|
|
@@ -68,7 +82,7 @@ void kll_quantile_calculator<T, C, A>::populate_from_sketch(const T* items, cons
|
|
|
68
82
|
if (levels_.size() > static_cast<size_t>(dst_level + 1)) levels_.resize(dst_level + 1);
|
|
69
83
|
}
|
|
70
84
|
|
|
71
|
-
template
|
|
85
|
+
template<typename T, typename C, typename A>
|
|
72
86
|
T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64_t pos) const {
|
|
73
87
|
if (pos >= n_) throw std::logic_error("position out of range");
|
|
74
88
|
const uint32_t num_items = levels_[levels_.size() - 1];
|
|
@@ -77,7 +91,7 @@ T kll_quantile_calculator<T, C, A>::approximately_answer_positional_query(uint64
|
|
|
77
91
|
return entries_[index].first;
|
|
78
92
|
}
|
|
79
93
|
|
|
80
|
-
template
|
|
94
|
+
template<typename T, typename C, typename A>
|
|
81
95
|
void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
|
|
82
96
|
uint64_t subtotal = 0;
|
|
83
97
|
for (auto& entry: entries_) {
|
|
@@ -87,13 +101,13 @@ void kll_quantile_calculator<T, C, A>::convert_to_preceding_cummulative() {
|
|
|
87
101
|
}
|
|
88
102
|
}
|
|
89
103
|
|
|
90
|
-
template
|
|
104
|
+
template<typename T, typename C, typename A>
|
|
91
105
|
uint64_t kll_quantile_calculator<T, C, A>::pos_of_phi(double phi, uint64_t n) {
|
|
92
|
-
const uint64_t pos = std::floor(phi * n);
|
|
106
|
+
const uint64_t pos = static_cast<uint64_t>(std::floor(phi * n));
|
|
93
107
|
return (pos == n) ? n - 1 : pos;
|
|
94
108
|
}
|
|
95
109
|
|
|
96
|
-
template
|
|
110
|
+
template<typename T, typename C, typename A>
|
|
97
111
|
uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) const {
|
|
98
112
|
if (entries_.size() < 1) throw std::logic_error("array too short");
|
|
99
113
|
if (pos < entries_[0].second) throw std::logic_error("position too small");
|
|
@@ -101,19 +115,19 @@ uint32_t kll_quantile_calculator<T, C, A>::chunk_containing_pos(uint64_t pos) co
|
|
|
101
115
|
return search_for_chunk_containing_pos(pos, 0, entries_.size());
|
|
102
116
|
}
|
|
103
117
|
|
|
104
|
-
template
|
|
105
|
-
uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos,
|
|
118
|
+
template<typename T, typename C, typename A>
|
|
119
|
+
uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint64_t pos, uint64_t l, uint64_t r) const {
|
|
106
120
|
if (l + 1 == r) {
|
|
107
|
-
return l;
|
|
121
|
+
return static_cast<uint32_t>(l);
|
|
108
122
|
}
|
|
109
|
-
const
|
|
123
|
+
const uint64_t m = l + (r - l) / 2;
|
|
110
124
|
if (entries_[m].second <= pos) {
|
|
111
125
|
return search_for_chunk_containing_pos(pos, m, r);
|
|
112
126
|
}
|
|
113
127
|
return search_for_chunk_containing_pos(pos, l, m);
|
|
114
128
|
}
|
|
115
129
|
|
|
116
|
-
template
|
|
130
|
+
template<typename T, typename C, typename A>
|
|
117
131
|
void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items) {
|
|
118
132
|
if (num_levels == 1) return;
|
|
119
133
|
Container temporary(entries.get_allocator());
|
|
@@ -121,7 +135,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, c
|
|
|
121
135
|
merge_sorted_blocks_direct(entries, temporary, levels, 0, num_levels);
|
|
122
136
|
}
|
|
123
137
|
|
|
124
|
-
template
|
|
138
|
+
template<typename T, typename C, typename A>
|
|
125
139
|
void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& orig, Container& temp, const uint32_t* levels,
|
|
126
140
|
uint8_t starting_level, uint8_t num_levels) {
|
|
127
141
|
if (num_levels == 1) return;
|
|
@@ -129,10 +143,11 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
|
|
|
129
143
|
const uint8_t num_levels_2 = num_levels - num_levels_1;
|
|
130
144
|
const uint8_t starting_level_1 = starting_level;
|
|
131
145
|
const uint8_t starting_level_2 = starting_level + num_levels_1;
|
|
132
|
-
const auto
|
|
146
|
+
const auto initial_size = temp.size();
|
|
133
147
|
merge_sorted_blocks_reversed(orig, temp, levels, starting_level_1, num_levels_1);
|
|
134
148
|
merge_sorted_blocks_reversed(orig, temp, levels, starting_level_2, num_levels_2);
|
|
135
149
|
const uint32_t num_items_1 = levels[starting_level_1 + num_levels_1] - levels[starting_level_1];
|
|
150
|
+
const auto chunk_begin = temp.begin() + initial_size;
|
|
136
151
|
std::merge(
|
|
137
152
|
std::make_move_iterator(chunk_begin), std::make_move_iterator(chunk_begin + num_items_1),
|
|
138
153
|
std::make_move_iterator(chunk_begin + num_items_1), std::make_move_iterator(temp.end()),
|
|
@@ -141,7 +156,7 @@ void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_direct(Container& ori
|
|
|
141
156
|
temp.erase(chunk_begin, temp.end());
|
|
142
157
|
}
|
|
143
158
|
|
|
144
|
-
template
|
|
159
|
+
template<typename T, typename C, typename A>
|
|
145
160
|
void kll_quantile_calculator<T, C, A>::merge_sorted_blocks_reversed(Container& orig, Container& temp, const uint32_t* levels,
|
|
146
161
|
uint8_t starting_level, uint8_t num_levels) {
|
|
147
162
|
if (num_levels == 1) {
|
|
@@ -156,6 +156,9 @@ template<typename A> using vector_d = std::vector<double, AllocD<A>>;
|
|
|
156
156
|
template <typename T, typename C = std::less<T>, typename S = serde<T>, typename A = std::allocator<T>>
|
|
157
157
|
class kll_sketch {
|
|
158
158
|
public:
|
|
159
|
+
using value_type = T;
|
|
160
|
+
using comparator = C;
|
|
161
|
+
|
|
159
162
|
static const uint8_t DEFAULT_M = 8;
|
|
160
163
|
static const uint16_t DEFAULT_K = 200;
|
|
161
164
|
static const uint16_t MIN_K = DEFAULT_M;
|
|
@@ -296,7 +299,7 @@ class kll_sketch {
|
|
|
296
299
|
*
|
|
297
300
|
* @return array of approximations to the given number of evenly-spaced fractional ranks.
|
|
298
301
|
*/
|
|
299
|
-
std::vector<T, A> get_quantiles(
|
|
302
|
+
std::vector<T, A> get_quantiles(uint32_t num) const;
|
|
300
303
|
|
|
301
304
|
/**
|
|
302
305
|
* Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
|
|
@@ -383,6 +386,33 @@ class kll_sketch {
|
|
|
383
386
|
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
384
387
|
size_t get_serialized_size_bytes() const;
|
|
385
388
|
|
|
389
|
+
/**
|
|
390
|
+
* Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
|
|
391
|
+
* length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
|
|
392
|
+
* This method can be used if allocation of storage is necessary beforehand, but it is not
|
|
393
|
+
* optimal.
|
|
394
|
+
* This method is for arithmetic types (integral and floating point)
|
|
395
|
+
* @param k parameter that controls size of the sketch and accuracy of estimates
|
|
396
|
+
* @param n stream length
|
|
397
|
+
* @return upper bound on the serialized size
|
|
398
|
+
*/
|
|
399
|
+
template<typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
400
|
+
static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n);
|
|
401
|
+
|
|
402
|
+
/**
|
|
403
|
+
* Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
|
|
404
|
+
* length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
|
|
405
|
+
* This method can be used if allocation of storage is necessary beforehand, but it is not
|
|
406
|
+
* optimal.
|
|
407
|
+
* This method is for all other non-arithmetic types, and it takes a max size of an item as input.
|
|
408
|
+
* @param k parameter that controls size of the sketch and accuracy of estimates
|
|
409
|
+
* @param n stream length
|
|
410
|
+
* @param max_item_size_bytes maximum size of an item in bytes
|
|
411
|
+
* @return upper bound on the serialized size
|
|
412
|
+
*/
|
|
413
|
+
template<typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
414
|
+
static size_t get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes);
|
|
415
|
+
|
|
386
416
|
/**
|
|
387
417
|
* This method serializes the sketch into a given stream in a binary form
|
|
388
418
|
* @param os output stream
|
|
@@ -391,7 +421,7 @@ class kll_sketch {
|
|
|
391
421
|
|
|
392
422
|
// This is a convenience alias for users
|
|
393
423
|
// The type returned by the following serialize method
|
|
394
|
-
|
|
424
|
+
using vector_bytes = vector_u8<A>;
|
|
395
425
|
|
|
396
426
|
/**
|
|
397
427
|
* This method serializes the sketch as a vector of bytes.
|
|
@@ -480,6 +510,8 @@ class kll_sketch {
|
|
|
480
510
|
T* max_value_;
|
|
481
511
|
bool is_level_zero_sorted_;
|
|
482
512
|
|
|
513
|
+
friend class kll_quantile_calculator<T, C, A>;
|
|
514
|
+
|
|
483
515
|
// for deserialization
|
|
484
516
|
class item_deleter;
|
|
485
517
|
class items_deleter;
|
|
@@ -303,7 +303,7 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions,
|
|
|
303
303
|
}
|
|
304
304
|
|
|
305
305
|
template<typename T, typename C, typename S, typename A>
|
|
306
|
-
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(
|
|
306
|
+
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(uint32_t num) const {
|
|
307
307
|
if (is_empty()) return std::vector<T, A>(allocator_);
|
|
308
308
|
if (num == 0) {
|
|
309
309
|
throw std::invalid_argument("num must be > 0");
|
|
@@ -380,36 +380,56 @@ size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
|
|
380
380
|
size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
|
|
381
381
|
size += S().size_of_item(*min_value_);
|
|
382
382
|
size += S().size_of_item(*max_value_);
|
|
383
|
-
for (auto
|
|
383
|
+
for (auto it: *this) size += S().size_of_item(it.first);
|
|
384
384
|
return size;
|
|
385
385
|
}
|
|
386
386
|
|
|
387
|
+
// implementation for fixed-size arithmetic types (integral and floating point)
|
|
388
|
+
template<typename T, typename C, typename S, typename A>
|
|
389
|
+
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
|
390
|
+
size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n) {
|
|
391
|
+
const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
|
|
392
|
+
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
|
|
393
|
+
// the last integer in the levels_ array is not serialized because it can be derived
|
|
394
|
+
return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * sizeof(TT);
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// implementation for all other types
|
|
398
|
+
template<typename T, typename C, typename S, typename A>
|
|
399
|
+
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
|
400
|
+
size_t kll_sketch<T, C, S, A>::get_max_serialized_size_bytes(uint16_t k, uint64_t n, size_t max_item_size_bytes) {
|
|
401
|
+
const uint8_t num_levels = kll_helper::ub_on_num_levels(n);
|
|
402
|
+
const uint32_t max_num_retained = kll_helper::compute_total_capacity(k, DEFAULT_M, num_levels);
|
|
403
|
+
// the last integer in the levels_ array is not serialized because it can be derived
|
|
404
|
+
return DATA_START + num_levels * sizeof(uint32_t) + (max_num_retained + 2) * max_item_size_bytes;
|
|
405
|
+
}
|
|
406
|
+
|
|
387
407
|
template<typename T, typename C, typename S, typename A>
|
|
388
408
|
void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
|
|
389
409
|
const bool is_single_item = n_ == 1;
|
|
390
410
|
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
|
|
391
|
-
|
|
411
|
+
write(os, preamble_ints);
|
|
392
412
|
const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
|
|
393
|
-
|
|
413
|
+
write(os, serial_version);
|
|
394
414
|
const uint8_t family(FAMILY);
|
|
395
|
-
|
|
415
|
+
write(os, family);
|
|
396
416
|
const uint8_t flags_byte(
|
|
397
417
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
398
418
|
| (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
|
|
399
419
|
| (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
|
|
400
420
|
);
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
421
|
+
write(os, flags_byte);
|
|
422
|
+
write(os, k_);
|
|
423
|
+
write(os, m_);
|
|
404
424
|
const uint8_t unused = 0;
|
|
405
|
-
|
|
425
|
+
write(os, unused);
|
|
406
426
|
if (is_empty()) return;
|
|
407
427
|
if (!is_single_item) {
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
428
|
+
write(os, n_);
|
|
429
|
+
write(os, min_k_);
|
|
430
|
+
write(os, num_levels_);
|
|
431
|
+
write(os, unused);
|
|
432
|
+
write(os, levels_.data(), sizeof(levels_[0]) * num_levels_);
|
|
413
433
|
S().serialize(os, min_value_, 1);
|
|
414
434
|
S().serialize(os, max_value_, 1);
|
|
415
435
|
}
|
|
@@ -424,27 +444,26 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
|
|
|
424
444
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
425
445
|
const uint8_t* end_ptr = ptr + size;
|
|
426
446
|
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
|
|
427
|
-
ptr += copy_to_mem(
|
|
447
|
+
ptr += copy_to_mem(preamble_ints, ptr);
|
|
428
448
|
const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
|
|
429
|
-
ptr += copy_to_mem(
|
|
449
|
+
ptr += copy_to_mem(serial_version, ptr);
|
|
430
450
|
const uint8_t family(FAMILY);
|
|
431
|
-
ptr += copy_to_mem(
|
|
451
|
+
ptr += copy_to_mem(family, ptr);
|
|
432
452
|
const uint8_t flags_byte(
|
|
433
453
|
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
434
454
|
| (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
|
|
435
455
|
| (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
|
|
436
456
|
);
|
|
437
|
-
ptr += copy_to_mem(
|
|
438
|
-
ptr += copy_to_mem(
|
|
439
|
-
ptr += copy_to_mem(
|
|
440
|
-
|
|
441
|
-
ptr += copy_to_mem(&unused, ptr, sizeof(unused));
|
|
457
|
+
ptr += copy_to_mem(flags_byte, ptr);
|
|
458
|
+
ptr += copy_to_mem(k_, ptr);
|
|
459
|
+
ptr += copy_to_mem(m_, ptr);
|
|
460
|
+
ptr += sizeof(uint8_t); // unused
|
|
442
461
|
if (!is_empty()) {
|
|
443
462
|
if (!is_single_item) {
|
|
444
|
-
ptr += copy_to_mem(
|
|
445
|
-
ptr += copy_to_mem(
|
|
446
|
-
ptr += copy_to_mem(
|
|
447
|
-
ptr +=
|
|
463
|
+
ptr += copy_to_mem(n_, ptr);
|
|
464
|
+
ptr += copy_to_mem(min_k_, ptr);
|
|
465
|
+
ptr += copy_to_mem(num_levels_, ptr);
|
|
466
|
+
ptr += sizeof(uint8_t); // unused
|
|
448
467
|
ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
|
|
449
468
|
ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
|
|
450
469
|
ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
|
|
@@ -459,20 +478,13 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
|
|
|
459
478
|
|
|
460
479
|
template<typename T, typename C, typename S, typename A>
|
|
461
480
|
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
uint8_t
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
uint8_t
|
|
469
|
-
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
470
|
-
uint16_t k;
|
|
471
|
-
is.read((char*)&k, sizeof(k));
|
|
472
|
-
uint8_t m;
|
|
473
|
-
is.read((char*)&m, sizeof(m));
|
|
474
|
-
uint8_t unused;
|
|
475
|
-
is.read((char*)&unused, sizeof(unused));
|
|
481
|
+
const auto preamble_ints = read<uint8_t>(is);
|
|
482
|
+
const auto serial_version = read<uint8_t>(is);
|
|
483
|
+
const auto family_id = read<uint8_t>(is);
|
|
484
|
+
const auto flags_byte = read<uint8_t>(is);
|
|
485
|
+
const auto k = read<uint16_t>(is);
|
|
486
|
+
const auto m = read<uint8_t>(is);
|
|
487
|
+
read<uint8_t>(is); // skip unused byte
|
|
476
488
|
|
|
477
489
|
check_m(m);
|
|
478
490
|
check_preamble_ints(preamble_ints, flags_byte);
|
|
@@ -492,10 +504,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
|
|
|
492
504
|
min_k = k;
|
|
493
505
|
num_levels = 1;
|
|
494
506
|
} else {
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
507
|
+
n = read<uint64_t>(is);
|
|
508
|
+
min_k = read<uint16_t>(is);
|
|
509
|
+
num_levels = read<uint8_t>(is);
|
|
510
|
+
read<uint8_t>(is); // skip unused byte
|
|
499
511
|
}
|
|
500
512
|
vector_u32<A> levels(num_levels + 1, 0, allocator);
|
|
501
513
|
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
|
|
@@ -503,7 +515,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, con
|
|
|
503
515
|
levels[0] = capacity - 1;
|
|
504
516
|
} else {
|
|
505
517
|
// the last integer in levels_ is not serialized because it can be derived
|
|
506
|
-
|
|
518
|
+
read(is, levels.data(), sizeof(levels[0]) * num_levels);
|
|
507
519
|
}
|
|
508
520
|
levels[num_levels] = capacity;
|
|
509
521
|
A alloc(allocator);
|
|
@@ -546,24 +558,24 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
|
|
|
546
558
|
ensure_minimum_memory(size, 8);
|
|
547
559
|
const char* ptr = static_cast<const char*>(bytes);
|
|
548
560
|
uint8_t preamble_ints;
|
|
549
|
-
ptr += copy_from_mem(ptr,
|
|
561
|
+
ptr += copy_from_mem(ptr, preamble_ints);
|
|
550
562
|
uint8_t serial_version;
|
|
551
|
-
ptr += copy_from_mem(ptr,
|
|
563
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
552
564
|
uint8_t family_id;
|
|
553
|
-
ptr += copy_from_mem(ptr,
|
|
565
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
554
566
|
uint8_t flags_byte;
|
|
555
|
-
ptr += copy_from_mem(ptr,
|
|
567
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
|
556
568
|
uint16_t k;
|
|
557
|
-
ptr += copy_from_mem(ptr,
|
|
569
|
+
ptr += copy_from_mem(ptr, k);
|
|
558
570
|
uint8_t m;
|
|
559
|
-
ptr += copy_from_mem(ptr,
|
|
560
|
-
ptr
|
|
571
|
+
ptr += copy_from_mem(ptr, m);
|
|
572
|
+
ptr += sizeof(uint8_t); // skip unused byte
|
|
561
573
|
|
|
562
574
|
check_m(m);
|
|
563
575
|
check_preamble_ints(preamble_ints, flags_byte);
|
|
564
576
|
check_serial_version(serial_version);
|
|
565
577
|
check_family_id(family_id);
|
|
566
|
-
ensure_minimum_memory(size,
|
|
578
|
+
ensure_minimum_memory(size, 1ULL << preamble_ints);
|
|
567
579
|
|
|
568
580
|
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
|
|
569
581
|
if (is_empty) return kll_sketch<T, C, S, A>(k, allocator);
|
|
@@ -578,10 +590,10 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
|
|
|
578
590
|
min_k = k;
|
|
579
591
|
num_levels = 1;
|
|
580
592
|
} else {
|
|
581
|
-
ptr += copy_from_mem(ptr,
|
|
582
|
-
ptr += copy_from_mem(ptr,
|
|
583
|
-
ptr += copy_from_mem(ptr,
|
|
584
|
-
ptr
|
|
593
|
+
ptr += copy_from_mem(ptr, n);
|
|
594
|
+
ptr += copy_from_mem(ptr, min_k);
|
|
595
|
+
ptr += copy_from_mem(ptr, num_levels);
|
|
596
|
+
ptr += sizeof(uint8_t); // skip unused byte
|
|
585
597
|
}
|
|
586
598
|
vector_u32<A> levels(num_levels + 1, 0, allocator);
|
|
587
599
|
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
|
|
@@ -779,7 +791,7 @@ std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantil
|
|
|
779
791
|
using AllocCalc = typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>>;
|
|
780
792
|
AllocCalc alloc(allocator_);
|
|
781
793
|
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
|
|
782
|
-
new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(
|
|
794
|
+
new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(*this),
|
|
783
795
|
[&alloc](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); alloc.deallocate(ptr, 1); }
|
|
784
796
|
);
|
|
785
797
|
return quantile_calculator;
|
|
@@ -1067,14 +1079,14 @@ typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::begin()
|
|
|
1067
1079
|
|
|
1068
1080
|
template <typename T, typename C, typename S, typename A>
|
|
1069
1081
|
typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::end() const {
|
|
1070
|
-
return kll_sketch<T, C, S, A>::const_iterator(nullptr,
|
|
1082
|
+
return kll_sketch<T, C, S, A>::const_iterator(nullptr, levels_.data(), num_levels_);
|
|
1071
1083
|
}
|
|
1072
1084
|
|
|
1073
1085
|
// kll_sketch::const_iterator implementation
|
|
1074
1086
|
|
|
1075
1087
|
template<typename T, typename C, typename S, typename A>
|
|
1076
1088
|
kll_sketch<T, C, S, A>::const_iterator::const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels):
|
|
1077
|
-
items(items), levels(levels), num_levels(num_levels), index(
|
|
1089
|
+
items(items), levels(levels), num_levels(num_levels), index(items == nullptr ? levels[num_levels] : levels[0]), level(items == nullptr ? num_levels : 0), weight(1)
|
|
1078
1090
|
{}
|
|
1079
1091
|
|
|
1080
1092
|
template<typename T, typename C, typename S, typename A>
|
|
@@ -1098,8 +1110,6 @@ typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_i
|
|
|
1098
1110
|
|
|
1099
1111
|
template<typename T, typename C, typename S, typename A>
|
|
1100
1112
|
bool kll_sketch<T, C, S, A>::const_iterator::operator==(const const_iterator& other) const {
|
|
1101
|
-
if (level != other.level) return false;
|
|
1102
|
-
if (level == num_levels) return true; // end
|
|
1103
1113
|
return index == other.index;
|
|
1104
1114
|
}
|
|
1105
1115
|
|