datasketches 0.2.7 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -5
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -20,14 +20,12 @@
|
|
20
20
|
#ifndef KLL_SKETCH_HPP_
|
21
21
|
#define KLL_SKETCH_HPP_
|
22
22
|
|
23
|
-
#include <functional>
|
24
23
|
#include <memory>
|
25
24
|
#include <vector>
|
26
|
-
#include <cmath>
|
27
25
|
|
28
|
-
#include "quantile_sketch_sorted_view.hpp"
|
29
26
|
#include "common_defs.hpp"
|
30
27
|
#include "serde.hpp"
|
28
|
+
#include "quantiles_sorted_view.hpp"
|
31
29
|
|
32
30
|
namespace datasketches {
|
33
31
|
|
@@ -37,9 +35,9 @@ namespace datasketches {
|
|
37
35
|
* See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
|
38
36
|
*
|
39
37
|
* <p>This is a stochastic streaming sketch that enables near real-time analysis of the
|
40
|
-
* approximate distribution of
|
41
|
-
* that the
|
42
|
-
* The analysis is obtained using <i>get_quantile()</i>
|
38
|
+
* approximate distribution of items from a very large stream in a single pass, requiring only
|
39
|
+
* that the items are comparable.
|
40
|
+
* The analysis is obtained using <i>get_quantile()</i> function or the
|
43
41
|
* inverse functions get_rank(), get_PMF() (Probability Mass Function), and get_CDF()
|
44
42
|
* (Cumulative Distribution Function).
|
45
43
|
*
|
@@ -47,14 +45,15 @@ namespace datasketches {
|
|
47
45
|
* with the equivalent Java implementation only when template parameter T = float
|
48
46
|
* (32-bit single precision values).
|
49
47
|
*
|
50
|
-
* <p>Given an input stream of <i>N</i>
|
51
|
-
*
|
52
|
-
* <i>N</i>
|
48
|
+
* <p>Given an input stream of <i>N</i> items, the <i>natural rank</i> of any specific
|
49
|
+
* item is defined as its index <i>(1 to N)</i> in inclusive mode
|
50
|
+
* or <i>(0 to N-1)</i> in exclusive mode
|
51
|
+
* in the hypothetical sorted stream of all <i>N</i> input items.
|
53
52
|
*
|
54
|
-
* <p>The <i>normalized rank</i> (<i>rank</i>) of any specific
|
55
|
-
* <i>
|
56
|
-
* Thus, the <i>normalized rank</i> is
|
57
|
-
* In the documentation for this sketch <i>
|
53
|
+
* <p>The <i>normalized rank</i> (<i>rank</i>) of any specific item is defined as its
|
54
|
+
* <i>natural rank</i> divided by <i>N</i>.
|
55
|
+
* Thus, the <i>normalized rank</i> is between zero and one.
|
56
|
+
* In the documentation for this sketch <i>natural rank</i> is never used so any
|
58
57
|
* reference to just <i>rank</i> should be interpreted to mean <i>normalized rank</i>.
|
59
58
|
*
|
60
59
|
* <p>This sketch is configured with a parameter <i>k</i>, which affects the size of the sketch
|
@@ -63,18 +62,18 @@ namespace datasketches {
|
|
63
62
|
* <p>The estimation error is commonly called <i>epsilon</i> (or <i>eps</i>) and is a fraction
|
64
63
|
* between zero and one. Larger values of <i>k</i> result in smaller values of epsilon.
|
65
64
|
* Epsilon is always with respect to the rank and cannot be applied to the
|
66
|
-
* corresponding
|
65
|
+
* corresponding items.
|
67
66
|
*
|
68
|
-
* <p>The relationship between the normalized rank and the corresponding
|
67
|
+
* <p>The relationship between the normalized rank and the corresponding items can be viewed
|
69
68
|
* as a two dimensional monotonic plot with the normalized rank on one axis and the
|
70
|
-
* corresponding
|
69
|
+
* corresponding items on the other axis. If the y-axis is specified as the item-axis and
|
71
70
|
* the x-axis as the normalized rank, then <i>y = get_quantile(x)</i> is a monotonically
|
72
71
|
* increasing function.
|
73
72
|
*
|
74
|
-
* <p>The
|
75
|
-
* corresponding
|
73
|
+
* <p>The function <i>get_quantile(rank)</i> translates ranks into
|
74
|
+
* corresponding quantiles. The functions <i>get_rank(item),
|
76
75
|
* get_CDF(...) (Cumulative Distribution Function), and get_PMF(...)
|
77
|
-
* (Probability Mass Function)</i> perform the opposite operation and translate
|
76
|
+
* (Probability Mass Function)</i> perform the opposite operation and translate items into ranks.
|
78
77
|
*
|
79
78
|
* <p>The <i>get_PMF(...)</i> function has about 13 to 47% worse rank error (depending
|
80
79
|
* on <i>k</i>) than the other queries because the mass of each "bin" of the PMF has
|
@@ -86,60 +85,60 @@ namespace datasketches {
|
|
86
85
|
*
|
87
86
|
* <p>A <i>get_quantile(rank)</i> query has the following guarantees:
|
88
87
|
* <ul>
|
89
|
-
* <li>Let <i>
|
90
|
-
* <li>The
|
91
|
-
* <li>Let <i>trueRank</i> be the true rank of <i>
|
92
|
-
* stream of all <i>N</i>
|
88
|
+
* <li>Let <i>q = get_quantile(r)</i> where <i>r</i> is the rank between zero and one.</li>
|
89
|
+
* <li>The quantile <i>q</i> will be an item from the input stream.</li>
|
90
|
+
* <li>Let <i>trueRank</i> be the true rank of <i>q</i> derived from the hypothetical sorted
|
91
|
+
* stream of all <i>N</i> items.</li>
|
93
92
|
* <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
|
94
93
|
* <li>Then <i>r - eps ≤ trueRank ≤ r + eps</i> with a confidence of 99%. Note that the
|
95
|
-
* error is on the rank, not the
|
94
|
+
* error is on the rank, not the quantile.</li>
|
96
95
|
* </ul>
|
97
96
|
*
|
98
|
-
* <p>A <i>get_rank(
|
97
|
+
* <p>A <i>get_rank(item)</i> query has the following guarantees:
|
99
98
|
* <ul>
|
100
|
-
* <li>Let <i>r = get_rank(
|
99
|
+
* <li>Let <i>r = get_rank(i)</i> where <i>i</i> is an item between the min and max items of
|
101
100
|
* the input stream.</li>
|
102
|
-
* <li>Let <i>true_rank</i> be the true rank of <i>
|
103
|
-
* stream of all <i>N</i>
|
101
|
+
* <li>Let <i>trueRank</i> be the true rank of <i>i</i> derived from the hypothetical sorted
|
102
|
+
* stream of all <i>N</i> items.</li>
|
104
103
|
* <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
|
105
104
|
* <li>Then <i>r - eps ≤ trueRank ≤ r + eps</i> with a confidence of 99%.</li>
|
106
105
|
* </ul>
|
107
106
|
*
|
108
107
|
* <p>A <i>get_PMF()</i> query has the following guarantees:
|
109
108
|
* <ul>
|
110
|
-
* <li>Let <i>{r1, r2, ..., r(m+1)} = get_PMF(
|
111
|
-
* between the min and max
|
112
|
-
* <li>Let <i>mass<sub>i</sub> = estimated mass between
|
113
|
-
* <li>Let <i>trueMass</i> be the true mass between the
|
114
|
-
*
|
109
|
+
* <li>Let <i>{r1, r2, ..., r(m+1)} = get_PMF(s1, s2, ..., sm)</i> where <i>s1, s2</i> are
|
110
|
+
* split points (items from the input domain) between the min and max items of the input stream.</li>
|
111
|
+
* <li>Let <i>mass<sub>i</sub> = estimated mass between s<sub>i</sub> and s<sub>i+1</sub></i>.</li>
|
112
|
+
* <li>Let <i>trueMass</i> be the true mass between the items of <i>s<sub>i</sub>,
|
113
|
+
* s<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> items.</li>
|
115
114
|
* <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
|
116
115
|
* <li>then <i>mass - eps ≤ trueMass ≤ mass + eps</i> with a confidence of 99%.</li>
|
117
|
-
* <li>r(m+1) includes the mass of all points larger than
|
116
|
+
* <li>r(m+1) includes the mass of all points larger than sm.</li>
|
118
117
|
* </ul>
|
119
118
|
*
|
120
119
|
* <p>A <i>get_CDF(...)</i> query has the following guarantees:
|
121
120
|
* <ul>
|
122
|
-
* <li>Let <i>{r1, r2, ..., r(m+1)} = get_CDF(
|
123
|
-
* between the min and max
|
121
|
+
* <li>Let <i>{r1, r2, ..., r(m+1)} = get_CDF(s1, s2, ..., sm)</i> where <i>s1, s2, ...</i> are
|
122
|
+
* split points (items from the input domain) between the min and max items of the input stream.</li>
|
124
123
|
* <li>Let <i>mass<sub>i</sub> = r<sub>i+1</sub> - r<sub>i</sub></i>.</li>
|
125
|
-
* <li>Let <i>trueMass</i> be the true mass between the true ranks of <i>
|
126
|
-
*
|
124
|
+
* <li>Let <i>trueMass</i> be the true mass between the true ranks of <i>s<sub>i</sub>,
|
125
|
+
* s<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> items.</li>
|
127
126
|
* <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
|
128
127
|
* <li>then <i>mass - eps ≤ trueMass ≤ mass + eps</i> with a confidence of 99%.</li>
|
129
|
-
* <li>1 - r(m+1) includes the mass of all points larger than
|
128
|
+
* <li>1 - r(m+1) includes the mass of all points larger than sm.</li>
|
130
129
|
* </ul>
|
131
130
|
*
|
132
131
|
* <p>From the above, it might seem like we could make some estimates to bound the
|
133
|
-
* <em>
|
134
|
-
* let us derive error bounds or confidences around
|
132
|
+
* <em>item</em> returned from a call to <em>get_quantile()</em>. The sketch, however, does not
|
133
|
+
* let us derive error bounds or confidences around items. Because errors are independent, we
|
135
134
|
* can approximately bracket a value as shown below, but there are no error estimates available.
|
136
135
|
* Additionally, the interval may be quite large for certain distributions.
|
137
136
|
* <ul>
|
138
|
-
* <li>Let <i>
|
137
|
+
* <li>Let <i>q = get_quantile(r)</i>, the estimated quantile of rank <i>r</i>.</li>
|
139
138
|
* <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
|
140
|
-
* <li>Let <i>
|
141
|
-
* <li>Let <i>
|
142
|
-
* <li>Then <i>
|
139
|
+
* <li>Let <i>q<sub>lo</sub></i> = estimated quantile of rank <i>(r - eps)</i>.</li>
|
140
|
+
* <li>Let <i>q<sub>hi</sub></i> = estimated quantile of rank <i>(r + eps)</i>.</li>
|
141
|
+
* <li>Then <i>q<sub>lo</sub> ≤ q ≤ q<sub>hi</sub></i>, with 99% confidence.</li>
|
143
142
|
* </ul>
|
144
143
|
*
|
145
144
|
* author Kevin Lang
|
@@ -147,13 +146,6 @@ namespace datasketches {
|
|
147
146
|
* author Lee Rhodes
|
148
147
|
*/
|
149
148
|
|
150
|
-
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
151
|
-
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
152
|
-
template<typename A> using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
|
153
|
-
template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
|
154
|
-
template<typename A> using AllocD = typename std::allocator_traits<A>::template rebind_alloc<double>;
|
155
|
-
template<typename A> using vector_d = std::vector<double, AllocD<A>>;
|
156
|
-
|
157
149
|
namespace kll_constants {
|
158
150
|
const uint16_t DEFAULT_K = 200;
|
159
151
|
}
|
@@ -161,21 +153,19 @@ namespace kll_constants {
|
|
161
153
|
template <
|
162
154
|
typename T,
|
163
155
|
typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
|
164
|
-
typename S = serde<T>, // deprecated, to be removed in the next major version
|
165
156
|
typename A = std::allocator<T>
|
166
157
|
>
|
167
158
|
class kll_sketch {
|
168
159
|
public:
|
169
160
|
using value_type = T;
|
170
161
|
using comparator = C;
|
162
|
+
using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
|
171
163
|
|
172
164
|
static const uint8_t DEFAULT_M = 8;
|
173
|
-
// TODO: Redundant and deprecated. Will be removed in next major version.
|
174
|
-
static const uint16_t DEFAULT_K = kll_constants::DEFAULT_K;
|
175
165
|
static const uint16_t MIN_K = DEFAULT_M;
|
176
166
|
static const uint16_t MAX_K = (1 << 16) - 1;
|
177
167
|
|
178
|
-
explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const A& allocator = A());
|
168
|
+
explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const C& comparator = C(), const A& allocator = A());
|
179
169
|
kll_sketch(const kll_sketch& other);
|
180
170
|
kll_sketch(kll_sketch&& other) noexcept;
|
181
171
|
~kll_sketch();
|
@@ -185,17 +175,18 @@ class kll_sketch {
|
|
185
175
|
/*
|
186
176
|
* Type converting constructor.
|
187
177
|
* @param other sketch of a different type
|
178
|
+
* @param comparator instance of a Comparator
|
188
179
|
* @param allocator instance of an Allocator
|
189
180
|
*/
|
190
|
-
template<typename TT, typename CC, typename
|
191
|
-
explicit kll_sketch(const kll_sketch<TT, CC,
|
181
|
+
template<typename TT, typename CC, typename AA>
|
182
|
+
explicit kll_sketch(const kll_sketch<TT, CC, AA>& other, const C& comparator = C(), const A& allocator = A());
|
192
183
|
|
193
184
|
/**
|
194
185
|
* Updates this sketch with the given data item.
|
195
|
-
* @param
|
186
|
+
* @param item from a stream of items
|
196
187
|
*/
|
197
188
|
template<typename FwdT>
|
198
|
-
void update(FwdT&&
|
189
|
+
void update(FwdT&& item);
|
199
190
|
|
200
191
|
/**
|
201
192
|
* Merges another sketch into this one.
|
@@ -235,20 +226,18 @@ class kll_sketch {
|
|
235
226
|
bool is_estimation_mode() const;
|
236
227
|
|
237
228
|
/**
|
238
|
-
* Returns the min
|
239
|
-
*
|
240
|
-
*
|
241
|
-
* @return the min value of the stream
|
229
|
+
* Returns the min item of the stream.
|
230
|
+
* If the sketch is empty this throws std::runtime_error.
|
231
|
+
* @return the min item of the stream
|
242
232
|
*/
|
243
|
-
T
|
233
|
+
T get_min_item() const;
|
244
234
|
|
245
235
|
/**
|
246
|
-
* Returns the max
|
247
|
-
*
|
248
|
-
*
|
249
|
-
* @return the max value of the stream
|
236
|
+
* Returns the max item of the stream.
|
237
|
+
* If the sketch is empty this throws std::runtime_error.
|
238
|
+
* @return the max item of the stream
|
250
239
|
*/
|
251
|
-
T
|
240
|
+
T get_max_item() const;
|
252
241
|
|
253
242
|
/**
|
254
243
|
* Returns an instance of the comparator for this sketch.
|
@@ -257,134 +246,128 @@ class kll_sketch {
|
|
257
246
|
C get_comparator() const;
|
258
247
|
|
259
248
|
/**
|
260
|
-
* Returns an
|
261
|
-
*
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
*
|
267
|
-
*
|
268
|
-
*
|
269
|
-
*
|
249
|
+
* Returns an instance of the allocator for this sketch.
|
250
|
+
* @return allocator
|
251
|
+
*/
|
252
|
+
A get_allocator() const;
|
253
|
+
|
254
|
+
/**
|
255
|
+
* Returns an item from the sketch that is the best approximation to an item
|
256
|
+
* from the original stream with the given rank.
|
257
|
+
*
|
258
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
270
259
|
*
|
271
|
-
* @param
|
272
|
-
*
|
273
|
-
* If fraction = 0.0, the true minimum value of the stream is returned.
|
274
|
-
* If fraction = 1.0, the true maximum value of the stream is returned.
|
275
|
-
* If the parameter inclusive=true, the given rank is considered inclusive (includes the weight of an item)
|
260
|
+
* @param rank of an item in the hypothetical sorted stream.
|
261
|
+
* @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
|
276
262
|
*
|
277
|
-
* @return
|
263
|
+
* @return approximate quantile associated with the given rank
|
278
264
|
*/
|
279
|
-
using quantile_return_type = typename
|
280
|
-
|
281
|
-
quantile_return_type get_quantile(double fraction) const;
|
265
|
+
using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type;
|
266
|
+
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
|
282
267
|
|
283
268
|
/**
|
284
|
-
* This is a more efficient multiple-query version of get_quantile().
|
285
|
-
* <p>
|
286
269
|
* This returns an array that could have been generated by using get_quantile() for each
|
287
|
-
*
|
288
|
-
*
|
289
|
-
*
|
290
|
-
* to get_quantile().
|
270
|
+
* rank separately.
|
271
|
+
*
|
272
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
291
273
|
*
|
292
|
-
*
|
274
|
+
* @param ranks given array of ranks in the hypothetical sorted stream.
|
275
|
+
* These ranks must be in the interval [0.0, 1.0].
|
276
|
+
* @param size the number of ranks in the array
|
277
|
+
* @param inclusive if true, the given ranks are considered inclusive (include weights of items)
|
293
278
|
*
|
294
|
-
* @
|
295
|
-
* These are also called normalized ranks or fractional ranks.
|
296
|
-
* These fractions must be in the interval [0.0, 1.0], inclusive.
|
297
|
-
* If the parameter inclusive=true, the given fractions are considered inclusive (include weights of items)
|
279
|
+
* @return array of approximate quantiles corresponding to the given ranks in the same order.
|
298
280
|
*
|
299
|
-
*
|
300
|
-
* in the input array.
|
281
|
+
* Deprecated. Will be removed in the next major version. Use get_quantile() instead.
|
301
282
|
*/
|
302
|
-
|
303
|
-
std::vector<T, A> get_quantiles(const double* fractions, uint32_t size) const;
|
283
|
+
std::vector<T, A> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
|
304
284
|
|
305
285
|
/**
|
306
286
|
* This is a multiple-query version of get_quantile() that allows the caller to
|
307
|
-
* specify the number of evenly-spaced
|
287
|
+
* specify the number of evenly-spaced ranks.
|
308
288
|
*
|
309
|
-
* <p>If the sketch is empty this
|
289
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
310
290
|
*
|
311
|
-
* @param num an integer that specifies the number of evenly-spaced
|
312
|
-
* This must be an integer greater than 0. A value of 1 will return the
|
313
|
-
* A value of 2 will return
|
314
|
-
*
|
291
|
+
* @param num an integer that specifies the number of evenly-spaced ranks.
|
292
|
+
* This must be an integer greater than 0. A value of 1 will return the quantile of rank 0.
|
293
|
+
* A value of 2 will return quantiles of ranks 0 and 1. A value of 3 will return quantiles of ranks 0,
|
294
|
+
* 0.5 (median) and 1, etc.
|
295
|
+
* @param inclusive if true, the ranks are considered inclusive (include weights of items)
|
315
296
|
*
|
316
|
-
* @return array of
|
297
|
+
* @return array of approximate quantiles corresponding to the given number of evenly-spaced ranks.
|
298
|
+
*
|
299
|
+
* Deprecated. Will be removed in the next major version. Use get_quantile() instead.
|
317
300
|
*/
|
318
|
-
|
319
|
-
std::vector<T, A> get_quantiles(uint32_t num) const;
|
301
|
+
std::vector<T, A> get_quantiles(uint32_t num, bool inclusive = true) const;
|
320
302
|
|
321
303
|
/**
|
322
|
-
* Returns an approximation to the normalized
|
323
|
-
* inclusive.
|
324
|
-
* With the template parameter inclusive=true the weight of the given value is included into the rank.
|
325
|
-
* Otherwise the rank equals the sum of the weights of all values that are less than the given value
|
326
|
-
* according to the comparator C.
|
304
|
+
* Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
|
327
305
|
*
|
328
306
|
* <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
|
329
307
|
* get_normalized_rank_error(false) function.
|
330
308
|
*
|
331
|
-
* <p>If the sketch is empty this
|
309
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
310
|
+
*
|
311
|
+
* @param item to be ranked.
|
312
|
+
* @param inclusive if true the weight of the given item is included into the rank.
|
313
|
+
* Otherwise the rank equals the sum of the weights of all items that are less than the given item
|
314
|
+
* according to the comparator C.
|
332
315
|
*
|
333
|
-
* @
|
334
|
-
* @return an approximate rank of the given value
|
316
|
+
* @return an approximate rank of the given item
|
335
317
|
*/
|
336
|
-
|
337
|
-
double get_rank(const T& value) const;
|
318
|
+
double get_rank(const T& item, bool inclusive = true) const;
|
338
319
|
|
339
320
|
/**
|
340
321
|
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
|
341
|
-
* given a set of split points (
|
322
|
+
* given a set of split points (items).
|
342
323
|
*
|
343
324
|
* <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
|
344
325
|
* get_normalized_rank_error(true) function.
|
345
326
|
*
|
346
|
-
* <p>If the sketch is empty this
|
327
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
347
328
|
*
|
348
|
-
* @param split_points an array of <i>m</i> unique, monotonically increasing
|
349
|
-
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
|
350
|
-
*
|
351
|
-
*
|
352
|
-
*
|
353
|
-
*
|
329
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing items
|
330
|
+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
|
331
|
+
*
|
332
|
+
* @param size the number of split points in the array
|
333
|
+
*
|
334
|
+
* @param inclusive if true the rank of an item includes its own weight, and therefore
|
335
|
+
* if the sketch contains items equal to a split point, then in PMF such items are
|
336
|
+
* included into the interval to the left of split point. Otherwise they are included into the interval
|
337
|
+
* to the right of split point.
|
354
338
|
*
|
355
339
|
* @return an array of m+1 doubles each of which is an approximation
|
356
|
-
* to the fraction of the input stream
|
357
|
-
* If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
|
358
|
-
* split point, with the exception that the last interval will include the maximum value.
|
359
|
-
* If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
|
360
|
-
* split point.
|
340
|
+
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
|
361
341
|
*/
|
362
|
-
|
363
|
-
|
342
|
+
using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
|
343
|
+
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
|
364
344
|
|
365
345
|
/**
|
366
346
|
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
|
367
|
-
* cumulative analog of the PMF, of the input stream given a set of split points (
|
347
|
+
* cumulative analog of the PMF, of the input stream given a set of split points (items).
|
368
348
|
*
|
369
349
|
* <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
|
370
350
|
* get_normalized_rank_error(false) function.
|
371
351
|
*
|
372
|
-
* <p>If the sketch is empty this
|
352
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
373
353
|
*
|
374
|
-
* @param split_points an array of <i>m</i> unique, monotonically increasing
|
354
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing items
|
375
355
|
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
|
376
|
-
* The definition of an "interval" is inclusive of the left split point (or minimum value) and
|
377
|
-
* exclusive of the right split point, with the exception that the last interval will include
|
378
|
-
* the maximum value.
|
379
|
-
* It is not necessary to include either the min or max values in these split points.
|
380
356
|
*
|
381
|
-
* @
|
357
|
+
* @param size the number of split points in the array
|
358
|
+
*
|
359
|
+
* @param inclusive if true the rank of an item includes its own weight, and therefore
|
360
|
+
* if the sketch contains items equal to a split point, then in CDF such items are
|
361
|
+
* included into the interval to the left of split point. Otherwise they are included into
|
362
|
+
* the interval to the right of split point.
|
363
|
+
*
|
364
|
+
* @return an array of m+1 doubles, which are a consecutive approximation to the CDF
|
382
365
|
* of the input stream given the split_points. The value at array position j of the returned
|
383
366
|
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
|
384
|
-
* array.
|
367
|
+
* array. This can be viewed as array of ranks of the given split points plus one more value
|
368
|
+
* that is always 1.
|
385
369
|
*/
|
386
|
-
|
387
|
-
vector_d<A> get_CDF(const T* split_points, uint32_t size) const;
|
370
|
+
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
|
388
371
|
|
389
372
|
/**
|
390
373
|
* Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
|
@@ -398,19 +381,19 @@ class kll_sketch {
|
|
398
381
|
/**
|
399
382
|
* Computes size needed to serialize the current state of the sketch.
|
400
383
|
* This version is for fixed-size arithmetic types (integral and floating point).
|
401
|
-
* @param
|
384
|
+
* @param sd instance of a SerDe
|
402
385
|
* @return size in bytes needed to serialize this sketch
|
403
386
|
*/
|
404
|
-
template<typename TT = T, typename SerDe =
|
387
|
+
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
405
388
|
size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
|
406
389
|
|
407
390
|
/**
|
408
391
|
* Computes size needed to serialize the current state of the sketch.
|
409
392
|
* This version is for all other types and can be expensive since every item needs to be looked at.
|
410
|
-
* @param
|
393
|
+
* @param sd instance of a SerDe
|
411
394
|
* @return size in bytes needed to serialize this sketch
|
412
395
|
*/
|
413
|
-
template<typename TT = T, typename SerDe =
|
396
|
+
template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
414
397
|
size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
|
415
398
|
|
416
399
|
/**
|
@@ -443,14 +426,14 @@ class kll_sketch {
|
|
443
426
|
/**
|
444
427
|
* This method serializes the sketch into a given stream in a binary form
|
445
428
|
* @param os output stream
|
446
|
-
* @param instance of a SerDe
|
429
|
+
* @param sd instance of a SerDe
|
447
430
|
*/
|
448
|
-
template<typename SerDe =
|
431
|
+
template<typename SerDe = serde<T>>
|
449
432
|
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
450
433
|
|
451
434
|
// This is a convenience alias for users
|
452
435
|
// The type returned by the following serialize method
|
453
|
-
using vector_bytes =
|
436
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
454
437
|
|
455
438
|
/**
|
456
439
|
* This method serializes the sketch as a vector of bytes.
|
@@ -458,53 +441,36 @@ class kll_sketch {
|
|
458
441
|
* It is a blank space of a given size.
|
459
442
|
* This header is used in Datasketches PostgreSQL extension.
|
460
443
|
* @param header_size_bytes space to reserve in front of the sketch
|
461
|
-
* @param instance of a SerDe
|
444
|
+
* @param sd instance of a SerDe
|
462
445
|
* @return serialized sketch as a vector of bytes
|
463
446
|
*/
|
464
|
-
template<typename SerDe =
|
447
|
+
template<typename SerDe = serde<T>>
|
465
448
|
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
466
449
|
|
467
450
|
/**
|
468
451
|
* This method deserializes a sketch from a given stream.
|
469
452
|
* @param is input stream
|
470
|
-
* @param
|
471
|
-
* @
|
472
|
-
*
|
473
|
-
* Deprecated, to be removed in the next major version
|
474
|
-
*/
|
475
|
-
static kll_sketch deserialize(std::istream& is, const A& allocator = A());
|
476
|
-
|
477
|
-
/**
|
478
|
-
* This method deserializes a sketch from a given stream.
|
479
|
-
* @param is input stream
|
480
|
-
* @param serde instance of a SerDe
|
453
|
+
* @param sd instance of a SerDe
|
454
|
+
* @param comparator instance of a Comparator
|
481
455
|
* @param allocator instance of an Allocator
|
482
456
|
* @return an instance of a sketch
|
483
457
|
*/
|
484
|
-
template<typename SerDe =
|
485
|
-
static kll_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
|
458
|
+
template<typename SerDe = serde<T>>
|
459
|
+
static kll_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
|
460
|
+
const C& comparator = C(), const A& allocator = A());
|
486
461
|
|
487
462
|
/**
|
488
463
|
* This method deserializes a sketch from a given array of bytes.
|
489
464
|
* @param bytes pointer to the array of bytes
|
490
465
|
* @param size the size of the array
|
466
|
+
* @param sd instance of a SerDe
|
467
|
+
* @param comparator instance of a Comparator
|
491
468
|
* @param allocator instance of an Allocator
|
492
469
|
* @return an instance of a sketch
|
493
|
-
*
|
494
|
-
* Deprecated, to be removed in the next major version
|
495
470
|
*/
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
* This method deserializes a sketch from a given array of bytes.
|
500
|
-
* @param bytes pointer to the array of bytes
|
501
|
-
* @param size the size of the array
|
502
|
-
* @param serde instance of a SerDe
|
503
|
-
* @param allocator instance of an Allocator
|
504
|
-
* @return an instance of a sketch
|
505
|
-
*/
|
506
|
-
template<typename SerDe = S>
|
507
|
-
static kll_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
|
471
|
+
template<typename SerDe = serde<T>>
|
472
|
+
static kll_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
|
473
|
+
const C& comparator = C(), const A& allocator = A());
|
508
474
|
|
509
475
|
/*
|
510
476
|
* Gets the normalized rank error given k and pmf.
|
@@ -526,14 +492,7 @@ class kll_sketch {
|
|
526
492
|
const_iterator begin() const;
|
527
493
|
const_iterator end() const;
|
528
494
|
|
529
|
-
|
530
|
-
quantile_sketch_sorted_view<T, C, A> get_sorted_view(bool cumulative) const;
|
531
|
-
|
532
|
-
#ifdef KLL_VALIDATION
|
533
|
-
uint8_t get_num_levels() { return num_levels_; }
|
534
|
-
uint32_t* get_levels() { return levels_; }
|
535
|
-
T* get_items() { return items_; }
|
536
|
-
#endif
|
495
|
+
quantiles_sorted_view<T, C, A> get_sorted_view() const;
|
537
496
|
|
538
497
|
private:
|
539
498
|
/* Serialized sketch layout:
|
@@ -559,28 +518,30 @@ class kll_sketch {
|
|
559
518
|
static const uint8_t PREAMBLE_INTS_SHORT = 2; // for empty and single item
|
560
519
|
static const uint8_t PREAMBLE_INTS_FULL = 5;
|
561
520
|
|
521
|
+
C comparator_;
|
562
522
|
A allocator_;
|
563
523
|
uint16_t k_;
|
564
524
|
uint8_t m_; // minimum buffer "width"
|
565
525
|
uint16_t min_k_; // for error estimation after merging with different k
|
566
|
-
uint64_t n_;
|
567
526
|
uint8_t num_levels_;
|
568
|
-
|
527
|
+
bool is_level_zero_sorted_;
|
528
|
+
uint64_t n_;
|
529
|
+
vector_u32 levels_;
|
569
530
|
T* items_;
|
570
531
|
uint32_t items_size_;
|
571
|
-
T*
|
572
|
-
T*
|
573
|
-
|
532
|
+
T* min_item_;
|
533
|
+
T* max_item_;
|
534
|
+
mutable quantiles_sorted_view<T, C, A>* sorted_view_;
|
574
535
|
|
575
536
|
// for deserialization
|
576
537
|
class item_deleter;
|
577
538
|
class items_deleter;
|
578
|
-
kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32
|
579
|
-
std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter>
|
580
|
-
std::unique_ptr<T, item_deleter>
|
539
|
+
kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
|
540
|
+
std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_item,
|
541
|
+
std::unique_ptr<T, item_deleter> max_item, bool is_level_zero_sorted, const C& comparator);
|
581
542
|
|
582
543
|
// common update code
|
583
|
-
inline void update_min_max(const T&
|
544
|
+
inline void update_min_max(const T& item);
|
584
545
|
inline uint32_t internal_update();
|
585
546
|
|
586
547
|
// The following code is only valid in the special case of exactly reaching capacity while updating.
|
@@ -591,15 +552,6 @@ class kll_sketch {
|
|
591
552
|
void add_empty_top_level_to_completely_full_sketch();
|
592
553
|
void sort_level_zero();
|
593
554
|
|
594
|
-
template<bool inclusive>
|
595
|
-
vector_d<A> get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const;
|
596
|
-
template<bool inclusive>
|
597
|
-
void increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
|
598
|
-
const T* split_points, uint32_t size, double* buckets) const;
|
599
|
-
template<bool inclusive>
|
600
|
-
void increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
|
601
|
-
const T* split_points, uint32_t size, double* buckets) const;
|
602
|
-
|
603
555
|
template<typename O> void merge_higher_levels(O&& other, uint64_t final_n);
|
604
556
|
|
605
557
|
template<typename FwdSk>
|
@@ -616,43 +568,34 @@ class kll_sketch {
|
|
616
568
|
|
617
569
|
void check_sorting() const;
|
618
570
|
|
619
|
-
// implementations for floating point types
|
620
571
|
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
|
621
|
-
static
|
622
|
-
|
623
|
-
return value;
|
572
|
+
static inline bool check_update_item(TT item) {
|
573
|
+
return !std::isnan(item);
|
624
574
|
}
|
625
575
|
|
626
|
-
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
|
627
|
-
static inline bool check_update_value(TT value) {
|
628
|
-
return !std::isnan(value);
|
629
|
-
}
|
630
|
-
|
631
|
-
// implementations for all other types
|
632
576
|
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
|
633
|
-
static
|
634
|
-
throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of value");
|
635
|
-
}
|
636
|
-
|
637
|
-
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
|
638
|
-
static inline bool check_update_value(TT) {
|
577
|
+
static inline bool check_update_item(TT) {
|
639
578
|
return true;
|
640
579
|
}
|
641
580
|
|
642
581
|
// for type converting constructor
|
643
|
-
template<typename TT, typename CC, typename
|
644
|
-
|
582
|
+
template<typename TT, typename CC, typename AA> friend class kll_sketch;
|
583
|
+
|
584
|
+
void setup_sorted_view() const; // modifies mutable state
|
585
|
+
void reset_sorted_view();
|
645
586
|
};
|
646
587
|
|
647
|
-
template<typename T, typename C, typename
|
648
|
-
class kll_sketch<T, C,
|
588
|
+
template<typename T, typename C, typename A>
|
589
|
+
class kll_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
|
649
590
|
public:
|
650
|
-
|
591
|
+
using value_type = std::pair<const T&, const uint64_t>;
|
592
|
+
friend class kll_sketch<T, C, A>;
|
651
593
|
const_iterator& operator++();
|
652
594
|
const_iterator& operator++(int);
|
653
595
|
bool operator==(const const_iterator& other) const;
|
654
596
|
bool operator!=(const const_iterator& other) const;
|
655
|
-
const
|
597
|
+
const value_type operator*() const;
|
598
|
+
const return_value_holder<value_type> operator->() const;
|
656
599
|
private:
|
657
600
|
const T* items;
|
658
601
|
const uint32_t* levels;
|