datasketches 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -5
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
|
@@ -24,7 +24,7 @@
|
|
|
24
24
|
#include <memory>
|
|
25
25
|
#include <vector>
|
|
26
26
|
|
|
27
|
-
#include "
|
|
27
|
+
#include "quantiles_sorted_view.hpp"
|
|
28
28
|
#include "common_defs.hpp"
|
|
29
29
|
#include "serde.hpp"
|
|
30
30
|
|
|
@@ -32,22 +32,21 @@ namespace datasketches {
|
|
|
32
32
|
|
|
33
33
|
/**
|
|
34
34
|
* This is a stochastic streaming sketch that enables near-real time analysis of the
|
|
35
|
-
* approximate distribution
|
|
36
|
-
* The analysis is obtained using
|
|
37
|
-
* Probability Mass Function from
|
|
35
|
+
* approximate distribution from a very large stream in a single pass.
|
|
36
|
+
* The analysis is obtained using get_rank() and get_quantile() functions,
|
|
37
|
+
* the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF().
|
|
38
38
|
*
|
|
39
39
|
* <p>Consider a large stream of one million values such as packet sizes coming into a network node.
|
|
40
|
-
* The
|
|
40
|
+
* The natural rank of any specific size value is its index in the hypothetical sorted
|
|
41
41
|
* array of values.
|
|
42
|
-
* The normalized rank
|
|
42
|
+
* The normalized rank is the natural rank divided by the stream size,
|
|
43
43
|
* in this case one million.
|
|
44
44
|
* The value corresponding to the normalized rank of 0.5 represents the 50th percentile or median
|
|
45
|
-
* value of the distribution, or
|
|
46
|
-
*
|
|
47
|
-
* the sketch.</p>
|
|
45
|
+
* value of the distribution, or get_quantile(0.5). Similarly, the 95th percentile is obtained from
|
|
46
|
+
* get_quantile(0.95).</p>
|
|
48
47
|
*
|
|
49
48
|
* <p>From the min and max values, for example, 1 and 1000 bytes,
|
|
50
|
-
* you can obtain the PMF from
|
|
49
|
+
* you can obtain the PMF from get_PMF(100, 500, 900) that will result in an array of
|
|
51
50
|
* 4 fractional values such as {.4, .3, .2, .1}, which means that
|
|
52
51
|
* <ul>
|
|
53
52
|
* <li>40% of the values were < 100,</li>
|
|
@@ -55,20 +54,19 @@ namespace datasketches {
|
|
|
55
54
|
* <li>20% of the values were ≥ 500 and < 900, and</li>
|
|
56
55
|
* <li>10% of the values were ≥ 900.</li>
|
|
57
56
|
* </ul>
|
|
58
|
-
* A frequency histogram can be obtained by
|
|
57
|
+
* A frequency histogram can be obtained by multiplying these fractions by get_n(),
|
|
59
58
|
* which is the total count of values received.
|
|
60
|
-
* The
|
|
59
|
+
* The get_CDF() works similarly, but produces the cumulative distribution instead.
|
|
61
60
|
*
|
|
62
61
|
* <p>As of November 2021, this implementation produces serialized sketches which are binary-compatible
|
|
63
62
|
* with the equivalent Java implementation only when template parameter T = double
|
|
64
63
|
* (64-bit double precision values).
|
|
65
|
-
|
|
66
64
|
*
|
|
67
65
|
* <p>The accuracy of this sketch is a function of the configured value <i>k</i>, which also affects
|
|
68
66
|
* the overall size of the sketch. Accuracy of this quantile sketch is always with respect to
|
|
69
|
-
* the normalized rank.
|
|
70
|
-
* For example, the median
|
|
71
|
-
* from the hypothetically sorted array of input
|
|
67
|
+
* the normalized rank. A <i>k</i> of 128 produces a normalized rank error of about 1.7%.
|
|
68
|
+
* For example, the median item returned from get_quantile(0.5) will be between the actual items
|
|
69
|
+
* from the hypothetically sorted array of input items at normalized ranks of 0.483 and 0.517, with
|
|
72
70
|
* a confidence of about 99%.</p>
|
|
73
71
|
*
|
|
74
72
|
* <pre>
|
|
@@ -121,17 +119,17 @@ Table Guide for DoublesSketch Size in Bytes and Approximate Error:
|
|
|
121
119
|
* by Agarwal, Cormode, Huang, Phillips, Wei, and Yi.
|
|
122
120
|
* <a href="http://dblp.org/rec/html/journals/tods/AgarwalCHPWY13"></a></p>
|
|
123
121
|
*
|
|
124
|
-
* <p>This algorithm is independent of the distribution of
|
|
125
|
-
* requires only that the
|
|
122
|
+
* <p>This algorithm is independent of the distribution of items and
|
|
123
|
+
* requires only that the items be comparable.</p>
|
|
126
124
|
*
|
|
127
|
-
* <p>This algorithm intentionally inserts randomness into the sampling process for
|
|
125
|
+
* <p>This algorithm intentionally inserts randomness into the sampling process for items that
|
|
128
126
|
* ultimately get retained in the sketch. The results produced by this algorithm are not
|
|
129
127
|
* deterministic. For example, if the same stream is inserted into two different instances of this
|
|
130
128
|
* sketch, the answers obtained from the two sketches may not be identical.</p>
|
|
131
129
|
*
|
|
132
|
-
* <p>Similarly, there may be directional inconsistencies. For example, the
|
|
133
|
-
*
|
|
134
|
-
*
|
|
130
|
+
* <p>Similarly, there may be directional inconsistencies. For example, the result
|
|
131
|
+
* obtained from get_quantile(rank) input into the reverse directional query
|
|
132
|
+
* get_rank(item) may not result in the original item.</p>
|
|
135
133
|
*
|
|
136
134
|
* @author Kevin Lang
|
|
137
135
|
* @author Lee Rhodes
|
|
@@ -153,9 +151,9 @@ public:
|
|
|
153
151
|
using value_type = T;
|
|
154
152
|
using allocator_type = Allocator;
|
|
155
153
|
using comparator = Comparator;
|
|
156
|
-
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
|
|
157
154
|
|
|
158
|
-
explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K,
|
|
155
|
+
explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K,
|
|
156
|
+
const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
|
159
157
|
quantiles_sketch(const quantiles_sketch& other);
|
|
160
158
|
quantiles_sketch(quantiles_sketch&& other) noexcept;
|
|
161
159
|
~quantiles_sketch();
|
|
@@ -165,17 +163,19 @@ public:
|
|
|
165
163
|
/**
|
|
166
164
|
* @brief Type converting constructor
|
|
167
165
|
* @param other quantiles sketch of a different type
|
|
166
|
+
* @param comparator instance of a Comparator
|
|
168
167
|
* @param allocator instance of an Allocator
|
|
169
168
|
*/
|
|
170
169
|
template<typename From, typename FC, typename FA>
|
|
171
|
-
explicit quantiles_sketch(const quantiles_sketch<From, FC, FA>& other,
|
|
170
|
+
explicit quantiles_sketch(const quantiles_sketch<From, FC, FA>& other,
|
|
171
|
+
const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
|
172
172
|
|
|
173
173
|
/**
|
|
174
174
|
* Updates this sketch with the given data item.
|
|
175
|
-
* @param
|
|
175
|
+
* @param item from a stream of items
|
|
176
176
|
*/
|
|
177
177
|
template<typename FwdT>
|
|
178
|
-
void update(FwdT&&
|
|
178
|
+
void update(FwdT&& item);
|
|
179
179
|
|
|
180
180
|
/**
|
|
181
181
|
* Merges another sketch into this one.
|
|
@@ -215,20 +215,18 @@ public:
|
|
|
215
215
|
bool is_estimation_mode() const;
|
|
216
216
|
|
|
217
217
|
/**
|
|
218
|
-
* Returns the min
|
|
219
|
-
*
|
|
220
|
-
*
|
|
221
|
-
* @return the min value of the stream
|
|
218
|
+
* Returns the min item of the stream.
|
|
219
|
+
* If the sketch is empty this throws std::runtime_error.
|
|
220
|
+
* @return the min item of the stream
|
|
222
221
|
*/
|
|
223
|
-
const T&
|
|
222
|
+
const T& get_min_item() const;
|
|
224
223
|
|
|
225
224
|
/**
|
|
226
|
-
* Returns the max
|
|
227
|
-
*
|
|
228
|
-
*
|
|
229
|
-
* @return the max value of the stream
|
|
225
|
+
* Returns the max item of the stream.
|
|
226
|
+
* If the sketch is empty this throws std::runtime_error.
|
|
227
|
+
* @return the max item of the stream
|
|
230
228
|
*/
|
|
231
|
-
const T&
|
|
229
|
+
const T& get_max_item() const;
|
|
232
230
|
|
|
233
231
|
/**
|
|
234
232
|
* Returns an instance of the comparator for this sketch.
|
|
@@ -243,166 +241,147 @@ public:
|
|
|
243
241
|
allocator_type get_allocator() const;
|
|
244
242
|
|
|
245
243
|
/**
|
|
246
|
-
* Returns an approximation to the
|
|
247
|
-
*
|
|
248
|
-
* version of the input stream so far.
|
|
249
|
-
* <p>
|
|
250
|
-
* Note that this method has a fairly large overhead (microseconds instead of nanoseconds)
|
|
251
|
-
* so it should not be called multiple times to get different quantiles from the same
|
|
252
|
-
* sketch. Instead use get_quantiles(), which pays the overhead only once.
|
|
244
|
+
* Returns an approximation to the data item associated with the given rank
|
|
245
|
+
* of a hypothetical sorted version of the input stream so far.
|
|
253
246
|
* <p>
|
|
254
|
-
*
|
|
255
|
-
* For other types: if the sketch is empty this throws runtime_error.
|
|
247
|
+
* If the sketch is empty this throws std::runtime_error.
|
|
256
248
|
*
|
|
257
|
-
* @param rank the specified
|
|
258
|
-
* These are also called normalized ranks or fractional ranks.
|
|
259
|
-
* If rank = 0.0, the true minimum value of the stream is returned.
|
|
260
|
-
* If rank = 1.0, the true maximum value of the stream is returned.
|
|
249
|
+
* @param rank the specified normalized rank in the hypothetical sorted stream.
|
|
261
250
|
*
|
|
262
|
-
* @return the approximation to the
|
|
251
|
+
* @return the approximation to the item at the given rank
|
|
263
252
|
*/
|
|
264
|
-
using quantile_return_type = typename
|
|
265
|
-
|
|
266
|
-
quantile_return_type get_quantile(double rank) const;
|
|
253
|
+
using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
|
|
254
|
+
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
|
|
267
255
|
|
|
268
256
|
/**
|
|
269
|
-
* This is a
|
|
257
|
+
* This is a multiple-query version of get_quantile().
|
|
270
258
|
* <p>
|
|
271
259
|
* This returns an array that could have been generated by using get_quantile() for each
|
|
272
|
-
*
|
|
273
|
-
* This method incurs the internal set-up overhead once and obtains multiple quantile values in
|
|
274
|
-
* a single query. It is strongly recommend that this method be used instead of multiple calls
|
|
275
|
-
* to get_quantile().
|
|
260
|
+
* normalized rank separately.
|
|
276
261
|
*
|
|
277
|
-
* <p>If the sketch is empty this
|
|
262
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
|
278
263
|
*
|
|
279
|
-
* @param
|
|
280
|
-
* These
|
|
281
|
-
*
|
|
264
|
+
* @param ranks given array of normalized ranks in the hypothetical sorted stream.
|
|
265
|
+
* These ranks must be in the interval [0.0, 1.0], inclusive.
|
|
266
|
+
* @param size the number of ranks in the array
|
|
282
267
|
*
|
|
283
|
-
* @return array of approximations to
|
|
268
|
+
* @return array of approximations to items associated with given ranks in the same order as given ranks
|
|
284
269
|
* in the input array.
|
|
270
|
+
*
|
|
271
|
+
* Deprecated. Will be removed in the next major version. Use get_quantile() instead.
|
|
285
272
|
*/
|
|
286
|
-
|
|
287
|
-
std::vector<T, Allocator> get_quantiles(const double* fractions, uint32_t size) const;
|
|
273
|
+
std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
|
|
288
274
|
|
|
289
275
|
/**
|
|
290
276
|
* This is a multiple-query version of get_quantile() that allows the caller to
|
|
291
|
-
* specify the number of evenly-spaced
|
|
277
|
+
* specify the number of evenly-spaced normalized ranks.
|
|
278
|
+
*
|
|
279
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
|
292
280
|
*
|
|
293
|
-
*
|
|
281
|
+
* @param num an integer that specifies the number of evenly-spaced ranks.
|
|
282
|
+
* This must be an integer greater than 0. A value of 1 is equivalent to get_quantiles([0]).
|
|
283
|
+
* A value of 2 is equivalent to get_quantiles([0, 1]). A value of 3 is equivalent to
|
|
284
|
+
* get_quantiles([0, 0.5, 1]), etc.
|
|
294
285
|
*
|
|
295
|
-
* @
|
|
296
|
-
* This must be an integer greater than 0. A value of 1 will return the min value.
|
|
297
|
-
* A value of 2 will return the min and the max value. A value of 3 will return the min,
|
|
298
|
-
* the median and the max value, etc.
|
|
286
|
+
* @return array of approximations to items associated with the given number of evenly-spaced normalized ranks.
|
|
299
287
|
*
|
|
300
|
-
*
|
|
288
|
+
* Deprecated. Will be removed in the next major version. Use get_quantile() instead.
|
|
301
289
|
*/
|
|
302
|
-
|
|
303
|
-
std::vector<T, Allocator> get_quantiles(uint32_t num) const;
|
|
290
|
+
std::vector<T, Allocator> get_quantiles(uint32_t num, bool inclusive = true) const;
|
|
304
291
|
|
|
305
292
|
/**
|
|
306
|
-
* Returns an approximation to the normalized
|
|
307
|
-
* inclusive. When template parameter <em>inclusive=false</em> (the default), only elements strictly
|
|
308
|
-
* less than the provided value are included in the rank estimate. With <em>inclusive=true</em>,
|
|
309
|
-
* the rank estimate includes elements less than or equal to the provided value.
|
|
293
|
+
* Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
|
|
310
294
|
*
|
|
311
295
|
* <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
|
|
312
296
|
* get_normalized_rank_error(false) function.
|
|
313
297
|
*
|
|
314
|
-
* <p>If the sketch is empty this
|
|
298
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
|
315
299
|
*
|
|
316
|
-
* @param
|
|
317
|
-
* @
|
|
300
|
+
* @param item to be ranked
|
|
301
|
+
* @param inclusive if true the weight of the given item is included into the rank.
|
|
302
|
+
* Otherwise the rank equals the sum of the weights of all items that are less than the given item
|
|
303
|
+
* according to the comparator C.
|
|
304
|
+
* @return an approximate normalized rank of the given item
|
|
318
305
|
*/
|
|
319
|
-
|
|
320
|
-
double get_rank(const T& value) const;
|
|
306
|
+
double get_rank(const T& item, bool inclusive = true) const;
|
|
321
307
|
|
|
322
308
|
/**
|
|
323
309
|
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
|
|
324
|
-
* given a set of split points (
|
|
310
|
+
* given a set of split points (items).
|
|
325
311
|
*
|
|
326
312
|
* <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
|
|
327
313
|
* get_normalized_rank_error(true) function.
|
|
328
314
|
*
|
|
329
|
-
* <p>If the sketch is empty this
|
|
315
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
|
330
316
|
*
|
|
331
|
-
* @param split_points an array of <i>m</i> unique, monotonically increasing
|
|
332
|
-
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
|
|
333
|
-
*
|
|
334
|
-
*
|
|
335
|
-
*
|
|
336
|
-
*
|
|
337
|
-
* the
|
|
338
|
-
*
|
|
317
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing items
|
|
318
|
+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
|
|
319
|
+
*
|
|
320
|
+
* @param size of the array of split points.
|
|
321
|
+
*
|
|
322
|
+
* @param inclusive if true the rank of an item includes its own weight, and therefore
|
|
323
|
+
* if the sketch contains items equal to a split point, then in PMF such items are
|
|
324
|
+
* included into the interval to the left of split point. Otherwise they are included into the interval
|
|
325
|
+
* to the right of split point.
|
|
339
326
|
*
|
|
340
327
|
* @return an array of m+1 doubles each of which is an approximation
|
|
341
|
-
* to the fraction of the input stream
|
|
342
|
-
* When <em>inclusive=false</em> (the default), the definition of an "interval" is inclusive
|
|
343
|
-
* of the left split point and exclusive of the right split point, with the exception that the last
|
|
344
|
-
* interval will include the maximum value. When <em>inclusive=true</em>,
|
|
345
|
-
* an "interval" is exclusive of the left split point and inclusive of the right.
|
|
328
|
+
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
|
|
346
329
|
*/
|
|
347
|
-
|
|
348
|
-
vector_double get_PMF(const T* split_points, uint32_t size) const;
|
|
330
|
+
using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
|
|
331
|
+
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
|
|
349
332
|
|
|
350
333
|
/**
|
|
351
334
|
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
|
|
352
|
-
* cumulative analog of the PMF, of the input stream given a set of split points (
|
|
335
|
+
* cumulative analog of the PMF, of the input stream given a set of split points (items).
|
|
353
336
|
*
|
|
354
337
|
* <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
|
|
355
338
|
* get_normalized_rank_error(false) function.
|
|
356
339
|
*
|
|
357
|
-
* <p>If the sketch is empty this
|
|
340
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
|
358
341
|
*
|
|
359
|
-
* @param split_points an array of <i>m</i> unique, monotonically increasing
|
|
342
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing items
|
|
360
343
|
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
|
|
361
|
-
*
|
|
362
|
-
*
|
|
363
|
-
*
|
|
364
|
-
*
|
|
365
|
-
* the
|
|
366
|
-
*
|
|
344
|
+
*
|
|
345
|
+
* @param size of the array of split points.
|
|
346
|
+
*
|
|
347
|
+
* @param inclusive if true the rank of an item includes its own weight, and therefore
|
|
348
|
+
* if the sketch contains items equal to a split point, then in CDF such items are
|
|
349
|
+
* included into the interval to the left of split point. Otherwise they are included into
|
|
350
|
+
* the interval to the right of split point.
|
|
367
351
|
*
|
|
368
352
|
* @return an array of m+1 double values, which are a consecutive approximation to the CDF
|
|
369
353
|
* of the input stream given the split_points. The value at array position j of the returned
|
|
370
354
|
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
|
|
371
|
-
* array.
|
|
372
|
-
*
|
|
373
|
-
* of the left split point and exclusive of the right split point, with the exception that the last
|
|
374
|
-
* interval will include the maximum value. When <em>inclusive=true</em>,
|
|
375
|
-
* an "interval" is exclusive of the left split point and inclusive of the right.
|
|
376
|
-
|
|
355
|
+
* array. This can be viewed as array of ranks of the given split points plus one more value
|
|
356
|
+
* that is always 1.
|
|
377
357
|
*/
|
|
378
|
-
|
|
379
|
-
vector_double get_CDF(const T* split_points, uint32_t size) const;
|
|
358
|
+
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
|
|
380
359
|
|
|
381
360
|
/**
|
|
382
361
|
* Computes size needed to serialize the current state of the sketch.
|
|
383
362
|
* This version is for fixed-size arithmetic types (integral and floating point).
|
|
384
|
-
* @param instance of a SerDe
|
|
363
|
+
* @param sd instance of a SerDe
|
|
385
364
|
* @return size in bytes needed to serialize this sketch
|
|
386
365
|
*/
|
|
387
366
|
template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
|
388
|
-
size_t get_serialized_size_bytes(const SerDe&
|
|
367
|
+
size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
|
|
389
368
|
|
|
390
369
|
/**
|
|
391
370
|
* Computes size needed to serialize the current state of the sketch.
|
|
392
371
|
* This version is for all other types and can be expensive since every item needs to be looked at.
|
|
393
|
-
* @param instance of a SerDe
|
|
372
|
+
* @param sd instance of a SerDe
|
|
394
373
|
* @return size in bytes needed to serialize this sketch
|
|
395
374
|
*/
|
|
396
375
|
template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
|
397
|
-
size_t get_serialized_size_bytes(const SerDe&
|
|
376
|
+
size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
|
|
398
377
|
|
|
399
378
|
/**
|
|
400
379
|
* This method serializes the sketch into a given stream in a binary form
|
|
401
380
|
* @param os output stream
|
|
402
|
-
* @param instance of a SerDe
|
|
381
|
+
* @param sd instance of a SerDe
|
|
403
382
|
*/
|
|
404
383
|
template<typename SerDe = serde<T>>
|
|
405
|
-
void serialize(std::ostream& os, const SerDe&
|
|
384
|
+
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
|
406
385
|
|
|
407
386
|
// This is a convenience alias for users
|
|
408
387
|
// The type returned by the following serialize method
|
|
@@ -414,32 +393,36 @@ public:
|
|
|
414
393
|
* It is a blank space of a given size.
|
|
415
394
|
* This header is used in Datasketches PostgreSQL extension.
|
|
416
395
|
* @param header_size_bytes space to reserve in front of the sketch
|
|
417
|
-
* @param instance of a SerDe
|
|
396
|
+
* @param sd instance of a SerDe
|
|
418
397
|
* @return serialized sketch as a vector of bytes
|
|
419
398
|
*/
|
|
420
399
|
template<typename SerDe = serde<T>>
|
|
421
|
-
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe&
|
|
400
|
+
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
|
422
401
|
|
|
423
402
|
/**
|
|
424
403
|
* This method deserializes a sketch from a given stream.
|
|
425
404
|
* @param is input stream
|
|
426
|
-
* @param instance of a SerDe
|
|
427
|
-
* @param instance of
|
|
405
|
+
* @param sd instance of a SerDe
|
|
406
|
+
* @param comparator instance of a Comparator
|
|
407
|
+
* @param allocator instance of an Allocator
|
|
428
408
|
* @return an instance of a sketch
|
|
429
409
|
*/
|
|
430
410
|
template<typename SerDe = serde<T>>
|
|
431
|
-
static quantiles_sketch deserialize(std::istream& is, const SerDe&
|
|
411
|
+
static quantiles_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
|
|
412
|
+
const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
|
432
413
|
|
|
433
414
|
/**
|
|
434
415
|
* This method deserializes a sketch from a given array of bytes.
|
|
435
416
|
* @param bytes pointer to the array of bytes
|
|
436
417
|
* @param size the size of the array
|
|
437
|
-
* @param instance of a SerDe
|
|
438
|
-
* @param instance of
|
|
418
|
+
* @param sd instance of a SerDe
|
|
419
|
+
* @param comparator instance of a Comparator
|
|
420
|
+
* @param allocator instance of an Allocator
|
|
439
421
|
* @return an instance of a sketch
|
|
440
422
|
*/
|
|
441
423
|
template<typename SerDe = serde<T>>
|
|
442
|
-
static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe&
|
|
424
|
+
static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
|
|
425
|
+
const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
|
443
426
|
|
|
444
427
|
/**
|
|
445
428
|
* Gets the normalized rank error for this sketch. Constants were derived as the best fit to 99 percentile
|
|
@@ -471,8 +454,7 @@ public:
|
|
|
471
454
|
const_iterator begin() const;
|
|
472
455
|
const_iterator end() const;
|
|
473
456
|
|
|
474
|
-
|
|
475
|
-
quantile_sketch_sorted_view<T, Comparator, Allocator> get_sorted_view(bool cumulative) const;
|
|
457
|
+
quantiles_sorted_view<T, Comparator, Allocator> get_sorted_view() const;
|
|
476
458
|
|
|
477
459
|
private:
|
|
478
460
|
using Level = std::vector<T, Allocator>;
|
|
@@ -487,7 +469,7 @@ private:
|
|
|
487
469
|
* || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
|
|
488
470
|
* 1 ||---------------------------Items Seen Count (N)--------------------------------|
|
|
489
471
|
*
|
|
490
|
-
* Long 3 is the start of data, beginning with serialized min and max
|
|
472
|
+
* Long 3 is the start of data, beginning with serialized min and max item, followed by
|
|
491
473
|
* the sketch data buffers.
|
|
492
474
|
*/
|
|
493
475
|
|
|
@@ -503,23 +485,28 @@ private:
|
|
|
503
485
|
static const uint8_t PREAMBLE_LONGS_FULL = 2;
|
|
504
486
|
static const size_t DATA_START = 16;
|
|
505
487
|
|
|
488
|
+
Comparator comparator_;
|
|
506
489
|
Allocator allocator_;
|
|
490
|
+
bool is_base_buffer_sorted_;
|
|
507
491
|
uint16_t k_;
|
|
508
492
|
uint64_t n_;
|
|
509
493
|
uint64_t bit_pattern_;
|
|
510
494
|
Level base_buffer_;
|
|
511
495
|
VectorLevels levels_;
|
|
512
|
-
T*
|
|
513
|
-
T*
|
|
514
|
-
|
|
496
|
+
T* min_item_;
|
|
497
|
+
T* max_item_;
|
|
498
|
+
mutable quantiles_sorted_view<T, Comparator, Allocator>* sorted_view_;
|
|
499
|
+
|
|
500
|
+
void setup_sorted_view() const; // modifies mutable state
|
|
501
|
+
void reset_sorted_view();
|
|
515
502
|
|
|
516
503
|
// for deserialization
|
|
517
504
|
class item_deleter;
|
|
518
505
|
class items_deleter;
|
|
519
506
|
quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
|
|
520
507
|
Level&& base_buffer, VectorLevels&& levels,
|
|
521
|
-
std::unique_ptr<T, item_deleter>
|
|
522
|
-
bool is_sorted, const Allocator& allocator = Allocator());
|
|
508
|
+
std::unique_ptr<T, item_deleter> min_item, std::unique_ptr<T, item_deleter> max_item,
|
|
509
|
+
bool is_sorted, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
|
523
510
|
|
|
524
511
|
void grow_base_buffer();
|
|
525
512
|
void process_full_base_buffer();
|
|
@@ -533,7 +520,7 @@ private:
|
|
|
533
520
|
Level& buf_size_2k, bool apply_as_update,
|
|
534
521
|
quantiles_sketch& sketch);
|
|
535
522
|
static void zip_buffer(Level& buf_in, Level& buf_out);
|
|
536
|
-
static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out);
|
|
523
|
+
static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out, const Comparator& comparator);
|
|
537
524
|
|
|
538
525
|
template<typename SerDe>
|
|
539
526
|
static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
|
|
@@ -549,7 +536,7 @@ private:
|
|
|
549
536
|
static uint32_t compute_retained_items(uint16_t k, uint64_t n);
|
|
550
537
|
static uint32_t compute_base_buffer_items(uint16_t k, uint64_t n);
|
|
551
538
|
static uint64_t compute_bit_pattern(uint16_t k, uint64_t n);
|
|
552
|
-
static uint32_t
|
|
539
|
+
static uint32_t count_valid_levels(uint64_t bit_pattern);
|
|
553
540
|
static uint8_t compute_levels_needed(uint16_t k, uint64_t n);
|
|
554
541
|
|
|
555
542
|
/**
|
|
@@ -580,60 +567,28 @@ private:
|
|
|
580
567
|
*/
|
|
581
568
|
static uint8_t lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit);
|
|
582
569
|
|
|
583
|
-
// implementations for floating point types
|
|
584
570
|
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
|
|
585
|
-
static
|
|
586
|
-
|
|
587
|
-
return value;
|
|
571
|
+
static inline bool check_update_item(TT item) {
|
|
572
|
+
return !std::isnan(item);
|
|
588
573
|
}
|
|
589
574
|
|
|
590
|
-
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
|
|
591
|
-
static inline bool check_update_value(TT value) {
|
|
592
|
-
return !std::isnan(value);
|
|
593
|
-
}
|
|
594
|
-
|
|
595
|
-
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
|
|
596
|
-
static inline void check_split_points(const T* values, uint32_t size) {
|
|
597
|
-
for (uint32_t i = 0; i < size ; i++) {
|
|
598
|
-
if (std::isnan(values[i])) {
|
|
599
|
-
throw std::invalid_argument("Values must not be NaN");
|
|
600
|
-
}
|
|
601
|
-
if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
|
|
602
|
-
throw std::invalid_argument("Values must be unique and monotonically increasing");
|
|
603
|
-
}
|
|
604
|
-
}
|
|
605
|
-
}
|
|
606
|
-
|
|
607
|
-
// implementations for all other types
|
|
608
575
|
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
|
|
609
|
-
static
|
|
610
|
-
throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values");
|
|
611
|
-
}
|
|
612
|
-
|
|
613
|
-
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
|
|
614
|
-
static inline bool check_update_value(TT) {
|
|
576
|
+
static inline bool check_update_item(TT) {
|
|
615
577
|
return true;
|
|
616
578
|
}
|
|
617
|
-
|
|
618
|
-
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
|
|
619
|
-
static inline void check_split_points(const T* values, uint32_t size) {
|
|
620
|
-
for (uint32_t i = 0; i < size ; i++) {
|
|
621
|
-
if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
|
|
622
|
-
throw std::invalid_argument("Values must be unique and monotonically increasing");
|
|
623
|
-
}
|
|
624
|
-
}
|
|
625
|
-
}
|
|
626
579
|
};
|
|
627
580
|
|
|
628
581
|
|
|
629
582
|
template<typename T, typename C, typename A>
|
|
630
583
|
class quantiles_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
|
|
631
584
|
public:
|
|
585
|
+
using value_type = std::pair<const T&, const uint64_t>;
|
|
632
586
|
const_iterator& operator++();
|
|
633
587
|
const_iterator& operator++(int);
|
|
634
588
|
bool operator==(const const_iterator& other) const;
|
|
635
589
|
bool operator!=(const const_iterator& other) const;
|
|
636
|
-
|
|
590
|
+
const value_type operator*() const;
|
|
591
|
+
const return_value_holder<value_type> operator->() const;
|
|
637
592
|
private:
|
|
638
593
|
friend class quantiles_sketch<T, C, A>;
|
|
639
594
|
using Level = std::vector<T, A>;
|