datasketches 0.2.6 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/LICENSE +4 -6
- data/NOTICE +6 -5
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/LICENSE +4 -6
- data/vendor/datasketches-cpp/MANIFEST.in +21 -4
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/{test/test_runner.cpp → include/version.hpp.in} +15 -8
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +37 -7
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +22 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +1 -1
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +17 -10
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +55 -42
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +4 -4
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +27 -27
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +197 -233
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +42 -32
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +17 -13
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +1 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +19 -1
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +20 -19
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +241 -233
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +27 -27
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +117 -104
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +3 -3
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +5 -5
- data/vendor/datasketches-cpp/setup.py +14 -3
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +3 -2
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +41 -35
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +27 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -7
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
- data/vendor/datasketches-cpp/common/test/catch.hpp +0 -17618
@@ -24,7 +24,7 @@
|
|
24
24
|
#include <memory>
|
25
25
|
#include <vector>
|
26
26
|
|
27
|
-
#include "
|
27
|
+
#include "quantiles_sorted_view.hpp"
|
28
28
|
#include "common_defs.hpp"
|
29
29
|
#include "serde.hpp"
|
30
30
|
|
@@ -32,22 +32,21 @@ namespace datasketches {
|
|
32
32
|
|
33
33
|
/**
|
34
34
|
* This is a stochastic streaming sketch that enables near-real time analysis of the
|
35
|
-
* approximate distribution
|
36
|
-
* The analysis is obtained using
|
37
|
-
* Probability Mass Function from
|
35
|
+
* approximate distribution from a very large stream in a single pass.
|
36
|
+
* The analysis is obtained using get_rank() and get_quantile() functions,
|
37
|
+
* the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF().
|
38
38
|
*
|
39
39
|
* <p>Consider a large stream of one million values such as packet sizes coming into a network node.
|
40
|
-
* The
|
40
|
+
* The natural rank of any specific size value is its index in the hypothetical sorted
|
41
41
|
* array of values.
|
42
|
-
* The normalized rank
|
42
|
+
* The normalized rank is the natural rank divided by the stream size,
|
43
43
|
* in this case one million.
|
44
44
|
* The value corresponding to the normalized rank of 0.5 represents the 50th percentile or median
|
45
|
-
* value of the distribution, or
|
46
|
-
*
|
47
|
-
* the sketch.</p>
|
45
|
+
* value of the distribution, or get_quantile(0.5). Similarly, the 95th percentile is obtained from
|
46
|
+
* get_quantile(0.95).</p>
|
48
47
|
*
|
49
48
|
* <p>From the min and max values, for example, 1 and 1000 bytes,
|
50
|
-
* you can obtain the PMF from
|
49
|
+
* you can obtain the PMF from get_PMF(100, 500, 900) that will result in an array of
|
51
50
|
* 4 fractional values such as {.4, .3, .2, .1}, which means that
|
52
51
|
* <ul>
|
53
52
|
* <li>40% of the values were < 100,</li>
|
@@ -55,20 +54,19 @@ namespace datasketches {
|
|
55
54
|
* <li>20% of the values were ≥ 500 and < 900, and</li>
|
56
55
|
* <li>10% of the values were ≥ 900.</li>
|
57
56
|
* </ul>
|
58
|
-
* A frequency histogram can be obtained by
|
57
|
+
* A frequency histogram can be obtained by multiplying these fractions by get_n(),
|
59
58
|
* which is the total count of values received.
|
60
|
-
* The
|
59
|
+
* The get_CDF() works similarly, but produces the cumulative distribution instead.
|
61
60
|
*
|
62
61
|
* <p>As of November 2021, this implementation produces serialized sketches which are binary-compatible
|
63
62
|
* with the equivalent Java implementation only when template parameter T = double
|
64
63
|
* (64-bit double precision values).
|
65
|
-
|
66
64
|
*
|
67
65
|
* <p>The accuracy of this sketch is a function of the configured value <i>k</i>, which also affects
|
68
66
|
* the overall size of the sketch. Accuracy of this quantile sketch is always with respect to
|
69
|
-
* the normalized rank.
|
70
|
-
* For example, the median
|
71
|
-
* from the hypothetically sorted array of input
|
67
|
+
* the normalized rank. A <i>k</i> of 128 produces a normalized rank error of about 1.7%.
|
68
|
+
* For example, the median item returned from get_quantile(0.5) will be between the actual items
|
69
|
+
* from the hypothetically sorted array of input items at normalized ranks of 0.483 and 0.517, with
|
72
70
|
* a confidence of about 99%.</p>
|
73
71
|
*
|
74
72
|
* <pre>
|
@@ -121,17 +119,17 @@ Table Guide for DoublesSketch Size in Bytes and Approximate Error:
|
|
121
119
|
* by Agarwal, Cormode, Huang, Phillips, Wei, and Yi.
|
122
120
|
* <a href="http://dblp.org/rec/html/journals/tods/AgarwalCHPWY13"></a></p>
|
123
121
|
*
|
124
|
-
* <p>This algorithm is independent of the distribution of
|
125
|
-
* requires only that the
|
122
|
+
* <p>This algorithm is independent of the distribution of items and
|
123
|
+
* requires only that the items be comparable.</p>
|
126
124
|
*
|
127
|
-
* <p>This algorithm intentionally inserts randomness into the sampling process for
|
125
|
+
* <p>This algorithm intentionally inserts randomness into the sampling process for items that
|
128
126
|
* ultimately get retained in the sketch. The results produced by this algorithm are not
|
129
127
|
* deterministic. For example, if the same stream is inserted into two different instances of this
|
130
128
|
* sketch, the answers obtained from the two sketches may not be identical.</p>
|
131
129
|
*
|
132
|
-
* <p>Similarly, there may be directional inconsistencies. For example, the
|
133
|
-
*
|
134
|
-
*
|
130
|
+
* <p>Similarly, there may be directional inconsistencies. For example, the result
|
131
|
+
* obtained from get_quantile(rank) input into the reverse directional query
|
132
|
+
* get_rank(item) may not result in the original item.</p>
|
135
133
|
*
|
136
134
|
* @author Kevin Lang
|
137
135
|
* @author Lee Rhodes
|
@@ -153,9 +151,9 @@ public:
|
|
153
151
|
using value_type = T;
|
154
152
|
using allocator_type = Allocator;
|
155
153
|
using comparator = Comparator;
|
156
|
-
using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
|
157
154
|
|
158
|
-
explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K,
|
155
|
+
explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K,
|
156
|
+
const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
159
157
|
quantiles_sketch(const quantiles_sketch& other);
|
160
158
|
quantiles_sketch(quantiles_sketch&& other) noexcept;
|
161
159
|
~quantiles_sketch();
|
@@ -165,17 +163,19 @@ public:
|
|
165
163
|
/**
|
166
164
|
* @brief Type converting constructor
|
167
165
|
* @param other quantiles sketch of a different type
|
166
|
+
* @param comparator instance of a Comparator
|
168
167
|
* @param allocator instance of an Allocator
|
169
168
|
*/
|
170
169
|
template<typename From, typename FC, typename FA>
|
171
|
-
explicit quantiles_sketch(const quantiles_sketch<From, FC, FA>& other,
|
170
|
+
explicit quantiles_sketch(const quantiles_sketch<From, FC, FA>& other,
|
171
|
+
const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
172
172
|
|
173
173
|
/**
|
174
174
|
* Updates this sketch with the given data item.
|
175
|
-
* @param
|
175
|
+
* @param item from a stream of items
|
176
176
|
*/
|
177
177
|
template<typename FwdT>
|
178
|
-
void update(FwdT&&
|
178
|
+
void update(FwdT&& item);
|
179
179
|
|
180
180
|
/**
|
181
181
|
* Merges another sketch into this one.
|
@@ -215,20 +215,18 @@ public:
|
|
215
215
|
bool is_estimation_mode() const;
|
216
216
|
|
217
217
|
/**
|
218
|
-
* Returns the min
|
219
|
-
*
|
220
|
-
*
|
221
|
-
* @return the min value of the stream
|
218
|
+
* Returns the min item of the stream.
|
219
|
+
* If the sketch is empty this throws std::runtime_error.
|
220
|
+
* @return the min item of the stream
|
222
221
|
*/
|
223
|
-
const T&
|
222
|
+
const T& get_min_item() const;
|
224
223
|
|
225
224
|
/**
|
226
|
-
* Returns the max
|
227
|
-
*
|
228
|
-
*
|
229
|
-
* @return the max value of the stream
|
225
|
+
* Returns the max item of the stream.
|
226
|
+
* If the sketch is empty this throws std::runtime_error.
|
227
|
+
* @return the max item of the stream
|
230
228
|
*/
|
231
|
-
const T&
|
229
|
+
const T& get_max_item() const;
|
232
230
|
|
233
231
|
/**
|
234
232
|
* Returns an instance of the comparator for this sketch.
|
@@ -243,166 +241,147 @@ public:
|
|
243
241
|
allocator_type get_allocator() const;
|
244
242
|
|
245
243
|
/**
|
246
|
-
* Returns an approximation to the
|
247
|
-
*
|
248
|
-
* version of the input stream so far.
|
249
|
-
* <p>
|
250
|
-
* Note that this method has a fairly large overhead (microseconds instead of nanoseconds)
|
251
|
-
* so it should not be called multiple times to get different quantiles from the same
|
252
|
-
* sketch. Instead use get_quantiles(), which pays the overhead only once.
|
244
|
+
* Returns an approximation to the data item associated with the given rank
|
245
|
+
* of a hypothetical sorted version of the input stream so far.
|
253
246
|
* <p>
|
254
|
-
*
|
255
|
-
* For other types: if the sketch is empty this throws runtime_error.
|
247
|
+
* If the sketch is empty this throws std::runtime_error.
|
256
248
|
*
|
257
|
-
* @param rank the specified
|
258
|
-
* These are also called normalized ranks or fractional ranks.
|
259
|
-
* If rank = 0.0, the true minimum value of the stream is returned.
|
260
|
-
* If rank = 1.0, the true maximum value of the stream is returned.
|
249
|
+
* @param rank the specified normalized rank in the hypothetical sorted stream.
|
261
250
|
*
|
262
|
-
* @return the approximation to the
|
251
|
+
* @return the approximation to the item at the given rank
|
263
252
|
*/
|
264
|
-
using quantile_return_type = typename
|
265
|
-
|
266
|
-
quantile_return_type get_quantile(double rank) const;
|
253
|
+
using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
|
254
|
+
quantile_return_type get_quantile(double rank, bool inclusive = true) const;
|
267
255
|
|
268
256
|
/**
|
269
|
-
* This is a
|
257
|
+
* This is a multiple-query version of get_quantile().
|
270
258
|
* <p>
|
271
259
|
* This returns an array that could have been generated by using get_quantile() for each
|
272
|
-
*
|
273
|
-
* This method incurs the internal set-up overhead once and obtains multiple quantile values in
|
274
|
-
* a single query. It is strongly recommend that this method be used instead of multiple calls
|
275
|
-
* to get_quantile().
|
260
|
+
* normalized rank separately.
|
276
261
|
*
|
277
|
-
* <p>If the sketch is empty this
|
262
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
278
263
|
*
|
279
|
-
* @param
|
280
|
-
* These
|
281
|
-
*
|
264
|
+
* @param ranks given array of normalized ranks in the hypothetical sorted stream.
|
265
|
+
* These ranks must be in the interval [0.0, 1.0], inclusive.
|
266
|
+
* @param size the number of ranks in the array
|
282
267
|
*
|
283
|
-
* @return array of approximations to
|
268
|
+
* @return array of approximations to items associated with given ranks in the same order as given ranks
|
284
269
|
* in the input array.
|
270
|
+
*
|
271
|
+
* Deprecated. Will be removed in the next major version. Use get_quantile() instead.
|
285
272
|
*/
|
286
|
-
|
287
|
-
std::vector<T, Allocator> get_quantiles(const double* fractions, uint32_t size) const;
|
273
|
+
std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
|
288
274
|
|
289
275
|
/**
|
290
276
|
* This is a multiple-query version of get_quantile() that allows the caller to
|
291
|
-
* specify the number of evenly-spaced
|
277
|
+
* specify the number of evenly-spaced normalized ranks.
|
278
|
+
*
|
279
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
292
280
|
*
|
293
|
-
*
|
281
|
+
* @param num an integer that specifies the number of evenly-spaced ranks.
|
282
|
+
* This must be an integer greater than 0. A value of 1 is equivalent to get_quantiles([0]).
|
283
|
+
* A value of 2 is equivalent to get_quantiles([0, 1]). A value of 3 is equivalent to
|
284
|
+
* get_quantiles([0, 0.5, 1]), etc.
|
294
285
|
*
|
295
|
-
* @
|
296
|
-
* This must be an integer greater than 0. A value of 1 will return the min value.
|
297
|
-
* A value of 2 will return the min and the max value. A value of 3 will return the min,
|
298
|
-
* the median and the max value, etc.
|
286
|
+
* @return array of approximations to items associated with the given number of evenly-spaced normalized ranks.
|
299
287
|
*
|
300
|
-
*
|
288
|
+
* Deprecated. Will be removed in the next major version. Use get_quantile() instead.
|
301
289
|
*/
|
302
|
-
|
303
|
-
std::vector<T, Allocator> get_quantiles(uint32_t num) const;
|
290
|
+
std::vector<T, Allocator> get_quantiles(uint32_t num, bool inclusive = true) const;
|
304
291
|
|
305
292
|
/**
|
306
|
-
* Returns an approximation to the normalized
|
307
|
-
* inclusive. When template parameter <em>inclusive=false</em> (the default), only elements strictly
|
308
|
-
* less than the provided value are included in the rank estimate. With <em>inclusive=true</em>,
|
309
|
-
* the rank estimate includes elements less than or equal to the provided value.
|
293
|
+
* Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
|
310
294
|
*
|
311
295
|
* <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
|
312
296
|
* get_normalized_rank_error(false) function.
|
313
297
|
*
|
314
|
-
* <p>If the sketch is empty this
|
298
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
315
299
|
*
|
316
|
-
* @param
|
317
|
-
* @
|
300
|
+
* @param item to be ranked
|
301
|
+
* @param inclusive if true the weight of the given item is included into the rank.
|
302
|
+
* Otherwise the rank equals the sum of the weights of all items that are less than the given item
|
303
|
+
* according to the Comparator.
|
304
|
+
* @return an approximate normalized rank of the given item
|
318
305
|
*/
|
319
|
-
|
320
|
-
double get_rank(const T& value) const;
|
306
|
+
double get_rank(const T& item, bool inclusive = true) const;
|
321
307
|
|
322
308
|
/**
|
323
309
|
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
|
324
|
-
* given a set of split points (
|
310
|
+
* given a set of split points (items).
|
325
311
|
*
|
326
312
|
* <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
|
327
313
|
* get_normalized_rank_error(true) function.
|
328
314
|
*
|
329
|
-
* <p>If the sketch is empty this
|
315
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
330
316
|
*
|
331
|
-
* @param split_points an array of <i>m</i> unique, monotonically increasing
|
332
|
-
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
|
333
|
-
*
|
334
|
-
*
|
335
|
-
*
|
336
|
-
*
|
337
|
-
* the
|
338
|
-
*
|
317
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing items
|
318
|
+
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
|
319
|
+
*
|
320
|
+
* @param size of the array of split points.
|
321
|
+
*
|
322
|
+
* @param inclusive if true the rank of an item includes its own weight, and therefore
|
323
|
+
* if the sketch contains items equal to a split point, then in PMF such items are
|
324
|
+
* included into the interval to the left of split point. Otherwise they are included into the interval
|
325
|
+
* to the right of split point.
|
339
326
|
*
|
340
327
|
* @return an array of m+1 doubles each of which is an approximation
|
341
|
-
* to the fraction of the input stream
|
342
|
-
* When <em>inclusive=false</em> (the default), the definition of an "interval" is inclusive
|
343
|
-
* of the left split point and exclusive of the right split point, with the exception that the last
|
344
|
-
* interval will include the maximum value. When <em>inclusive=true</em>,
|
345
|
-
* an "interval" is exclusive of the left split point and inclusive of the right.
|
328
|
+
* to the fraction of the input stream items (the mass) that fall into one of those intervals.
|
346
329
|
*/
|
347
|
-
|
348
|
-
vector_double get_PMF(const T* split_points, uint32_t size) const;
|
330
|
+
using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
|
331
|
+
vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
|
349
332
|
|
350
333
|
/**
|
351
334
|
* Returns an approximation to the Cumulative Distribution Function (CDF), which is the
|
352
|
-
* cumulative analog of the PMF, of the input stream given a set of split points (
|
335
|
+
* cumulative analog of the PMF, of the input stream given a set of split points (items).
|
353
336
|
*
|
354
337
|
* <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
|
355
338
|
* get_normalized_rank_error(false) function.
|
356
339
|
*
|
357
|
-
* <p>If the sketch is empty this
|
340
|
+
* <p>If the sketch is empty this throws std::runtime_error.
|
358
341
|
*
|
359
|
-
* @param split_points an array of <i>m</i> unique, monotonically increasing
|
342
|
+
* @param split_points an array of <i>m</i> unique, monotonically increasing items
|
360
343
|
* that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
|
361
|
-
*
|
362
|
-
*
|
363
|
-
*
|
364
|
-
*
|
365
|
-
* the
|
366
|
-
*
|
344
|
+
*
|
345
|
+
* @param size of the array of split points.
|
346
|
+
*
|
347
|
+
* @param inclusive if true the rank of an item includes its own weight, and therefore
|
348
|
+
* if the sketch contains items equal to a split point, then in CDF such items are
|
349
|
+
* included into the interval to the left of split point. Otherwise they are included into
|
350
|
+
* the interval to the right of split point.
|
367
351
|
*
|
368
352
|
* @return an array of m+1 double values, which are a consecutive approximation to the CDF
|
369
353
|
* of the input stream given the split_points. The value at array position j of the returned
|
370
354
|
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
|
371
|
-
* array.
|
372
|
-
*
|
373
|
-
* of the left split point and exclusive of the right split point, with the exception that the last
|
374
|
-
* interval will include the maximum value. When <em>inclusive=true</em>,
|
375
|
-
* an "interval" is exclusive of the left split point and inclusive of the right.
|
376
|
-
|
355
|
+
* array. This can be viewed as array of ranks of the given split points plus one more value
|
356
|
+
* that is always 1.
|
377
357
|
*/
|
378
|
-
|
379
|
-
vector_double get_CDF(const T* split_points, uint32_t size) const;
|
358
|
+
vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
|
380
359
|
|
381
360
|
/**
|
382
361
|
* Computes size needed to serialize the current state of the sketch.
|
383
362
|
* This version is for fixed-size arithmetic types (integral and floating point).
|
384
|
-
* @param instance of a SerDe
|
363
|
+
* @param sd instance of a SerDe
|
385
364
|
* @return size in bytes needed to serialize this sketch
|
386
365
|
*/
|
387
366
|
template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
|
388
|
-
size_t get_serialized_size_bytes(const SerDe&
|
367
|
+
size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
|
389
368
|
|
390
369
|
/**
|
391
370
|
* Computes size needed to serialize the current state of the sketch.
|
392
371
|
* This version is for all other types and can be expensive since every item needs to be looked at.
|
393
|
-
* @param instance of a SerDe
|
372
|
+
* @param sd instance of a SerDe
|
394
373
|
* @return size in bytes needed to serialize this sketch
|
395
374
|
*/
|
396
375
|
template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
|
397
|
-
size_t get_serialized_size_bytes(const SerDe&
|
376
|
+
size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
|
398
377
|
|
399
378
|
/**
|
400
379
|
* This method serializes the sketch into a given stream in a binary form
|
401
380
|
* @param os output stream
|
402
|
-
* @param instance of a SerDe
|
381
|
+
* @param sd instance of a SerDe
|
403
382
|
*/
|
404
383
|
template<typename SerDe = serde<T>>
|
405
|
-
void serialize(std::ostream& os, const SerDe&
|
384
|
+
void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
|
406
385
|
|
407
386
|
// This is a convenience alias for users
|
408
387
|
// The type returned by the following serialize method
|
@@ -414,32 +393,36 @@ public:
|
|
414
393
|
* It is a blank space of a given size.
|
415
394
|
* This header is used in Datasketches PostgreSQL extension.
|
416
395
|
* @param header_size_bytes space to reserve in front of the sketch
|
417
|
-
* @param instance of a SerDe
|
396
|
+
* @param sd instance of a SerDe
|
418
397
|
* @return serialized sketch as a vector of bytes
|
419
398
|
*/
|
420
399
|
template<typename SerDe = serde<T>>
|
421
|
-
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe&
|
400
|
+
vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
|
422
401
|
|
423
402
|
/**
|
424
403
|
* This method deserializes a sketch from a given stream.
|
425
404
|
* @param is input stream
|
426
|
-
* @param instance of a SerDe
|
427
|
-
* @param instance of
|
405
|
+
* @param sd instance of a SerDe
|
406
|
+
* @param comparator instance of a Comparator
|
407
|
+
* @param allocator instance of an Allocator
|
428
408
|
* @return an instance of a sketch
|
429
409
|
*/
|
430
410
|
template<typename SerDe = serde<T>>
|
431
|
-
static quantiles_sketch deserialize(std::istream& is, const SerDe&
|
411
|
+
static quantiles_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
|
412
|
+
const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
432
413
|
|
433
414
|
/**
|
434
415
|
* This method deserializes a sketch from a given array of bytes.
|
435
416
|
* @param bytes pointer to the array of bytes
|
436
417
|
* @param size the size of the array
|
437
|
-
* @param instance of a SerDe
|
438
|
-
* @param instance of
|
418
|
+
* @param sd instance of a SerDe
|
419
|
+
* @param comparator instance of a Comparator
|
420
|
+
* @param allocator instance of an Allocator
|
439
421
|
* @return an instance of a sketch
|
440
422
|
*/
|
441
423
|
template<typename SerDe = serde<T>>
|
442
|
-
static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe&
|
424
|
+
static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
|
425
|
+
const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
443
426
|
|
444
427
|
/**
|
445
428
|
* Gets the normalized rank error for this sketch. Constants were derived as the best fit to 99 percentile
|
@@ -471,8 +454,7 @@ public:
|
|
471
454
|
const_iterator begin() const;
|
472
455
|
const_iterator end() const;
|
473
456
|
|
474
|
-
|
475
|
-
quantile_sketch_sorted_view<T, Comparator, Allocator> get_sorted_view(bool cumulative) const;
|
457
|
+
quantiles_sorted_view<T, Comparator, Allocator> get_sorted_view() const;
|
476
458
|
|
477
459
|
private:
|
478
460
|
using Level = std::vector<T, Allocator>;
|
@@ -487,7 +469,7 @@ private:
|
|
487
469
|
* || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
|
488
470
|
* 1 ||---------------------------Items Seen Count (N)--------------------------------|
|
489
471
|
*
|
490
|
-
* Long 3 is the start of data, beginning with serialized min and max
|
472
|
+
* Long 3 is the start of data, beginning with serialized min and max item, followed by
|
491
473
|
* the sketch data buffers.
|
492
474
|
*/
|
493
475
|
|
@@ -503,23 +485,28 @@ private:
|
|
503
485
|
static const uint8_t PREAMBLE_LONGS_FULL = 2;
|
504
486
|
static const size_t DATA_START = 16;
|
505
487
|
|
488
|
+
Comparator comparator_;
|
506
489
|
Allocator allocator_;
|
490
|
+
bool is_base_buffer_sorted_;
|
507
491
|
uint16_t k_;
|
508
492
|
uint64_t n_;
|
509
493
|
uint64_t bit_pattern_;
|
510
494
|
Level base_buffer_;
|
511
495
|
VectorLevels levels_;
|
512
|
-
T*
|
513
|
-
T*
|
514
|
-
|
496
|
+
T* min_item_;
|
497
|
+
T* max_item_;
|
498
|
+
mutable quantiles_sorted_view<T, Comparator, Allocator>* sorted_view_;
|
499
|
+
|
500
|
+
void setup_sorted_view() const; // modifies mutable state
|
501
|
+
void reset_sorted_view();
|
515
502
|
|
516
503
|
// for deserialization
|
517
504
|
class item_deleter;
|
518
505
|
class items_deleter;
|
519
506
|
quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
|
520
507
|
Level&& base_buffer, VectorLevels&& levels,
|
521
|
-
std::unique_ptr<T, item_deleter>
|
522
|
-
bool is_sorted, const Allocator& allocator = Allocator());
|
508
|
+
std::unique_ptr<T, item_deleter> min_item, std::unique_ptr<T, item_deleter> max_item,
|
509
|
+
bool is_sorted, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
|
523
510
|
|
524
511
|
void grow_base_buffer();
|
525
512
|
void process_full_base_buffer();
|
@@ -533,7 +520,7 @@ private:
|
|
533
520
|
Level& buf_size_2k, bool apply_as_update,
|
534
521
|
quantiles_sketch& sketch);
|
535
522
|
static void zip_buffer(Level& buf_in, Level& buf_out);
|
536
|
-
static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out);
|
523
|
+
static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out, const Comparator& comparator);
|
537
524
|
|
538
525
|
template<typename SerDe>
|
539
526
|
static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
|
@@ -549,7 +536,7 @@ private:
|
|
549
536
|
static uint32_t compute_retained_items(uint16_t k, uint64_t n);
|
550
537
|
static uint32_t compute_base_buffer_items(uint16_t k, uint64_t n);
|
551
538
|
static uint64_t compute_bit_pattern(uint16_t k, uint64_t n);
|
552
|
-
static uint32_t
|
539
|
+
static uint32_t count_valid_levels(uint64_t bit_pattern);
|
553
540
|
static uint8_t compute_levels_needed(uint16_t k, uint64_t n);
|
554
541
|
|
555
542
|
/**
|
@@ -580,60 +567,28 @@ private:
|
|
580
567
|
*/
|
581
568
|
static uint8_t lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit);
|
582
569
|
|
583
|
-
// implementations for floating point types
|
584
570
|
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
|
585
|
-
static
|
586
|
-
|
587
|
-
return value;
|
571
|
+
static inline bool check_update_item(TT item) {
|
572
|
+
return !std::isnan(item);
|
588
573
|
}
|
589
574
|
|
590
|
-
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
|
591
|
-
static inline bool check_update_value(TT value) {
|
592
|
-
return !std::isnan(value);
|
593
|
-
}
|
594
|
-
|
595
|
-
template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
|
596
|
-
static inline void check_split_points(const T* values, uint32_t size) {
|
597
|
-
for (uint32_t i = 0; i < size ; i++) {
|
598
|
-
if (std::isnan(values[i])) {
|
599
|
-
throw std::invalid_argument("Values must not be NaN");
|
600
|
-
}
|
601
|
-
if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
|
602
|
-
throw std::invalid_argument("Values must be unique and monotonically increasing");
|
603
|
-
}
|
604
|
-
}
|
605
|
-
}
|
606
|
-
|
607
|
-
// implementations for all other types
|
608
575
|
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
|
609
|
-
static
|
610
|
-
throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values");
|
611
|
-
}
|
612
|
-
|
613
|
-
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
|
614
|
-
static inline bool check_update_value(TT) {
|
576
|
+
static inline bool check_update_item(TT) {
|
615
577
|
return true;
|
616
578
|
}
|
617
|
-
|
618
|
-
template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
|
619
|
-
static inline void check_split_points(const T* values, uint32_t size) {
|
620
|
-
for (uint32_t i = 0; i < size ; i++) {
|
621
|
-
if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
|
622
|
-
throw std::invalid_argument("Values must be unique and monotonically increasing");
|
623
|
-
}
|
624
|
-
}
|
625
|
-
}
|
626
579
|
};
|
627
580
|
|
628
581
|
|
629
582
|
template<typename T, typename C, typename A>
|
630
583
|
class quantiles_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
|
631
584
|
public:
|
585
|
+
using value_type = std::pair<const T&, const uint64_t>;
|
632
586
|
const_iterator& operator++();
|
633
587
|
const_iterator& operator++(int);
|
634
588
|
bool operator==(const const_iterator& other) const;
|
635
589
|
bool operator!=(const const_iterator& other) const;
|
636
|
-
|
590
|
+
const value_type operator*() const;
|
591
|
+
const return_value_holder<value_type> operator->() const;
|
637
592
|
private:
|
638
593
|
friend class quantiles_sketch<T, C, A>;
|
639
594
|
using Level = std::vector<T, A>;
|