datasketches 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -0,0 +1,641 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #ifndef _QUANTILES_SKETCH_HPP_
21
+ #define _QUANTILES_SKETCH_HPP_
22
+
23
+ #include <functional>
24
+ #include <memory>
25
+ #include <vector>
26
+
27
+ #include "quantile_sketch_sorted_view.hpp"
28
+ #include "common_defs.hpp"
29
+ #include "serde.hpp"
30
+
31
+ namespace datasketches {
32
+
33
+ /**
34
+ * This is a stochastic streaming sketch that enables near-real time analysis of the
35
+ * approximate distribution of real values from a very large stream in a single pass.
36
+ * The analysis is obtained using a getQuantiles(*) function or its inverse functions the
37
+ * Probability Mass Function from getPMF(*) and the Cumulative Distribution Function from getCDF(*).
38
+ *
39
+ * <p>Consider a large stream of one million values such as packet sizes coming into a network node.
40
+ * The absolute rank of any specific size value is simply its index in the hypothetical sorted
41
+ * array of values.
42
+ * The normalized rank (or fractional rank) is the absolute rank divided by the stream size,
43
+ * in this case one million.
44
+ * The value corresponding to the normalized rank of 0.5 represents the 50th percentile or median
45
+ * value of the distribution, or getQuantile(0.5). Similarly, the 95th percentile is obtained from
46
+ * getQuantile(0.95). Calling getQuantiles(0.0, 1.0) will return the min and max values seen by
47
+ * the sketch.</p>
48
+ *
49
+ * <p>From the min and max values, for example, 1 and 1000 bytes,
50
+ * you can obtain the PMF from getPMF(100, 500, 900) that will result in an array of
51
+ * 4 fractional values such as {.4, .3, .2, .1}, which means that
52
+ * <ul>
53
+ * <li>40% of the values were &lt; 100,</li>
54
+ * <li>30% of the values were &ge; 100 and &lt; 500,</li>
55
+ * <li>20% of the values were &ge; 500 and &lt; 900, and</li>
56
+ * <li>10% of the values were &ge; 900.</li>
57
+ * </ul>
58
+ * A frequency histogram can be obtained by simply multiplying these fractions by getN(),
59
+ * which is the total count of values received.
60
+ * The getCDF(*) works similarly, but produces the cumulative distribution instead.
61
+ *
62
+ * <p>As of November 2021, this implementation produces serialized sketches which are binary-compatible
63
+ * with the equivalent Java implementation only when template parameter T = double
64
+ * (64-bit double precision values).
65
+
66
+ *
67
+ * <p>The accuracy of this sketch is a function of the configured value <i>k</i>, which also affects
68
+ * the overall size of the sketch. Accuracy of this quantile sketch is always with respect to
69
+ * the normalized rank. A <i>k</i> of 128 produces a normalized rank error of about 1.7%.
70
+ * For example, the median value returned from getQuantile(0.5) will be between the actual values
71
+ * from the hypothetically sorted array of input values at normalized ranks of 0.483 and 0.517, with
72
+ * a confidence of about 99%.</p>
73
+ *
74
+ * <pre>
75
+ Table Guide for DoublesSketch Size in Bytes and Approximate Error:
76
+ K =&gt; | 16 32 64 128 256 512 1,024
77
+ ~ Error =&gt; | 12.145% 6.359% 3.317% 1.725% 0.894% 0.463% 0.239%
78
+ N | Size in Bytes -&gt;
79
+ ------------------------------------------------------------------------
80
+ 0 | 8 8 8 8 8 8 8
81
+ 1 | 72 72 72 72 72 72 72
82
+ 3 | 72 72 72 72 72 72 72
83
+ 7 | 104 104 104 104 104 104 104
84
+ 15 | 168 168 168 168 168 168 168
85
+ 31 | 296 296 296 296 296 296 296
86
+ 63 | 424 552 552 552 552 552 552
87
+ 127 | 552 808 1,064 1,064 1,064 1,064 1,064
88
+ 255 | 680 1,064 1,576 2,088 2,088 2,088 2,088
89
+ 511 | 808 1,320 2,088 3,112 4,136 4,136 4,136
90
+ 1,023 | 936 1,576 2,600 4,136 6,184 8,232 8,232
91
+ 2,047 | 1,064 1,832 3,112 5,160 8,232 12,328 16,424
92
+ 4,095 | 1,192 2,088 3,624 6,184 10,280 16,424 24,616
93
+ 8,191 | 1,320 2,344 4,136 7,208 12,328 20,520 32,808
94
+ 16,383 | 1,448 2,600 4,648 8,232 14,376 24,616 41,000
95
+ 32,767 | 1,576 2,856 5,160 9,256 16,424 28,712 49,192
96
+ 65,535 | 1,704 3,112 5,672 10,280 18,472 32,808 57,384
97
+ 131,071 | 1,832 3,368 6,184 11,304 20,520 36,904 65,576
98
+ 262,143 | 1,960 3,624 6,696 12,328 22,568 41,000 73,768
99
+ 524,287 | 2,088 3,880 7,208 13,352 24,616 45,096 81,960
100
+ 1,048,575 | 2,216 4,136 7,720 14,376 26,664 49,192 90,152
101
+ 2,097,151 | 2,344 4,392 8,232 15,400 28,712 53,288 98,344
102
+ 4,194,303 | 2,472 4,648 8,744 16,424 30,760 57,384 106,536
103
+ 8,388,607 | 2,600 4,904 9,256 17,448 32,808 61,480 114,728
104
+ 16,777,215 | 2,728 5,160 9,768 18,472 34,856 65,576 122,920
105
+ 33,554,431 | 2,856 5,416 10,280 19,496 36,904 69,672 131,112
106
+ 67,108,863 | 2,984 5,672 10,792 20,520 38,952 73,768 139,304
107
+ 134,217,727 | 3,112 5,928 11,304 21,544 41,000 77,864 147,496
108
+ 268,435,455 | 3,240 6,184 11,816 22,568 43,048 81,960 155,688
109
+ 536,870,911 | 3,368 6,440 12,328 23,592 45,096 86,056 163,880
110
+ 1,073,741,823 | 3,496 6,696 12,840 24,616 47,144 90,152 172,072
111
+ 2,147,483,647 | 3,624 6,952 13,352 25,640 49,192 94,248 180,264
112
+ 4,294,967,295 | 3,752 7,208 13,864 26,664 51,240 98,344 188,456
113
+
114
+ * </pre>
115
+
116
+ * <p>There is more documentation available on
117
+ * <a href="https://datasketches.apache.org">datasketches.apache.org</a>.</p>
118
+ *
119
+ * <p>This is an implementation of the Low Discrepancy Mergeable Quantiles Sketch
120
+ * described in section 3.2 of the journal version of the paper "Mergeable Summaries"
121
+ * by Agarwal, Cormode, Huang, Phillips, Wei, and Yi.
122
+ * <a href="http://dblp.org/rec/html/journals/tods/AgarwalCHPWY13"></a></p>
123
+ *
124
+ * <p>This algorithm is independent of the distribution of values and
125
+ * requires only that the values be comparable.</p>
126
+ *
127
+ * <p>This algorithm intentionally inserts randomness into the sampling process for values that
128
+ * ultimately get retained in the sketch. The results produced by this algorithm are not
129
+ * deterministic. For example, if the same stream is inserted into two different instances of this
130
+ * sketch, the answers obtained from the two sketches may not be identical.</p>
131
+ *
132
+ * <p>Similarly, there may be directional inconsistencies. For example, the resulting array of
133
+ * values obtained from getQuantiles(fractions[]) input into the reverse directional query
134
+ * getPMF(splitPoints[]) may not result in the original fractional values.</p>
135
+ *
136
+ * @author Kevin Lang
137
+ * @author Lee Rhodes
138
+ * @author Alexander Saydakov
139
+ * @author Jon Malkin
140
+ */
141
+
142
+ namespace quantiles_constants { // named constants for the sketch parameter k
143
+ const uint16_t DEFAULT_K = 128; // default argument for k in the quantiles_sketch constructor
144
+ const uint16_t MIN_K = 2; // NOTE(review): presumably the lower bound enforced by check_k() - confirm in the impl file
145
+ const uint16_t MAX_K = 1 << 15; // 32768; NOTE(review): presumably the upper bound enforced by check_k() - confirm in the impl file
146
+ }
147
+
148
+ template <typename T,
149
+ typename Comparator = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
150
+ typename Allocator = std::allocator<T>>
151
+ class quantiles_sketch {
152
+ public:
153
+ using value_type = T;
154
+ using comparator = Comparator;
155
+ using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
156
+
157
+ explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K, const Allocator& allocator = Allocator());
158
+ quantiles_sketch(const quantiles_sketch& other);
159
+ quantiles_sketch(quantiles_sketch&& other) noexcept;
160
+ ~quantiles_sketch();
161
+ quantiles_sketch& operator=(const quantiles_sketch& other);
162
+ quantiles_sketch& operator=(quantiles_sketch&& other) noexcept;
163
+
164
+ /**
165
+ * Updates this sketch with the given data item.
166
+ * @param value an item from a stream of items
167
+ */
168
+ template<typename FwdT>
169
+ void update(FwdT&& value);
170
+
171
+ /**
172
+ * Merges another sketch into this one.
173
+ * @param other sketch to merge into this one
174
+ */
175
+ template<typename FwdSk>
176
+ void merge(FwdSk&& other);
177
+
178
+ /**
179
+ * Returns true if this sketch is empty.
180
+ * @return empty flag
181
+ */
182
+ bool is_empty() const;
183
+
184
+ /**
185
+ * Returns configured parameter k
186
+ * @return parameter k
187
+ */
188
+ uint16_t get_k() const;
189
+
190
+ /**
191
+ * Returns the length of the input stream.
192
+ * @return stream length
193
+ */
194
+ uint64_t get_n() const;
195
+
196
+ /**
197
+ * Returns the number of retained items (samples) in the sketch.
198
+ * @return the number of retained items
199
+ */
200
+ uint32_t get_num_retained() const;
201
+
202
+ /**
203
+ * Returns true if this sketch is in estimation mode.
204
+ * @return estimation mode flag
205
+ */
206
+ bool is_estimation_mode() const;
207
+
208
+ /**
209
+ * Returns the min value of the stream.
210
+ * For floating point types: if the sketch is empty this returns NaN.
211
+ * For other types: if the sketch is empty this throws runtime_error.
212
+ * @return the min value of the stream
213
+ */
214
+ const T& get_min_value() const;
215
+
216
+ /**
217
+ * Returns the max value of the stream.
218
+ * For floating point types: if the sketch is empty this returns NaN.
219
+ * For other types: if the sketch is empty this throws runtime_error.
220
+ * @return the max value of the stream
221
+ */
222
+ const T& get_max_value() const;
223
+
224
+ /**
225
+ * Returns an instance of the comparator for this sketch.
226
+ * @return comparator
227
+ */
228
+ Comparator get_comparator() const;
229
+
230
+ /**
231
+ * Returns an approximation to the value of the data item
232
+ * that would be preceded by the given fraction of a hypothetical sorted
233
+ * version of the input stream so far.
234
+ * <p>
235
+ * Note that this method has a fairly large overhead (microseconds instead of nanoseconds)
236
+ * so it should not be called multiple times to get different quantiles from the same
237
+ * sketch. Instead use get_quantiles(), which pays the overhead only once.
238
+ * <p>
239
+ * For floating point types: if the sketch is empty this returns NaN.
240
+ * For other types: if the sketch is empty this throws runtime_error.
241
+ *
242
+ * @param rank the specified fractional position in the hypothetical sorted stream.
243
+ * These are also called normalized ranks or fractional ranks.
244
+ * If rank = 0.0, the true minimum value of the stream is returned.
245
+ * If rank = 1.0, the true maximum value of the stream is returned.
246
+ *
247
+ * @return the approximation to the value at the given rank
248
+ */
249
+ using quantile_return_type = typename quantile_sketch_sorted_view<T, Comparator, Allocator>::quantile_return_type;
250
+ template<bool inclusive = false>
251
+ quantile_return_type get_quantile(double rank) const;
252
+
253
+ /**
254
+ * This is a more efficient multiple-query version of get_quantile().
255
+ * <p>
256
+ * This returns an array that could have been generated by using get_quantile() for each
257
+ * fractional rank separately, but would be very inefficient.
258
+ * This method incurs the internal set-up overhead once and obtains multiple quantile values in
259
+ * a single query. It is strongly recommended that this method be used instead of multiple calls
260
+ * to get_quantile().
261
+ *
262
+ * <p>If the sketch is empty this returns an empty vector.
263
+ *
264
+ * @param fractions given array of fractional positions in the hypothetical sorted stream.
265
+ * These are also called normalized ranks or fractional ranks.
266
+ * These fractions must be in the interval [0.0, 1.0], inclusive.
267
+ *
268
+ * @return array of approximations to the given fractions in the same order as given fractions
269
+ * in the input array.
270
+ */
271
+ template<bool inclusive = false>
272
+ std::vector<T, Allocator> get_quantiles(const double* fractions, uint32_t size) const;
273
+
274
+ /**
275
+ * This is a multiple-query version of get_quantile() that allows the caller to
276
+ * specify the number of evenly-spaced fractional ranks.
277
+ *
278
+ * <p>If the sketch is empty this returns an empty vector.
279
+ *
280
+ * @param num an integer that specifies the number of evenly-spaced fractional ranks.
281
+ * This must be an integer greater than 0. A value of 1 will return the min value.
282
+ * A value of 2 will return the min and the max value. A value of 3 will return the min,
283
+ * the median and the max value, etc.
284
+ *
285
+ * @return array of approximations to the given number of evenly-spaced fractional ranks.
286
+ */
287
+ template<bool inclusive = false>
288
+ std::vector<T, Allocator> get_quantiles(uint32_t num) const;
289
+
290
+ /**
291
+ * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
292
+ * inclusive. When template parameter <em>inclusive=false</em> (the default), only elements strictly
293
+ * less than the provided value are included in the rank estimate. With <em>inclusive=true</em>,
294
+ * the rank estimate includes elements less than or equal to the provided value.
295
+ *
296
+ * <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
297
+ * get_normalized_rank_error(false) function.
298
+ *
299
+ * <p>If the sketch is empty this returns NaN.
300
+ *
301
+ * @param value to be ranked
302
+ * @return an approximate rank of the given value
303
+ */
304
+ template<bool inclusive = false>
305
+ double get_rank(const T& value) const;
306
+
307
+ /**
308
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
309
+ * given a set of split points (values).
310
+ *
311
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
312
+ * get_normalized_rank_error(true) function.
313
+ *
314
+ * <p>If the sketch is empty this returns an empty vector.
315
+ *
316
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
317
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
318
+ * If the template parameter <em>inclusive=false</em> (the default), the definition of an "interval"
319
+ * is inclusive of the left split point and exclusive of the right
320
+ * split point, with the exception that the last interval will include the maximum value.
321
+ * If the template parameter <em>inclusive=true</em>, the definition of an "interval" is exclusive of
322
+ * the left split point and inclusive of the right split point.
323
+ * It is not necessary to include either the min or max values in these split points.
324
+ *
325
+ * @return an array of m+1 doubles each of which is an approximation
326
+ * to the fraction of the input stream values (the mass) that fall into one of those intervals.
327
+ * When <em>inclusive=false</em> (the default), the definition of an "interval" is inclusive
328
+ * of the left split point and exclusive of the right split point, with the exception that the last
329
+ * interval will include the maximum value. When <em>inclusive=true</em>,
330
+ * an "interval" is exclusive of the left split point and inclusive of the right.
331
+ */
332
+ template<bool inclusive = false>
333
+ vector_double get_PMF(const T* split_points, uint32_t size) const;
334
+
335
+ /**
336
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
337
+ * cumulative analog of the PMF, of the input stream given a set of split points (values).
338
+ *
339
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
340
+ * get_normalized_rank_error(false) function.
341
+ *
342
+ * <p>If the sketch is empty this returns an empty vector.
343
+ *
344
+ * @param split_points an array of <i>m</i> unique, monotonically increasing values
345
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
346
+ * If the template parameter <em>inclusive=false</em> (the default), the definition of an "interval" is
347
+ * inclusive of the left split point and exclusive of the right
348
+ * split point, with the exception that the last interval will include the maximum value.
349
+ * If the template parameter <em>inclusive=true</em>, the definition of an "interval" is exclusive of
350
+ * the left split point and inclusive of the right split point.
351
+ * It is not necessary to include either the min or max values in these split points.
352
+ *
353
+ * @return an array of m+1 double values, which are a consecutive approximation to the CDF
354
+ * of the input stream given the split_points. The value at array position j of the returned
355
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
356
+ * array.
357
+ * When <em>inclusive=false</em> (the default), the definition of an "interval" is inclusive
358
+ * of the left split point and exclusive of the right split point, with the exception that the last
359
+ * interval will include the maximum value. When <em>inclusive=true</em>,
360
+ * an "interval" is exclusive of the left split point and inclusive of the right.
361
+
362
+ */
363
+ template<bool inclusive = false>
364
+ vector_double get_CDF(const T* split_points, uint32_t size) const;
365
+
366
+ /**
367
+ * Computes size needed to serialize the current state of the sketch.
368
+ * This version is for fixed-size arithmetic types (integral and floating point).
369
+ * @param serde instance of a SerDe
370
+ * @return size in bytes needed to serialize this sketch
371
+ */
372
+ template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
373
+ size_t get_serialized_size_bytes(const SerDe& serde = SerDe()) const;
374
+
375
+ /**
376
+ * Computes size needed to serialize the current state of the sketch.
377
+ * This version is for all other types and can be expensive since every item needs to be looked at.
378
+ * @param serde instance of a SerDe
379
+ * @return size in bytes needed to serialize this sketch
380
+ */
381
+ template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
382
+ size_t get_serialized_size_bytes(const SerDe& serde = SerDe()) const;
383
+
384
+ /**
385
+ * This method serializes the sketch into a given stream in a binary form
386
+ * @param os output stream
387
+ * @param serde instance of a SerDe
388
+ */
389
+ template<typename SerDe = serde<T>>
390
+ void serialize(std::ostream& os, const SerDe& serde = SerDe()) const;
391
+
392
+ // This is a convenience alias for users
393
+ // The type returned by the following serialize method
394
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>>;
395
+
396
+ /**
397
+ * This method serializes the sketch as a vector of bytes.
398
+ * An optional header can be reserved in front of the sketch.
399
+ * It is a blank space of a given size.
400
+ * This header is used in Datasketches PostgreSQL extension.
401
+ * @param header_size_bytes space to reserve in front of the sketch
402
+ * @param serde instance of a SerDe
403
+ * @return serialized sketch as a vector of bytes
404
+ */
405
+ template<typename SerDe = serde<T>>
406
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& serde = SerDe()) const;
407
+
408
+ /**
409
+ * This method deserializes a sketch from a given stream.
410
+ * @param is input stream
411
+ * @param serde instance of a SerDe
412
+ * @param allocator instance of an Allocator
413
+ * @return an instance of a sketch
414
+ */
415
+ template<typename SerDe = serde<T>>
416
+ static quantiles_sketch deserialize(std::istream& is, const SerDe& serde = SerDe(), const Allocator& allocator = Allocator());
417
+
418
+ /**
419
+ * This method deserializes a sketch from a given array of bytes.
420
+ * @param bytes pointer to the array of bytes
421
+ * @param size the size of the array
422
+ * @param serde instance of a SerDe
423
+ * @param allocator instance of an Allocator
424
+ * @return an instance of a sketch
425
+ */
426
+ template<typename SerDe = serde<T>>
427
+ static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& serde = SerDe(), const Allocator& allocator = Allocator());
428
+
429
+ /**
430
+ * Gets the normalized rank error for this sketch. Constants were derived as the best fit to 99 percentile
431
+ * empirically measured max error in thousands of trials.
432
+ * @param is_pmf if true, returns the "double-sided" normalized rank error for the get_PMF() function.
433
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
434
+ * @return the normalized rank error for the sketch
435
+ */
436
+ double get_normalized_rank_error(bool is_pmf) const;
437
+
438
+ /**
439
+ * Gets the normalized rank error given k and pmf. Constants were derived as the best fit to 99 percentile
440
+ * empirically measured max error in thousands of trials.
441
+ * @param k the configuration parameter
442
+ * @param is_pmf if true, returns the "double-sided" normalized rank error for the get_PMF() function.
443
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
444
+ * @return the normalized rank error for the given parameters
445
+ */
446
+ static double get_normalized_rank_error(uint16_t k, bool is_pmf);
447
+
448
+ /**
449
+ * Prints a summary of the sketch.
450
+ * @param print_levels if true include information about levels
451
+ * @param print_items if true include sketch data
452
+ */
453
+ string<Allocator> to_string(bool print_levels = false, bool print_items = false) const;
454
+
455
+ class const_iterator;
456
+ const_iterator begin() const;
457
+ const_iterator end() const;
458
+
459
+ template<bool inclusive = false>
460
+ quantile_sketch_sorted_view<T, Comparator, Allocator> get_sorted_view(bool cumulative) const;
461
+
462
+ private:
463
+ using Level = std::vector<T, Allocator>;
464
+ using VectorLevels = std::vector<Level, typename std::allocator_traits<Allocator>::template rebind_alloc<Level>>;
465
+
466
+ /* Serialized sketch layout:
467
+ * Long || Start Byte Addr:
468
+ * Addr:
469
+ * || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
470
+ * 0 || Preamble_Longs | SerVer | FamID | Flags |----- K ---------|---- unused -----|
471
+ *
472
+ * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
473
+ * 1 ||---------------------------Items Seen Count (N)--------------------------------|
474
+ *
475
+ * Long 2 (byte offset 16) is the start of data, beginning with serialized min and max values, followed by
476
+ * the sketch data buffers.
477
+ */
478
+
479
+ static const size_t EMPTY_SIZE_BYTES = 8;
480
+ static const uint8_t SERIAL_VERSION_1 = 1;
481
+ static const uint8_t SERIAL_VERSION_2 = 2;
482
+ static const uint8_t SERIAL_VERSION = 3;
483
+ static const uint8_t FAMILY = 8;
484
+
485
+ enum flags { RESERVED0, RESERVED1, IS_EMPTY, IS_COMPACT, IS_SORTED };
486
+
487
+ static const uint8_t PREAMBLE_LONGS_SHORT = 1; // for empty
488
+ static const uint8_t PREAMBLE_LONGS_FULL = 2;
489
+ static const size_t DATA_START = 16;
490
+
491
+ Allocator allocator_;
492
+ uint16_t k_;
493
+ uint64_t n_;
494
+ uint64_t bit_pattern_;
495
+ Level base_buffer_;
496
+ VectorLevels levels_;
497
+ T* min_value_;
498
+ T* max_value_;
499
+ bool is_sorted_;
500
+
501
+ // for deserialization
502
+ class item_deleter;
503
+ class items_deleter;
504
+ quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
505
+ Level&& base_buffer, VectorLevels&& levels,
506
+ std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value,
507
+ bool is_sorted, const Allocator& allocator = Allocator());
508
+
509
+ void grow_base_buffer();
510
+ void process_full_base_buffer();
511
+
512
+ // returns true if size adjusted, else false
513
+ bool grow_levels_if_needed();
514
+
515
+ // buffers should be pre-sized to target capacity as appropriate
516
+ template<typename FwdV>
517
+ static void in_place_propagate_carry(uint8_t starting_level, FwdV&& buf_size_k,
518
+ Level& buf_size_2k, bool apply_as_update,
519
+ quantiles_sketch& sketch);
520
+ static void zip_buffer(Level& buf_in, Level& buf_out);
521
+ static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out);
522
+
523
+ template<typename SerDe>
524
+ static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
525
+
526
+ template<typename SerDe>
527
+ static std::pair<Level, size_t> deserialize_array(const void* bytes, size_t size, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
528
+
529
+ static void check_k(uint16_t k);
530
+ static void check_serial_version(uint8_t serial_version);
531
+ static void check_header_validity(uint8_t preamble_longs, uint8_t flags_byte, uint8_t serial_version);
532
+ static void check_family_id(uint8_t family_id);
533
+
534
+ static uint32_t compute_retained_items(uint16_t k, uint64_t n);
535
+ static uint32_t compute_base_buffer_items(uint16_t k, uint64_t n);
536
+ static uint64_t compute_bit_pattern(uint16_t k, uint64_t n);
537
+ static uint32_t compute_valid_levels(uint64_t bit_pattern);
538
+ static uint8_t compute_levels_needed(uint16_t k, uint64_t n);
539
+
540
+ /**
541
+ * Merges the src sketch into the tgt sketch with equal values of K.
542
+ * src is modified only if elements can be moved out of it.
543
+ */
544
+ template<typename FwdSk>
545
+ static void standard_merge(quantiles_sketch& tgt, FwdSk&& src);
546
+
547
+ /**
548
+ * Merges the src sketch into the tgt sketch with a smaller value of K.
549
+ * However, it is required that the ratio of the two K values be a power of 2.
550
+ * I.e., src.get_k() = tgt.get_k() * 2^(nonnegative integer).
551
+ * src is modified only if elements can be moved out of it.
552
+ */
553
+ template<typename FwdSk>
554
+ static void downsampling_merge(quantiles_sketch& tgt, FwdSk&& src);
555
+
556
+ template<typename FwdV>
557
+ static void zip_buffer_with_stride(FwdV&& buf_in, Level& buf_out, uint16_t stride);
558
+
559
+ /**
560
+ * Returns the zero-based bit position of the lowest zero bit of <i>bits</i> starting at
561
+ * <i>starting_bit</i>. If input is all ones, this returns 64.
562
+ * @param bits the input bits as a 64-bit unsigned integer
563
+ * @param starting_bit the zero-based starting bit position. Only the low 6 bits are used.
564
+ * @return the zero-based bit position of the lowest zero bit starting at <i>starting_bit</i>.
565
+ */
566
+ static uint8_t lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit);
567
+
568
+ // implementations for floating point types
569
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0> // overload enabled only when TT is a floating point type
570
+ static const TT& get_invalid_value() { // returns the "invalid" value for floating point item types
571
+ static TT value = std::numeric_limits<TT>::quiet_NaN(); // function-local static, so the returned reference stays valid after the call
572
+ return value; // quiet NaN is the invalid-value marker for floating point types
573
+ }
574
+
575
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0> // overload enabled only when TT is a floating point type
576
+ static inline bool check_update_value(TT value) { // returns true when the value is acceptable, false when it is NaN
577
+ return !std::isnan(value); // presumably callers skip values for which this is false -- confirm in the implementation file
578
+ }
579
+
580
+ template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0> // overload enabled only when TT is a floating point type
581
+ static inline void check_split_points(const T* values, uint32_t size) { // validates `size` split points; throws std::invalid_argument on the first violation
582
+ for (uint32_t i = 0; i < size ; i++) { // body never runs for size == 0, so the unsigned `size - 1` below cannot wrap around
583
+ if (std::isnan(values[i])) { // NaN is rejected before any ordering comparison is attempted
584
+ throw std::invalid_argument("Values must not be NaN");
585
+ }
586
+ if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) { // every element except the last must compare strictly before its successor
587
+ throw std::invalid_argument("Values must be unique and monotonically increasing");
588
+ }
589
+ }
590
+ }
591
+
592
+ // implementations for all other types
593
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0> // overload enabled for every non-floating-point TT
594
+ static const TT& get_invalid_value() { // never returns: this overload unconditionally throws std::runtime_error
595
+ throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values"); // no NaN-like marker is defined for these types
596
+ }
597
+
598
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0> // overload enabled for every non-floating-point TT
599
+ static inline bool check_update_value(TT) { // parameter is unnamed because the value is never inspected
600
+ return true; // every value of a non-floating-point type is accepted
601
+ }
602
+
603
+ template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0> // overload enabled for every non-floating-point TT
604
+ static inline void check_split_points(const T* values, uint32_t size) { // validates ordering of `size` split points; throws std::invalid_argument on the first violation
605
+ for (uint32_t i = 0; i < size ; i++) { // body never runs for size == 0, so the unsigned `size - 1` below cannot wrap around
606
+ if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) { // every element except the last must compare strictly before its successor
607
+ throw std::invalid_argument("Values must be unique and monotonically increasing");
608
+ }
609
+ }
610
+ }
611
+ };
612
+
613
+
614
+ template<typename T, typename C, typename A>
615
+ class quantiles_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> { // input iterator over a sketch; NOTE(review): std::iterator is deprecated since C++17
616
+ public:
617
+ const_iterator& operator++(); // prefix increment
618
+ const_iterator& operator++(int); // postfix increment; NOTE(review): returns a reference rather than the conventional by-value copy -- confirm this is intended
619
+ bool operator==(const const_iterator& other) const;
620
+ bool operator!=(const const_iterator& other) const;
621
+ std::pair<const T&, const uint64_t> operator*() const; // dereferences to a pair; presumably (item, weight) -- confirm in the implementation file
622
+ private:
623
+ friend class quantiles_sketch<T, C, A>; // the enclosing sketch needs access to the private constructor below
624
+ using Level = std::vector<T, A>;
625
+ using AllocLevel = typename std::allocator_traits<A>::template rebind_alloc<Level>; // allocator A rebound to allocate Level objects
626
+ Level base_buffer_; // held by value, not by reference; NOTE(review): implies a copy of the buffer per iterator -- confirm
627
+ std::vector<Level, AllocLevel> levels_; // also held by value
628
+ int level_;
629
+ uint32_t index_;
630
+ uint32_t bb_count_;
631
+ uint64_t bit_pattern_;
632
+ uint64_t weight_;
633
+ uint16_t k_;
634
+ const_iterator(const Level& base_buffer, const std::vector<Level, AllocLevel>& levels, uint16_t k, uint64_t n, bool is_end); // private: only the friend sketch class can construct iterators
635
+ };
636
+
637
+ } /* namespace datasketches */
638
+
639
+ #include "quantiles_sketch_impl.hpp"
640
+
641
+ #endif // _QUANTILES_SKETCH_HPP_