datasketches 0.2.7 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  10. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  11. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  12. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  13. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  14. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  15. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  16. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  17. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  18. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  19. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  20. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  21. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  22. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  24. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  25. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  26. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  27. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  28. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  29. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  30. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  31. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  32. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  34. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  35. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  36. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  37. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  38. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  39. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  40. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  41. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  42. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  43. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  44. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  45. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  46. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  47. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  48. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  49. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  50. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  51. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  52. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  53. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  54. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  55. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  56. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  57. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  58. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  59. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  60. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  61. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  62. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  63. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  64. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  65. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  68. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  69. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  70. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  72. data/vendor/datasketches-cpp/setup.py +14 -2
  73. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  74. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  75. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  76. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  78. data/vendor/datasketches-cpp/tox.ini +26 -0
  79. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  80. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  81. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  82. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  83. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  84. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  85. metadata +14 -5
  86. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -20,14 +20,12 @@
20
20
  #ifndef KLL_SKETCH_HPP_
21
21
  #define KLL_SKETCH_HPP_
22
22
 
23
- #include <functional>
24
23
  #include <memory>
25
24
  #include <vector>
26
- #include <cmath>
27
25
 
28
- #include "quantile_sketch_sorted_view.hpp"
29
26
  #include "common_defs.hpp"
30
27
  #include "serde.hpp"
28
+ #include "quantiles_sorted_view.hpp"
31
29
 
32
30
  namespace datasketches {
33
31
 
@@ -37,9 +35,9 @@ namespace datasketches {
37
35
  * See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
38
36
  *
39
37
  * <p>This is a stochastic streaming sketch that enables near real-time analysis of the
40
- * approximate distribution of values from a very large stream in a single pass, requiring only
41
- * that the values are comparable.
42
- * The analysis is obtained using <i>get_quantile()</i> or <i>get_quantiles()</i> functions or the
38
+ * approximate distribution of items from a very large stream in a single pass, requiring only
39
+ * that the items are comparable.
40
+ * The analysis is obtained using <i>get_quantile()</i> function or the
43
41
  * inverse functions get_rank(), get_PMF() (Probability Mass Function), and get_CDF()
44
42
  * (Cumulative Distribution Function).
45
43
  *
@@ -47,14 +45,15 @@ namespace datasketches {
47
45
  * with the equivalent Java implementation only when template parameter T = float
48
46
  * (32-bit single precision values).
49
47
  *
50
- * <p>Given an input stream of <i>N</i> numeric values, the <i>absolute rank</i> of any specific
51
- * value is defined as its index <i>(0 to N-1)</i> in the hypothetical sorted stream of all
52
- * <i>N</i> input values.
48
+ * <p>Given an input stream of <i>N</i> items, the <i>natural rank</i> of any specific
49
+ * item is defined as its index <i>(1 to N)</i> in inclusive mode
50
+ * or <i>(0 to N-1)</i> in exclusive mode
51
+ * in the hypothetical sorted stream of all <i>N</i> input items.
53
52
  *
54
- * <p>The <i>normalized rank</i> (<i>rank</i>) of any specific value is defined as its
55
- * <i>absolute rank</i> divided by <i>N</i>.
56
- * Thus, the <i>normalized rank</i> is a value between zero and one.
57
- * In the documentation for this sketch <i>absolute rank</i> is never used so any
53
+ * <p>The <i>normalized rank</i> (<i>rank</i>) of any specific item is defined as its
54
+ * <i>natural rank</i> divided by <i>N</i>.
55
+ * Thus, the <i>normalized rank</i> is between zero and one.
56
+ * In the documentation for this sketch <i>natural rank</i> is never used so any
58
57
  * reference to just <i>rank</i> should be interpreted to mean <i>normalized rank</i>.
59
58
  *
60
59
  * <p>This sketch is configured with a parameter <i>k</i>, which affects the size of the sketch
@@ -63,18 +62,18 @@ namespace datasketches {
63
62
  * <p>The estimation error is commonly called <i>epsilon</i> (or <i>eps</i>) and is a fraction
64
63
  * between zero and one. Larger values of <i>k</i> result in smaller values of epsilon.
65
64
  * Epsilon is always with respect to the rank and cannot be applied to the
66
- * corresponding values.
65
+ * corresponding items.
67
66
  *
68
- * <p>The relationship between the normalized rank and the corresponding values can be viewed
67
+ * <p>The relationship between the normalized rank and the corresponding items can be viewed
69
68
  * as a two dimensional monotonic plot with the normalized rank on one axis and the
70
- * corresponding values on the other axis. If the y-axis is specified as the value-axis and
69
+ * corresponding items on the other axis. If the y-axis is specified as the item-axis and
71
70
  * the x-axis as the normalized rank, then <i>y = get_quantile(x)</i> is a monotonically
72
71
  * increasing function.
73
72
  *
74
- * <p>The functions <i>get_quantile(rank)</i> and get_quantiles(...) translate ranks into
75
- * corresponding values. The functions <i>get_rank(value),
73
+ * <p>The function <i>get_quantile(rank)</i> translates ranks into
74
+ * corresponding quantiles. The functions <i>get_rank(item),
76
75
  * get_CDF(...) (Cumulative Distribution Function), and get_PMF(...)
77
- * (Probability Mass Function)</i> perform the opposite operation and translate values into ranks.
76
+ * (Probability Mass Function)</i> perform the opposite operation and translate items into ranks.
78
77
  *
79
78
  * <p>The <i>get_PMF(...)</i> function has about 13 to 47% worse rank error (depending
80
79
  * on <i>k</i>) than the other queries because the mass of each "bin" of the PMF has
@@ -86,60 +85,60 @@ namespace datasketches {
86
85
  *
87
86
  * <p>A <i>get_quantile(rank)</i> query has the following guarantees:
88
87
  * <ul>
89
- * <li>Let <i>v = get_quantile(r)</i> where <i>r</i> is the rank between zero and one.</li>
90
- * <li>The value <i>v</i> will be a value from the input stream.</li>
91
- * <li>Let <i>trueRank</i> be the true rank of <i>v</i> derived from the hypothetical sorted
92
- * stream of all <i>N</i> values.</li>
88
+ * <li>Let <i>q = get_quantile(r)</i> where <i>r</i> is the rank between zero and one.</li>
89
+ * <li>The quantile <i>q</i> will be an item from the input stream.</li>
90
+ * <li>Let <i>trueRank</i> be the true rank of <i>q</i> derived from the hypothetical sorted
91
+ * stream of all <i>N</i> items.</li>
93
92
  * <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
94
93
  * <li>Then <i>r - eps &le; trueRank &le; r + eps</i> with a confidence of 99%. Note that the
95
- * error is on the rank, not the value.</li>
94
+ * error is on the rank, not the quantile.</li>
96
95
  * </ul>
97
96
  *
98
- * <p>A <i>get_rank(value)</i> query has the following guarantees:
97
+ * <p>A <i>get_rank(item)</i> query has the following guarantees:
99
98
  * <ul>
100
- * <li>Let <i>r = get_rank(v)</i> where <i>v</i> is a value between the min and max values of
99
+ * <li>Let <i>r = get_rank(i)</i> where <i>i</i> is an item between the min and max items of
101
100
  * the input stream.</li>
102
- * <li>Let <i>true_rank</i> be the true rank of <i>v</i> derived from the hypothetical sorted
103
- * stream of all <i>N</i> values.</li>
101
+ * <li>Let <i>trueRank</i> be the true rank of <i>i</i> derived from the hypothetical sorted
102
+ * stream of all <i>N</i> items.</li>
104
103
  * <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
105
104
  * <li>Then <i>r - eps &le; trueRank &le; r + eps</i> with a confidence of 99%.</li>
106
105
  * </ul>
107
106
  *
108
107
  * <p>A <i>get_PMF()</i> query has the following guarantees:
109
108
  * <ul>
110
- * <li>Let <i>{r1, r2, ..., r(m+1)} = get_PMF(v1, v2, ..., vm)</i> where <i>v1, v2</i> are values
111
- * between the min and max values of the input stream.
112
- * <li>Let <i>mass<sub>i</sub> = estimated mass between v<sub>i</sub> and v<sub>i+1</sub></i>.</li>
113
- * <li>Let <i>trueMass</i> be the true mass between the values of <i>v<sub>i</sub>,
114
- * v<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> values.</li>
109
+ * <li>Let <i>{r1, r2, ..., r(m+1)} = get_PMF(s1, s2, ..., sm)</i> where <i>s1, s2</i> are
110
+ * split points (items from the input domain) between the min and max items of the input stream.</li>
111
+ * <li>Let <i>mass<sub>i</sub> = estimated mass between s<sub>i</sub> and s<sub>i+1</sub></i>.</li>
112
+ * <li>Let <i>trueMass</i> be the true mass between the items of <i>s<sub>i</sub>,
113
+ * s<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> items.</li>
115
114
  * <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
116
115
  * <li>then <i>mass - eps &le; trueMass &le; mass + eps</i> with a confidence of 99%.</li>
117
- * <li>r(m+1) includes the mass of all points larger than vm.</li>
116
+ * <li>r(m+1) includes the mass of all points larger than sm.</li>
118
117
  * </ul>
119
118
  *
120
119
  * <p>A <i>get_CDF(...)</i> query has the following guarantees:
121
120
  * <ul>
122
- * <li>Let <i>{r1, r2, ..., r(m+1)} = get_CDF(v1, v2, ..., vm)</i> where <i>v1, v2</i> are values
123
- * between the min and max values of the input stream.
121
+ * <li>Let <i>{r1, r2, ..., r(m+1)} = get_CDF(s1, s2, ..., sm)</i> where <i>s1, s2, ...</i> are
122
+ * split points (items from the input domain) between the min and max items of the input stream.</li>
124
123
  * <li>Let <i>mass<sub>i</sub> = r<sub>i+1</sub> - r<sub>i</sub></i>.</li>
125
- * <li>Let <i>trueMass</i> be the true mass between the true ranks of <i>v<sub>i</sub>,
126
- * v<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> values.</li>
124
+ * <li>Let <i>trueMass</i> be the true mass between the true ranks of <i>s<sub>i</sub>,
125
+ * s<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> items.</li>
127
126
  * <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
128
127
  * <li>then <i>mass - eps &le; trueMass &le; mass + eps</i> with a confidence of 99%.</li>
129
- * <li>1 - r(m+1) includes the mass of all points larger than vm.</li>
128
+ * <li>1 - r(m+1) includes the mass of all points larger than sm.</li>
130
129
  * </ul>
131
130
  *
132
131
  * <p>From the above, it might seem like we could make some estimates to bound the
133
- * <em>value</em> returned from a call to <em>get_quantile()</em>. The sketch, however, does not
134
- * let us derive error bounds or confidences around values. Because errors are independent, we
132
+ * <em>item</em> returned from a call to <em>get_quantile()</em>. The sketch, however, does not
133
+ * let us derive error bounds or confidences around items. Because errors are independent, we
135
134
  * can approximately bracket a value as shown below, but there are no error estimates available.
136
135
  * Additionally, the interval may be quite large for certain distributions.
137
136
  * <ul>
138
- * <li>Let <i>v = get_quantile(r)</i>, the estimated quantile value of rank <i>r</i>.</li>
137
+ * <li>Let <i>q = get_quantile(r)</i>, the estimated quantile of rank <i>r</i>.</li>
139
138
  * <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
140
- * <li>Let <i>v<sub>lo</sub></i> = estimated quantile value of rank <i>(r - eps)</i>.</li>
141
- * <li>Let <i>v<sub>hi</sub></i> = estimated quantile value of rank <i>(r + eps)</i>.</li>
142
- * <li>Then <i>v<sub>lo</sub> &le; v &le; v<sub>hi</sub></i>, with 99% confidence.</li>
139
+ * <li>Let <i>q<sub>lo</sub></i> = estimated quantile of rank <i>(r - eps)</i>.</li>
140
+ * <li>Let <i>q<sub>hi</sub></i> = estimated quantile of rank <i>(r + eps)</i>.</li>
141
+ * <li>Then <i>q<sub>lo</sub> &le; q &le; q<sub>hi</sub></i>, with 99% confidence.</li>
143
142
  * </ul>
144
143
  *
145
144
  * author Kevin Lang
@@ -147,13 +146,6 @@ namespace datasketches {
147
146
  * author Lee Rhodes
148
147
  */
149
148
 
150
- template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
151
- template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
152
- template<typename A> using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
153
- template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
154
- template<typename A> using AllocD = typename std::allocator_traits<A>::template rebind_alloc<double>;
155
- template<typename A> using vector_d = std::vector<double, AllocD<A>>;
156
-
157
149
  namespace kll_constants {
158
150
  const uint16_t DEFAULT_K = 200;
159
151
  }
@@ -161,21 +153,19 @@ namespace kll_constants {
161
153
  template <
162
154
  typename T,
163
155
  typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
164
- typename S = serde<T>, // deprecated, to be removed in the next major version
165
156
  typename A = std::allocator<T>
166
157
  >
167
158
  class kll_sketch {
168
159
  public:
169
160
  using value_type = T;
170
161
  using comparator = C;
162
+ using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
171
163
 
172
164
  static const uint8_t DEFAULT_M = 8;
173
- // TODO: Redundant and deprecated. Will be removed in next major version.
174
- static const uint16_t DEFAULT_K = kll_constants::DEFAULT_K;
175
165
  static const uint16_t MIN_K = DEFAULT_M;
176
166
  static const uint16_t MAX_K = (1 << 16) - 1;
177
167
 
178
- explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const A& allocator = A());
168
+ explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const C& comparator = C(), const A& allocator = A());
179
169
  kll_sketch(const kll_sketch& other);
180
170
  kll_sketch(kll_sketch&& other) noexcept;
181
171
  ~kll_sketch();
@@ -185,17 +175,18 @@ class kll_sketch {
185
175
  /*
186
176
  * Type converting constructor.
187
177
  * @param other sketch of a different type
178
+ * @param comparator instance of a Comparator
188
179
  * @param allocator instance of an Allocator
189
180
  */
190
- template<typename TT, typename CC, typename SS, typename AA>
191
- explicit kll_sketch(const kll_sketch<TT, CC, SS, AA>& other, const A& allocator = A());
181
+ template<typename TT, typename CC, typename AA>
182
+ explicit kll_sketch(const kll_sketch<TT, CC, AA>& other, const C& comparator = C(), const A& allocator = A());
192
183
 
193
184
  /**
194
185
  * Updates this sketch with the given data item.
195
- * @param value an item from a stream of items
186
+ * @param item from a stream of items
196
187
  */
197
188
  template<typename FwdT>
198
- void update(FwdT&& value);
189
+ void update(FwdT&& item);
199
190
 
200
191
  /**
201
192
  * Merges another sketch into this one.
@@ -235,20 +226,18 @@ class kll_sketch {
235
226
  bool is_estimation_mode() const;
236
227
 
237
228
  /**
238
- * Returns the min value of the stream.
239
- * For floating point types: if the sketch is empty this returns NaN.
240
- * For other types: if the sketch is empty this throws runtime_error.
241
- * @return the min value of the stream
229
+ * Returns the min item of the stream.
230
+ * If the sketch is empty this throws std::runtime_error.
231
+ * @return the min item of the stream
242
232
  */
243
- T get_min_value() const;
233
+ T get_min_item() const;
244
234
 
245
235
  /**
246
- * Returns the max value of the stream.
247
- * For floating point types: if the sketch is empty this returns NaN.
248
- * For other types: if the sketch is empty this throws runtime_error.
249
- * @return the max value of the stream
236
+ * Returns the max item of the stream.
237
+ * If the sketch is empty this throws std::runtime_error.
238
+ * @return the max item of the stream
250
239
  */
251
- T get_max_value() const;
240
+ T get_max_item() const;
252
241
 
253
242
  /**
254
243
  * Returns an instance of the comparator for this sketch.
@@ -257,134 +246,128 @@ class kll_sketch {
257
246
  C get_comparator() const;
258
247
 
259
248
  /**
260
- * Returns an approximation to the value of the data item
261
- * that would be preceded by the given fraction of a hypothetical sorted
262
- * version of the input stream so far.
263
- * <p>
264
- * Note that this method has a fairly large overhead (microseconds instead of nanoseconds)
265
- * so it should not be called multiple times to get different quantiles from the same
266
- * sketch. Instead use get_quantiles(), which pays the overhead only once.
267
- * <p>
268
- * For floating point types: if the sketch is empty this returns NaN.
269
- * For other types: if the sketch is empty this throws runtime_error.
249
+ * Returns an instance of the allocator for this sketch.
250
+ * @return allocator
251
+ */
252
+ A get_allocator() const;
253
+
254
+ /**
255
+ * Returns an item from the sketch that is the best approximation to an item
256
+ * from the original stream with the given rank.
257
+ *
258
+ * <p>If the sketch is empty this throws std::runtime_error.
270
259
  *
271
- * @param fraction the specified fractional position in the hypothetical sorted stream.
272
- * These are also called normalized ranks or fractional ranks.
273
- * If fraction = 0.0, the true minimum value of the stream is returned.
274
- * If fraction = 1.0, the true maximum value of the stream is returned.
275
- * If the parameter inclusive=true, the given rank is considered inclusive (includes the weight of an item)
260
+ * @param rank of an item in the hypothetical sorted stream.
261
+ * @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
276
262
  *
277
- * @return the approximation to the value at the given fraction
263
+ * @return approximate quantile associated with the given rank
278
264
  */
279
- using quantile_return_type = typename quantile_sketch_sorted_view<T, C, A>::quantile_return_type;
280
- template<bool inclusive = false>
281
- quantile_return_type get_quantile(double fraction) const;
265
+ using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type;
266
+ quantile_return_type get_quantile(double rank, bool inclusive = true) const;
282
267
 
283
268
  /**
284
- * This is a more efficient multiple-query version of get_quantile().
285
- * <p>
286
269
  * This returns an array that could have been generated by using get_quantile() for each
287
- * fractional rank separately, but would be very inefficient.
288
- * This method incurs the internal set-up overhead once and obtains multiple quantile values in
289
- * a single query. It is strongly recommend that this method be used instead of multiple calls
290
- * to get_quantile().
270
+ * rank separately.
271
+ *
272
+ * <p>If the sketch is empty this throws std::runtime_error.
291
273
  *
292
- * <p>If the sketch is empty this returns an empty vector.
274
+ * @param ranks given array of ranks in the hypothetical sorted stream.
275
+ * These ranks must be in the interval [0.0, 1.0].
276
+ * @param size the number of ranks in the array
277
+ * @param inclusive if true, the given ranks are considered inclusive (include weights of items)
293
278
  *
294
- * @param fractions given array of fractional positions in the hypothetical sorted stream.
295
- * These are also called normalized ranks or fractional ranks.
296
- * These fractions must be in the interval [0.0, 1.0], inclusive.
297
- * If the parameter inclusive=true, the given fractions are considered inclusive (include weights of items)
279
+ * @return array of approximate quantiles corresponding to the given ranks in the same order.
298
280
  *
299
- * @return array of approximations to the given fractions in the same order as given fractions
300
- * in the input array.
281
+ * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
301
282
  */
302
- template<bool inclusive = false>
303
- std::vector<T, A> get_quantiles(const double* fractions, uint32_t size) const;
283
+ std::vector<T, A> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
304
284
 
305
285
  /**
306
286
  * This is a multiple-query version of get_quantile() that allows the caller to
307
- * specify the number of evenly-spaced fractional ranks.
287
+ * specify the number of evenly-spaced ranks.
308
288
  *
309
- * <p>If the sketch is empty this returns an empty vector.
289
+ * <p>If the sketch is empty this throws std::runtime_error.
310
290
  *
311
- * @param num an integer that specifies the number of evenly-spaced fractional ranks.
312
- * This must be an integer greater than 0. A value of 1 will return the min value.
313
- * A value of 2 will return the min and the max value. A value of 3 will return the min,
314
- * the median and the max value, etc.
291
+ * @param num an integer that specifies the number of evenly-spaced ranks.
292
+ * This must be an integer greater than 0. A value of 1 will return the quantile of rank 0.
293
+ * A value of 2 will return quantiles of ranks 0 and 1. A value of 3 will return quantiles of ranks 0,
294
+ * 0.5 (median) and 1, etc.
295
+ * @param inclusive if true, the ranks are considered inclusive (include weights of items)
315
296
  *
316
- * @return array of approximations to the given number of evenly-spaced fractional ranks.
297
+ * @return array of approximate quantiles corresponding to the given number of evenly-spaced ranks.
298
+ *
299
+ * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
317
300
  */
318
- template<bool inclusive = false>
319
- std::vector<T, A> get_quantiles(uint32_t num) const;
301
+ std::vector<T, A> get_quantiles(uint32_t num, bool inclusive = true) const;
320
302
 
321
303
  /**
322
- * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
323
- * inclusive.
324
- * With the template parameter inclusive=true the weight of the given value is included into the rank.
325
- * Otherwise the rank equals the sum of the weights of all values that are less than the given value
326
- * according to the comparator C.
304
+ * Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
327
305
  *
328
306
  * <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
329
307
  * get_normalized_rank_error(false) function.
330
308
  *
331
- * <p>If the sketch is empty this returns NaN.
309
+ * <p>If the sketch is empty this throws std::runtime_error.
310
+ *
311
+ * @param item to be ranked.
312
+ * @param inclusive if true the weight of the given item is included into the rank.
313
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
314
+ * according to the comparator C.
332
315
  *
333
- * @param value to be ranked
334
- * @return an approximate rank of the given value
316
+ * @return an approximate rank of the given item
335
317
  */
336
- template<bool inclusive = false>
337
- double get_rank(const T& value) const;
318
+ double get_rank(const T& item, bool inclusive = true) const;
338
319
 
339
320
  /**
340
321
  * Returns an approximation to the Probability Mass Function (PMF) of the input stream
341
- * given a set of split points (values).
322
+ * given a set of split points (items).
342
323
  *
343
324
  * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
344
325
  * get_normalized_rank_error(true) function.
345
326
  *
346
- * <p>If the sketch is empty this returns an empty vector.
327
+ * <p>If the sketch is empty this throws std::runtime_error.
347
328
  *
348
- * @param split_points an array of <i>m</i> unique, monotonically increasing values
349
- * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
350
- * The definition of an "interval" is inclusive of the left split point (or minimum value) and
351
- * exclusive of the right split point, with the exception that the last interval will include
352
- * the maximum value.
353
- * It is not necessary to include either the min or max values in these split points.
329
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
330
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
331
+ *
332
+ * @param size the number of split points in the array
333
+ *
334
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
335
+ * if the sketch contains items equal to a slit point, then in PMF such items are
336
+ * included into the interval to the left of split point. Otherwise they are included into the interval
337
+ * to the right of split point.
354
338
  *
355
339
  * @return an array of m+1 doubles each of which is an approximation
356
- * to the fraction of the input stream values (the mass) that fall into one of those intervals.
357
- * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
358
- * split point, with the exception that the last interval will include the maximum value.
359
- * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
360
- * split point.
340
+ * to the fraction of the input stream items (the mass) that fall into one of those intervals.
361
341
  */
362
- template<bool inclusive = false>
363
- vector_d<A> get_PMF(const T* split_points, uint32_t size) const;
342
+ using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
343
+ vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
364
344
 
365
345
  /**
366
346
  * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
367
- * cumulative analog of the PMF, of the input stream given a set of split points (values).
347
+ * cumulative analog of the PMF, of the input stream given a set of split points (items).
368
348
  *
369
349
  * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
370
350
  * get_normalized_rank_error(false) function.
371
351
  *
372
- * <p>If the sketch is empty this returns an empty vector.
352
+ * <p>If the sketch is empty this throws std::runtime_error.
373
353
  *
374
- * @param split_points an array of <i>m</i> unique, monotonically increasing values
354
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
375
355
  * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
376
- * The definition of an "interval" is inclusive of the left split point (or minimum value) and
377
- * exclusive of the right split point, with the exception that the last interval will include
378
- * the maximum value.
379
- * It is not necessary to include either the min or max values in these split points.
380
356
  *
381
- * @return an array of m+1 double values, which are a consecutive approximation to the CDF
357
+ * @param size the number of split points in the array
358
+ *
359
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
360
+ * if the sketch contains items equal to a split point, then in CDF such items are
361
+ * included into the interval to the left of split point. Otherwise they are included into
362
+ * the interval to the right of split point.
363
+ *
364
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
382
365
  * of the input stream given the split_points. The value at array position j of the returned
383
366
  * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
384
- * array.
367
+ * array. This can be viewed as array of ranks of the given split points plus one more value
368
+ * that is always 1.
385
369
  */
386
- template<bool inclusive = false>
387
- vector_d<A> get_CDF(const T* split_points, uint32_t size) const;
370
+ vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
388
371
 
389
372
  /**
390
373
  * Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
@@ -398,19 +381,19 @@ class kll_sketch {
398
381
  /**
399
382
  * Computes size needed to serialize the current state of the sketch.
400
383
  * This version is for fixed-size arithmetic types (integral and floating point).
401
- * @param serde instance of a SerDe
384
+ * @param sd instance of a SerDe
402
385
  * @return size in bytes needed to serialize this sketch
403
386
  */
404
- template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
387
+ template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
405
388
  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
406
389
 
407
390
  /**
408
391
  * Computes size needed to serialize the current state of the sketch.
409
392
  * This version is for all other types and can be expensive since every item needs to be looked at.
410
- * @param serde instance of a SerDe
393
+ * @param sd instance of a SerDe
411
394
  * @return size in bytes needed to serialize this sketch
412
395
  */
413
- template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
396
+ template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
414
397
  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
415
398
 
416
399
  /**
@@ -443,14 +426,14 @@ class kll_sketch {
443
426
  /**
444
427
  * This method serializes the sketch into a given stream in a binary form
445
428
  * @param os output stream
446
- * @param instance of a SerDe
429
+ * @param sd instance of a SerDe
447
430
  */
448
- template<typename SerDe = S>
431
+ template<typename SerDe = serde<T>>
449
432
  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
450
433
 
451
434
  // This is a convenience alias for users
452
435
  // The type returned by the following serialize method
453
- using vector_bytes = vector_u8<A>;
436
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
454
437
 
455
438
  /**
456
439
  * This method serializes the sketch as a vector of bytes.
@@ -458,53 +441,36 @@ class kll_sketch {
458
441
  * It is a blank space of a given size.
459
442
  * This header is used in Datasketches PostgreSQL extension.
460
443
  * @param header_size_bytes space to reserve in front of the sketch
461
- * @param instance of a SerDe
444
+ * @param sd instance of a SerDe
462
445
  * @return serialized sketch as a vector of bytes
463
446
  */
464
- template<typename SerDe = S>
447
+ template<typename SerDe = serde<T>>
465
448
  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
466
449
 
467
450
  /**
468
451
  * This method deserializes a sketch from a given stream.
469
452
  * @param is input stream
470
- * @param allocator instance of an Allocator
471
- * @return an instance of a sketch
472
- *
473
- * Deprecated, to be removed in the next major version
474
- */
475
- static kll_sketch deserialize(std::istream& is, const A& allocator = A());
476
-
477
- /**
478
- * This method deserializes a sketch from a given stream.
479
- * @param is input stream
480
- * @param serde instance of a SerDe
453
+ * @param sd instance of a SerDe
454
+ * @param comparator instance of a Comparator
481
455
  * @param allocator instance of an Allocator
482
456
  * @return an instance of a sketch
483
457
  */
484
- template<typename SerDe = S>
485
- static kll_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
458
+ template<typename SerDe = serde<T>>
459
+ static kll_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
460
+ const C& comparator = C(), const A& allocator = A());
486
461
 
487
462
  /**
488
463
  * This method deserializes a sketch from a given array of bytes.
489
464
  * @param bytes pointer to the array of bytes
490
465
  * @param size the size of the array
466
+ * @param sd instance of a SerDe
467
+ * @param comparator instance of a Comparator
491
468
  * @param allocator instance of an Allocator
492
469
  * @return an instance of a sketch
493
- *
494
- * Deprecated, to be removed in the next major version
495
470
  */
496
- static kll_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());
497
-
498
- /**
499
- * This method deserializes a sketch from a given array of bytes.
500
- * @param bytes pointer to the array of bytes
501
- * @param size the size of the array
502
- * @param serde instance of a SerDe
503
- * @param allocator instance of an Allocator
504
- * @return an instance of a sketch
505
- */
506
- template<typename SerDe = S>
507
- static kll_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
471
+ template<typename SerDe = serde<T>>
472
+ static kll_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
473
+ const C& comparator = C(), const A& allocator = A());
508
474
 
509
475
  /*
510
476
  * Gets the normalized rank error given k and pmf.
@@ -526,14 +492,7 @@ class kll_sketch {
526
492
  const_iterator begin() const;
527
493
  const_iterator end() const;
528
494
 
529
- template<bool inclusive = false>
530
- quantile_sketch_sorted_view<T, C, A> get_sorted_view(bool cumulative) const;
531
-
532
- #ifdef KLL_VALIDATION
533
- uint8_t get_num_levels() { return num_levels_; }
534
- uint32_t* get_levels() { return levels_; }
535
- T* get_items() { return items_; }
536
- #endif
495
+ quantiles_sorted_view<T, C, A> get_sorted_view() const;
537
496
 
538
497
  private:
539
498
  /* Serialized sketch layout:
@@ -559,28 +518,30 @@ class kll_sketch {
559
518
  static const uint8_t PREAMBLE_INTS_SHORT = 2; // for empty and single item
560
519
  static const uint8_t PREAMBLE_INTS_FULL = 5;
561
520
 
521
+ C comparator_;
562
522
  A allocator_;
563
523
  uint16_t k_;
564
524
  uint8_t m_; // minimum buffer "width"
565
525
  uint16_t min_k_; // for error estimation after merging with different k
566
- uint64_t n_;
567
526
  uint8_t num_levels_;
568
- vector_u32<A> levels_;
527
+ bool is_level_zero_sorted_;
528
+ uint64_t n_;
529
+ vector_u32 levels_;
569
530
  T* items_;
570
531
  uint32_t items_size_;
571
- T* min_value_;
572
- T* max_value_;
573
- bool is_level_zero_sorted_;
532
+ T* min_item_;
533
+ T* max_item_;
534
+ mutable quantiles_sorted_view<T, C, A>* sorted_view_;
574
535
 
575
536
  // for deserialization
576
537
  class item_deleter;
577
538
  class items_deleter;
578
- kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32<A>&& levels,
579
- std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_value,
580
- std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted);
539
+ kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
540
+ std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_item,
541
+ std::unique_ptr<T, item_deleter> max_item, bool is_level_zero_sorted, const C& comparator);
581
542
 
582
543
  // common update code
583
- inline void update_min_max(const T& value);
544
+ inline void update_min_max(const T& item);
584
545
  inline uint32_t internal_update();
585
546
 
586
547
  // The following code is only valid in the special case of exactly reaching capacity while updating.
@@ -591,15 +552,6 @@ class kll_sketch {
591
552
  void add_empty_top_level_to_completely_full_sketch();
592
553
  void sort_level_zero();
593
554
 
594
- template<bool inclusive>
595
- vector_d<A> get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const;
596
- template<bool inclusive>
597
- void increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
598
- const T* split_points, uint32_t size, double* buckets) const;
599
- template<bool inclusive>
600
- void increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
601
- const T* split_points, uint32_t size, double* buckets) const;
602
-
603
555
  template<typename O> void merge_higher_levels(O&& other, uint64_t final_n);
604
556
 
605
557
  template<typename FwdSk>
@@ -616,43 +568,34 @@ class kll_sketch {
616
568
 
617
569
  void check_sorting() const;
618
570
 
619
- // implementations for floating point types
620
571
  template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
621
- static const TT& get_invalid_value() {
622
- static TT value = std::numeric_limits<TT>::quiet_NaN();
623
- return value;
572
+ static inline bool check_update_item(TT item) {
573
+ return !std::isnan(item);
624
574
  }
625
575
 
626
- template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
627
- static inline bool check_update_value(TT value) {
628
- return !std::isnan(value);
629
- }
630
-
631
- // implementations for all other types
632
576
  template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
633
- static const TT& get_invalid_value() {
634
- throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of value");
635
- }
636
-
637
- template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
638
- static inline bool check_update_value(TT) {
577
+ static inline bool check_update_item(TT) {
639
578
  return true;
640
579
  }
641
580
 
642
581
  // for type converting constructor
643
- template<typename TT, typename CC, typename SS, typename AA>
644
- friend class kll_sketch;
582
+ template<typename TT, typename CC, typename AA> friend class kll_sketch;
583
+
584
+ void setup_sorted_view() const; // modifies mutable state
585
+ void reset_sorted_view();
645
586
  };
646
587
 
647
- template<typename T, typename C, typename S, typename A>
648
- class kll_sketch<T, C, S, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
588
+ template<typename T, typename C, typename A>
589
+ class kll_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
649
590
  public:
650
- friend class kll_sketch<T, C, S, A>;
591
+ using value_type = std::pair<const T&, const uint64_t>;
592
+ friend class kll_sketch<T, C, A>;
651
593
  const_iterator& operator++();
652
594
  const_iterator& operator++(int);
653
595
  bool operator==(const const_iterator& other) const;
654
596
  bool operator!=(const const_iterator& other) const;
655
- const std::pair<const T&, const uint64_t> operator*() const;
597
+ const value_type operator*() const;
598
+ const return_value_holder<value_type> operator->() const;
656
599
  private:
657
600
  const T* items;
658
601
  const uint32_t* levels;