datasketches 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  10. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  11. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  12. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  13. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  14. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  15. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  16. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  17. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  18. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  19. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  20. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  21. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  22. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  24. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  25. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  26. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  27. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  28. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  29. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  30. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  31. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  32. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  34. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  35. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  36. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  37. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  38. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  39. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  40. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  41. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  42. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  43. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  44. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  45. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  46. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  47. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  48. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  49. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  50. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  51. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  52. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  53. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  54. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  55. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  56. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  57. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  58. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  59. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  60. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  61. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  62. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  63. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  64. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  65. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  68. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  69. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  70. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  72. data/vendor/datasketches-cpp/setup.py +14 -2
  73. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  74. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  75. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  76. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  78. data/vendor/datasketches-cpp/tox.ini +26 -0
  79. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  80. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  81. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  82. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  83. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  84. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  85. metadata +14 -5
  86. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -20,14 +20,12 @@
20
20
  #ifndef KLL_SKETCH_HPP_
21
21
  #define KLL_SKETCH_HPP_
22
22
 
23
- #include <functional>
24
23
  #include <memory>
25
24
  #include <vector>
26
- #include <cmath>
27
25
 
28
- #include "quantile_sketch_sorted_view.hpp"
29
26
  #include "common_defs.hpp"
30
27
  #include "serde.hpp"
28
+ #include "quantiles_sorted_view.hpp"
31
29
 
32
30
  namespace datasketches {
33
31
 
@@ -37,9 +35,9 @@ namespace datasketches {
37
35
  * See <a href="https://arxiv.org/abs/1603.05346v2">Optimal Quantile Approximation in Streams</a>.
38
36
  *
39
37
  * <p>This is a stochastic streaming sketch that enables near real-time analysis of the
40
- * approximate distribution of values from a very large stream in a single pass, requiring only
41
- * that the values are comparable.
42
- * The analysis is obtained using <i>get_quantile()</i> or <i>get_quantiles()</i> functions or the
38
+ * approximate distribution of items from a very large stream in a single pass, requiring only
39
+ * that the items are comparable.
40
+ * The analysis is obtained using <i>get_quantile()</i> function or the
43
41
  * inverse functions get_rank(), get_PMF() (Probability Mass Function), and get_CDF()
44
42
  * (Cumulative Distribution Function).
45
43
  *
@@ -47,14 +45,15 @@ namespace datasketches {
47
45
  * with the equivalent Java implementation only when template parameter T = float
48
46
  * (32-bit single precision values).
49
47
  *
50
- * <p>Given an input stream of <i>N</i> numeric values, the <i>absolute rank</i> of any specific
51
- * value is defined as its index <i>(0 to N-1)</i> in the hypothetical sorted stream of all
52
- * <i>N</i> input values.
48
+ * <p>Given an input stream of <i>N</i> items, the <i>natural rank</i> of any specific
49
+ * item is defined as its index <i>(1 to N)</i> in inclusive mode
50
+ * or <i>(0 to N-1)</i> in exclusive mode
51
+ * in the hypothetical sorted stream of all <i>N</i> input items.
53
52
  *
54
- * <p>The <i>normalized rank</i> (<i>rank</i>) of any specific value is defined as its
55
- * <i>absolute rank</i> divided by <i>N</i>.
56
- * Thus, the <i>normalized rank</i> is a value between zero and one.
57
- * In the documentation for this sketch <i>absolute rank</i> is never used so any
53
+ * <p>The <i>normalized rank</i> (<i>rank</i>) of any specific item is defined as its
54
+ * <i>natural rank</i> divided by <i>N</i>.
55
+ * Thus, the <i>normalized rank</i> is between zero and one.
56
+ * In the documentation for this sketch <i>natural rank</i> is never used so any
58
57
  * reference to just <i>rank</i> should be interpreted to mean <i>normalized rank</i>.
59
58
  *
60
59
  * <p>This sketch is configured with a parameter <i>k</i>, which affects the size of the sketch
@@ -63,18 +62,18 @@ namespace datasketches {
63
62
  * <p>The estimation error is commonly called <i>epsilon</i> (or <i>eps</i>) and is a fraction
64
63
  * between zero and one. Larger values of <i>k</i> result in smaller values of epsilon.
65
64
  * Epsilon is always with respect to the rank and cannot be applied to the
66
- * corresponding values.
65
+ * corresponding items.
67
66
  *
68
- * <p>The relationship between the normalized rank and the corresponding values can be viewed
67
+ * <p>The relationship between the normalized rank and the corresponding items can be viewed
69
68
  * as a two dimensional monotonic plot with the normalized rank on one axis and the
70
- * corresponding values on the other axis. If the y-axis is specified as the value-axis and
69
+ * corresponding items on the other axis. If the y-axis is specified as the item-axis and
71
70
  * the x-axis as the normalized rank, then <i>y = get_quantile(x)</i> is a monotonically
72
71
  * increasing function.
73
72
  *
74
- * <p>The functions <i>get_quantile(rank)</i> and get_quantiles(...) translate ranks into
75
- * corresponding values. The functions <i>get_rank(value),
73
+ * <p>The function <i>get_quantile(rank)</i> translates ranks into
74
+ * corresponding quantiles. The functions <i>get_rank(item),
76
75
  * get_CDF(...) (Cumulative Distribution Function), and get_PMF(...)
77
- * (Probability Mass Function)</i> perform the opposite operation and translate values into ranks.
76
+ * (Probability Mass Function)</i> perform the opposite operation and translate items into ranks.
78
77
  *
79
78
  * <p>The <i>get_PMF(...)</i> function has about 13 to 47% worse rank error (depending
80
79
  * on <i>k</i>) than the other queries because the mass of each "bin" of the PMF has
@@ -86,60 +85,60 @@ namespace datasketches {
86
85
  *
87
86
  * <p>A <i>get_quantile(rank)</i> query has the following guarantees:
88
87
  * <ul>
89
- * <li>Let <i>v = get_quantile(r)</i> where <i>r</i> is the rank between zero and one.</li>
90
- * <li>The value <i>v</i> will be a value from the input stream.</li>
91
- * <li>Let <i>trueRank</i> be the true rank of <i>v</i> derived from the hypothetical sorted
92
- * stream of all <i>N</i> values.</li>
88
+ * <li>Let <i>q = get_quantile(r)</i> where <i>r</i> is the rank between zero and one.</li>
89
+ * <li>The quantile <i>q</i> will be an item from the input stream.</li>
90
+ * <li>Let <i>trueRank</i> be the true rank of <i>q</i> derived from the hypothetical sorted
91
+ * stream of all <i>N</i> items.</li>
93
92
  * <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
94
93
  * <li>Then <i>r - eps &le; trueRank &le; r + eps</i> with a confidence of 99%. Note that the
95
- * error is on the rank, not the value.</li>
94
+ * error is on the rank, not the quantile.</li>
96
95
  * </ul>
97
96
  *
98
- * <p>A <i>get_rank(value)</i> query has the following guarantees:
97
+ * <p>A <i>get_rank(item)</i> query has the following guarantees:
99
98
  * <ul>
100
- * <li>Let <i>r = get_rank(v)</i> where <i>v</i> is a value between the min and max values of
99
+ * <li>Let <i>r = get_rank(i)</i> where <i>i</i> is an item between the min and max items of
101
100
  * the input stream.</li>
102
- * <li>Let <i>true_rank</i> be the true rank of <i>v</i> derived from the hypothetical sorted
103
- * stream of all <i>N</i> values.</li>
101
+ * <li>Let <i>true_rank</i> be the true rank of <i>i</i> derived from the hypothetical sorted
102
+ * stream of all <i>N</i> items.</li>
104
103
  * <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
105
104
  * <li>Then <i>r - eps &le; trueRank &le; r + eps</i> with a confidence of 99%.</li>
106
105
  * </ul>
107
106
  *
108
107
  * <p>A <i>get_PMF()</i> query has the following guarantees:
109
108
  * <ul>
110
- * <li>Let <i>{r1, r2, ..., r(m+1)} = get_PMF(v1, v2, ..., vm)</i> where <i>v1, v2</i> are values
111
- * between the min and max values of the input stream.
112
- * <li>Let <i>mass<sub>i</sub> = estimated mass between v<sub>i</sub> and v<sub>i+1</sub></i>.</li>
113
- * <li>Let <i>trueMass</i> be the true mass between the values of <i>v<sub>i</sub>,
114
- * v<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> values.</li>
109
+ * <li>Let <i>{r1, r2, ..., r(m+1)} = get_PMF(s1, s2, ..., sm)</i> where <i>s1, s2</i> are
110
+ * split points (items from the input domain) between the min and max items of the input stream.</li>
111
+ * <li>Let <i>mass<sub>i</sub> = estimated mass between s<sub>i</sub> and s<sub>i+1</sub></i>.</li>
112
+ * <li>Let <i>trueMass</i> be the true mass between the items of <i>s<sub>i</sub>,
113
+ * s<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> items.</li>
115
114
  * <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
116
115
  * <li>then <i>mass - eps &le; trueMass &le; mass + eps</i> with a confidence of 99%.</li>
117
- * <li>r(m+1) includes the mass of all points larger than vm.</li>
116
+ * <li>r(m+1) includes the mass of all points larger than sm.</li>
118
117
  * </ul>
119
118
  *
120
119
  * <p>A <i>get_CDF(...)</i> query has the following guarantees:
121
120
  * <ul>
122
- * <li>Let <i>{r1, r2, ..., r(m+1)} = get_CDF(v1, v2, ..., vm)</i> where <i>v1, v2</i> are values
123
- * between the min and max values of the input stream.
121
+ * <li>Let <i>{r1, r2, ..., r(m+1)} = get_CDF(s1, s2, ..., sm)</i> where <i>s1, s2, ...</i> are
122
+ * split points (items from the input domain) between the min and max items of the input stream.</li>
124
123
  * <li>Let <i>mass<sub>i</sub> = r<sub>i+1</sub> - r<sub>i</sub></i>.</li>
125
- * <li>Let <i>trueMass</i> be the true mass between the true ranks of <i>v<sub>i</sub>,
126
- * v<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> values.</li>
124
+ * <li>Let <i>trueMass</i> be the true mass between the true ranks of <i>s<sub>i</sub>,
125
+ * s<sub>i+1</sub></i> derived from the hypothetical sorted stream of all <i>N</i> items.</li>
127
126
  * <li>Let <i>eps = get_normalized_rank_error(true)</i>.</li>
128
127
  * <li>then <i>mass - eps &le; trueMass &le; mass + eps</i> with a confidence of 99%.</li>
129
- * <li>1 - r(m+1) includes the mass of all points larger than vm.</li>
128
+ * <li>1 - r(m+1) includes the mass of all points larger than sm.</li>
130
129
  * </ul>
131
130
  *
132
131
  * <p>From the above, it might seem like we could make some estimates to bound the
133
- * <em>value</em> returned from a call to <em>get_quantile()</em>. The sketch, however, does not
134
- * let us derive error bounds or confidences around values. Because errors are independent, we
132
+ * <em>item</em> returned from a call to <em>get_quantile()</em>. The sketch, however, does not
133
+ * let us derive error bounds or confidences around items. Because errors are independent, we
135
134
  * can approximately bracket a value as shown below, but there are no error estimates available.
136
135
  * Additionally, the interval may be quite large for certain distributions.
137
136
  * <ul>
138
- * <li>Let <i>v = get_quantile(r)</i>, the estimated quantile value of rank <i>r</i>.</li>
137
+ * <li>Let <i>q = get_quantile(r)</i>, the estimated quantile of rank <i>r</i>.</li>
139
138
  * <li>Let <i>eps = get_normalized_rank_error(false)</i>.</li>
140
- * <li>Let <i>v<sub>lo</sub></i> = estimated quantile value of rank <i>(r - eps)</i>.</li>
141
- * <li>Let <i>v<sub>hi</sub></i> = estimated quantile value of rank <i>(r + eps)</i>.</li>
142
- * <li>Then <i>v<sub>lo</sub> &le; v &le; v<sub>hi</sub></i>, with 99% confidence.</li>
139
+ * <li>Let <i>q<sub>lo</sub></i> = estimated quantile of rank <i>(r - eps)</i>.</li>
140
+ * <li>Let <i>q<sub>hi</sub></i> = estimated quantile of rank <i>(r + eps)</i>.</li>
141
+ * <li>Then <i>q<sub>lo</sub> &le; q &le; q<sub>hi</sub></i>, with 99% confidence.</li>
143
142
  * </ul>
144
143
  *
145
144
  * author Kevin Lang
@@ -147,13 +146,6 @@ namespace datasketches {
147
146
  * author Lee Rhodes
148
147
  */
149
148
 
150
- template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
151
- template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
152
- template<typename A> using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
153
- template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
154
- template<typename A> using AllocD = typename std::allocator_traits<A>::template rebind_alloc<double>;
155
- template<typename A> using vector_d = std::vector<double, AllocD<A>>;
156
-
157
149
  namespace kll_constants {
158
150
  const uint16_t DEFAULT_K = 200;
159
151
  }
@@ -161,21 +153,19 @@ namespace kll_constants {
161
153
  template <
162
154
  typename T,
163
155
  typename C = std::less<T>, // strict weak ordering function (see C++ named requirements: Compare)
164
- typename S = serde<T>, // deprecated, to be removed in the next major version
165
156
  typename A = std::allocator<T>
166
157
  >
167
158
  class kll_sketch {
168
159
  public:
169
160
  using value_type = T;
170
161
  using comparator = C;
162
+ using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
171
163
 
172
164
  static const uint8_t DEFAULT_M = 8;
173
- // TODO: Redundant and deprecated. Will be removed in next major version.
174
- static const uint16_t DEFAULT_K = kll_constants::DEFAULT_K;
175
165
  static const uint16_t MIN_K = DEFAULT_M;
176
166
  static const uint16_t MAX_K = (1 << 16) - 1;
177
167
 
178
- explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const A& allocator = A());
168
+ explicit kll_sketch(uint16_t k = kll_constants::DEFAULT_K, const C& comparator = C(), const A& allocator = A());
179
169
  kll_sketch(const kll_sketch& other);
180
170
  kll_sketch(kll_sketch&& other) noexcept;
181
171
  ~kll_sketch();
@@ -185,17 +175,18 @@ class kll_sketch {
185
175
  /*
186
176
  * Type converting constructor.
187
177
  * @param other sketch of a different type
178
+ * @param comparator instance of a Comparator
188
179
  * @param allocator instance of an Allocator
189
180
  */
190
- template<typename TT, typename CC, typename SS, typename AA>
191
- explicit kll_sketch(const kll_sketch<TT, CC, SS, AA>& other, const A& allocator = A());
181
+ template<typename TT, typename CC, typename AA>
182
+ explicit kll_sketch(const kll_sketch<TT, CC, AA>& other, const C& comparator = C(), const A& allocator = A());
192
183
 
193
184
  /**
194
185
  * Updates this sketch with the given data item.
195
- * @param value an item from a stream of items
186
+ * @param item from a stream of items
196
187
  */
197
188
  template<typename FwdT>
198
- void update(FwdT&& value);
189
+ void update(FwdT&& item);
199
190
 
200
191
  /**
201
192
  * Merges another sketch into this one.
@@ -235,20 +226,18 @@ class kll_sketch {
235
226
  bool is_estimation_mode() const;
236
227
 
237
228
  /**
238
- * Returns the min value of the stream.
239
- * For floating point types: if the sketch is empty this returns NaN.
240
- * For other types: if the sketch is empty this throws runtime_error.
241
- * @return the min value of the stream
229
+ * Returns the min item of the stream.
230
+ * If the sketch is empty this throws std::runtime_error.
231
+ * @return the min item of the stream
242
232
  */
243
- T get_min_value() const;
233
+ T get_min_item() const;
244
234
 
245
235
  /**
246
- * Returns the max value of the stream.
247
- * For floating point types: if the sketch is empty this returns NaN.
248
- * For other types: if the sketch is empty this throws runtime_error.
249
- * @return the max value of the stream
236
+ * Returns the max item of the stream.
237
+ * If the sketch is empty this throws std::runtime_error.
238
+ * @return the max item of the stream
250
239
  */
251
- T get_max_value() const;
240
+ T get_max_item() const;
252
241
 
253
242
  /**
254
243
  * Returns an instance of the comparator for this sketch.
@@ -257,134 +246,128 @@ class kll_sketch {
257
246
  C get_comparator() const;
258
247
 
259
248
  /**
260
- * Returns an approximation to the value of the data item
261
- * that would be preceded by the given fraction of a hypothetical sorted
262
- * version of the input stream so far.
263
- * <p>
264
- * Note that this method has a fairly large overhead (microseconds instead of nanoseconds)
265
- * so it should not be called multiple times to get different quantiles from the same
266
- * sketch. Instead use get_quantiles(), which pays the overhead only once.
267
- * <p>
268
- * For floating point types: if the sketch is empty this returns NaN.
269
- * For other types: if the sketch is empty this throws runtime_error.
249
+ * Returns an instance of the allocator for this sketch.
250
+ * @return allocator
251
+ */
252
+ A get_allocator() const;
253
+
254
+ /**
255
+ * Returns an item from the sketch that is the best approximation to an item
256
+ * from the original stream with the given rank.
257
+ *
258
+ * <p>If the sketch is empty this throws std::runtime_error.
270
259
  *
271
- * @param fraction the specified fractional position in the hypothetical sorted stream.
272
- * These are also called normalized ranks or fractional ranks.
273
- * If fraction = 0.0, the true minimum value of the stream is returned.
274
- * If fraction = 1.0, the true maximum value of the stream is returned.
275
- * If the parameter inclusive=true, the given rank is considered inclusive (includes the weight of an item)
260
+ * @param rank of an item in the hypothetical sorted stream.
261
+ * @param inclusive if true, the given rank is considered inclusive (includes weight of an item)
276
262
  *
277
- * @return the approximation to the value at the given fraction
263
+ * @return approximate quantile associated with the given rank
278
264
  */
279
- using quantile_return_type = typename quantile_sketch_sorted_view<T, C, A>::quantile_return_type;
280
- template<bool inclusive = false>
281
- quantile_return_type get_quantile(double fraction) const;
265
+ using quantile_return_type = typename quantiles_sorted_view<T, C, A>::quantile_return_type;
266
+ quantile_return_type get_quantile(double rank, bool inclusive = true) const;
282
267
 
283
268
  /**
284
- * This is a more efficient multiple-query version of get_quantile().
285
- * <p>
286
269
  * This returns an array that could have been generated by using get_quantile() for each
287
- * fractional rank separately, but would be very inefficient.
288
- * This method incurs the internal set-up overhead once and obtains multiple quantile values in
289
- * a single query. It is strongly recommend that this method be used instead of multiple calls
290
- * to get_quantile().
270
+ * rank separately.
271
+ *
272
+ * <p>If the sketch is empty this throws std::runtime_error.
291
273
  *
292
- * <p>If the sketch is empty this returns an empty vector.
274
+ * @param ranks given array of ranks in the hypothetical sorted stream.
275
+ * These ranks must be in the interval [0.0, 1.0].
276
+ * @param size the number of ranks in the array
277
+ * @param inclusive if true, the given ranks are considered inclusive (include weights of items)
293
278
  *
294
- * @param fractions given array of fractional positions in the hypothetical sorted stream.
295
- * These are also called normalized ranks or fractional ranks.
296
- * These fractions must be in the interval [0.0, 1.0], inclusive.
297
- * If the parameter inclusive=true, the given fractions are considered inclusive (include weights of items)
279
+ * @return array of approximate quantiles corresponding to the given ranks in the same order.
298
280
  *
299
- * @return array of approximations to the given fractions in the same order as given fractions
300
- * in the input array.
281
+ * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
301
282
  */
302
- template<bool inclusive = false>
303
- std::vector<T, A> get_quantiles(const double* fractions, uint32_t size) const;
283
+ std::vector<T, A> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
304
284
 
305
285
  /**
306
286
  * This is a multiple-query version of get_quantile() that allows the caller to
307
- * specify the number of evenly-spaced fractional ranks.
287
+ * specify the number of evenly-spaced ranks.
308
288
  *
309
- * <p>If the sketch is empty this returns an empty vector.
289
+ * <p>If the sketch is empty this throws std::runtime_error.
310
290
  *
311
- * @param num an integer that specifies the number of evenly-spaced fractional ranks.
312
- * This must be an integer greater than 0. A value of 1 will return the min value.
313
- * A value of 2 will return the min and the max value. A value of 3 will return the min,
314
- * the median and the max value, etc.
291
+ * @param num an integer that specifies the number of evenly-spaced ranks.
292
+ * This must be an integer greater than 0. A value of 1 will return the quantile of rank 0.
293
+ * A value of 2 will return quantiles of ranks 0 and 1. A value of 3 will return quantiles of ranks 0,
294
+ * 0.5 (median) and 1, etc.
295
+ * @param inclusive if true, the ranks are considered inclusive (include weights of items)
315
296
  *
316
- * @return array of approximations to the given number of evenly-spaced fractional ranks.
297
+ * @return array of approximate quantiles corresponding to the given number of evenly-spaced ranks.
298
+ *
299
+ * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
317
300
  */
318
- template<bool inclusive = false>
319
- std::vector<T, A> get_quantiles(uint32_t num) const;
301
+ std::vector<T, A> get_quantiles(uint32_t num, bool inclusive = true) const;
320
302
 
321
303
  /**
322
- * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
323
- * inclusive.
324
- * With the template parameter inclusive=true the weight of the given value is included into the rank.
325
- * Otherwise the rank equals the sum of the weights of all values that are less than the given value
326
- * according to the comparator C.
304
+ * Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
327
305
  *
328
306
  * <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
329
307
  * get_normalized_rank_error(false) function.
330
308
  *
331
- * <p>If the sketch is empty this returns NaN.
309
+ * <p>If the sketch is empty this throws std::runtime_error.
310
+ *
311
+ * @param item to be ranked.
312
+ * @param inclusive if true the weight of the given item is included into the rank.
313
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
314
+ * according to the comparator C.
332
315
  *
333
- * @param value to be ranked
334
- * @return an approximate rank of the given value
316
+ * @return an approximate rank of the given item
335
317
  */
336
- template<bool inclusive = false>
337
- double get_rank(const T& value) const;
318
+ double get_rank(const T& item, bool inclusive = true) const;
338
319
 
339
320
  /**
340
321
  * Returns an approximation to the Probability Mass Function (PMF) of the input stream
341
- * given a set of split points (values).
322
+ * given a set of split points (items).
342
323
  *
343
324
  * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
344
325
  * get_normalized_rank_error(true) function.
345
326
  *
346
- * <p>If the sketch is empty this returns an empty vector.
327
+ * <p>If the sketch is empty this throws std::runtime_error.
347
328
  *
348
- * @param split_points an array of <i>m</i> unique, monotonically increasing values
349
- * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
350
- * The definition of an "interval" is inclusive of the left split point (or minimum value) and
351
- * exclusive of the right split point, with the exception that the last interval will include
352
- * the maximum value.
353
- * It is not necessary to include either the min or max values in these split points.
329
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
330
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
331
+ *
332
+ * @param size the number of split points in the array
333
+ *
334
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
335
+ * if the sketch contains items equal to a split point, then in PMF such items are
336
+ * included into the interval to the left of split point. Otherwise they are included into the interval
337
+ * to the right of split point.
354
338
  *
355
339
  * @return an array of m+1 doubles each of which is an approximation
356
- * to the fraction of the input stream values (the mass) that fall into one of those intervals.
357
- * If the template parameter inclusive=false, the definition of an "interval" is inclusive of the left split point and exclusive of the right
358
- * split point, with the exception that the last interval will include the maximum value.
359
- * If the template parameter inclusive=true, the definition of an "interval" is exclusive of the left split point and inclusive of the right
360
- * split point.
340
+ * to the fraction of the input stream items (the mass) that fall into one of those intervals.
361
341
  */
362
- template<bool inclusive = false>
363
- vector_d<A> get_PMF(const T* split_points, uint32_t size) const;
342
+ using vector_double = typename quantiles_sorted_view<T, C, A>::vector_double;
343
+ vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
364
344
 
365
345
  /**
366
346
  * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
367
- * cumulative analog of the PMF, of the input stream given a set of split points (values).
347
+ * cumulative analog of the PMF, of the input stream given a set of split points (items).
368
348
  *
369
349
  * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
370
350
  * get_normalized_rank_error(false) function.
371
351
  *
372
- * <p>If the sketch is empty this returns an empty vector.
352
+ * <p>If the sketch is empty this throws std::runtime_error.
373
353
  *
374
- * @param split_points an array of <i>m</i> unique, monotonically increasing values
354
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
375
355
  * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
376
- * The definition of an "interval" is inclusive of the left split point (or minimum value) and
377
- * exclusive of the right split point, with the exception that the last interval will include
378
- * the maximum value.
379
- * It is not necessary to include either the min or max values in these split points.
380
356
  *
381
- * @return an array of m+1 double values, which are a consecutive approximation to the CDF
357
+ * @param size the number of split points in the array
358
+ *
359
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
360
+ * if the sketch contains items equal to a split point, then in CDF such items are
361
+ * included into the interval to the left of split point. Otherwise they are included into
362
+ * the interval to the right of split point.
363
+ *
364
+ * @return an array of m+1 doubles, which are a consecutive approximation to the CDF
382
365
  * of the input stream given the split_points. The value at array position j of the returned
383
366
  * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
384
- * array.
367
+ * array. This can be viewed as array of ranks of the given split points plus one more value
368
+ * that is always 1.
385
369
  */
386
- template<bool inclusive = false>
387
- vector_d<A> get_CDF(const T* split_points, uint32_t size) const;
370
+ vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
388
371
 
389
372
  /**
390
373
  * Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
@@ -398,19 +381,19 @@ class kll_sketch {
398
381
  /**
399
382
  * Computes size needed to serialize the current state of the sketch.
400
383
  * This version is for fixed-size arithmetic types (integral and floating point).
401
- * @param serde instance of a SerDe
384
+ * @param sd instance of a SerDe
402
385
  * @return size in bytes needed to serialize this sketch
403
386
  */
404
- template<typename TT = T, typename SerDe = S, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
387
+ template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
405
388
  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
406
389
 
407
390
  /**
408
391
  * Computes size needed to serialize the current state of the sketch.
409
392
  * This version is for all other types and can be expensive since every item needs to be looked at.
410
- * @param serde instance of a SerDe
393
+ * @param sd instance of a SerDe
411
394
  * @return size in bytes needed to serialize this sketch
412
395
  */
413
- template<typename TT = T, typename SerDe = S, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
396
+ template<typename TT = T, typename SerDe = serde<T>, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
414
397
  size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
415
398
 
416
399
  /**
@@ -443,14 +426,14 @@ class kll_sketch {
443
426
  /**
444
427
  * This method serializes the sketch into a given stream in a binary form
445
428
  * @param os output stream
446
- * @param instance of a SerDe
429
+ * @param sd instance of a SerDe
447
430
  */
448
- template<typename SerDe = S>
431
+ template<typename SerDe = serde<T>>
449
432
  void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
450
433
 
451
434
  // This is a convenience alias for users
452
435
  // The type returned by the following serialize method
453
- using vector_bytes = vector_u8<A>;
436
+ using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
454
437
 
455
438
  /**
456
439
  * This method serializes the sketch as a vector of bytes.
@@ -458,53 +441,36 @@ class kll_sketch {
458
441
  * It is a blank space of a given size.
459
442
  * This header is used in Datasketches PostgreSQL extension.
460
443
  * @param header_size_bytes space to reserve in front of the sketch
461
- * @param instance of a SerDe
444
+ * @param sd instance of a SerDe
462
445
  * @return serialized sketch as a vector of bytes
463
446
  */
464
- template<typename SerDe = S>
447
+ template<typename SerDe = serde<T>>
465
448
  vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
466
449
 
467
450
  /**
468
451
  * This method deserializes a sketch from a given stream.
469
452
  * @param is input stream
470
- * @param allocator instance of an Allocator
471
- * @return an instance of a sketch
472
- *
473
- * Deprecated, to be removed in the next major version
474
- */
475
- static kll_sketch deserialize(std::istream& is, const A& allocator = A());
476
-
477
- /**
478
- * This method deserializes a sketch from a given stream.
479
- * @param is input stream
480
- * @param serde instance of a SerDe
453
+ * @param sd instance of a SerDe
454
+ * @param comparator instance of a Comparator
481
455
  * @param allocator instance of an Allocator
482
456
  * @return an instance of a sketch
483
457
  */
484
- template<typename SerDe = S>
485
- static kll_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
458
+ template<typename SerDe = serde<T>>
459
+ static kll_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
460
+ const C& comparator = C(), const A& allocator = A());
486
461
 
487
462
  /**
488
463
  * This method deserializes a sketch from a given array of bytes.
489
464
  * @param bytes pointer to the array of bytes
490
465
  * @param size the size of the array
466
+ * @param sd instance of a SerDe
467
+ * @param comparator instance of a Comparator
491
468
  * @param allocator instance of an Allocator
492
469
  * @return an instance of a sketch
493
- *
494
- * Deprecated, to be removed in the next major version
495
470
  */
496
- static kll_sketch deserialize(const void* bytes, size_t size, const A& allocator = A());
497
-
498
- /**
499
- * This method deserializes a sketch from a given array of bytes.
500
- * @param bytes pointer to the array of bytes
501
- * @param size the size of the array
502
- * @param serde instance of a SerDe
503
- * @param allocator instance of an Allocator
504
- * @return an instance of a sketch
505
- */
506
- template<typename SerDe = S>
507
- static kll_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
471
+ template<typename SerDe = serde<T>>
472
+ static kll_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
473
+ const C& comparator = C(), const A& allocator = A());
508
474
 
509
475
  /*
510
476
  * Gets the normalized rank error given k and pmf.
@@ -526,14 +492,7 @@ class kll_sketch {
526
492
  const_iterator begin() const;
527
493
  const_iterator end() const;
528
494
 
529
- template<bool inclusive = false>
530
- quantile_sketch_sorted_view<T, C, A> get_sorted_view(bool cumulative) const;
531
-
532
- #ifdef KLL_VALIDATION
533
- uint8_t get_num_levels() { return num_levels_; }
534
- uint32_t* get_levels() { return levels_; }
535
- T* get_items() { return items_; }
536
- #endif
495
+ quantiles_sorted_view<T, C, A> get_sorted_view() const;
537
496
 
538
497
  private:
539
498
  /* Serialized sketch layout:
@@ -559,28 +518,30 @@ class kll_sketch {
559
518
  static const uint8_t PREAMBLE_INTS_SHORT = 2; // for empty and single item
560
519
  static const uint8_t PREAMBLE_INTS_FULL = 5;
561
520
 
521
+ C comparator_;
562
522
  A allocator_;
563
523
  uint16_t k_;
564
524
  uint8_t m_; // minimum buffer "width"
565
525
  uint16_t min_k_; // for error estimation after merging with different k
566
- uint64_t n_;
567
526
  uint8_t num_levels_;
568
- vector_u32<A> levels_;
527
+ bool is_level_zero_sorted_;
528
+ uint64_t n_;
529
+ vector_u32 levels_;
569
530
  T* items_;
570
531
  uint32_t items_size_;
571
- T* min_value_;
572
- T* max_value_;
573
- bool is_level_zero_sorted_;
532
+ T* min_item_;
533
+ T* max_item_;
534
+ mutable quantiles_sorted_view<T, C, A>* sorted_view_;
574
535
 
575
536
  // for deserialization
576
537
  class item_deleter;
577
538
  class items_deleter;
578
- kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32<A>&& levels,
579
- std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_value,
580
- std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted);
539
+ kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32&& levels,
540
+ std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_item,
541
+ std::unique_ptr<T, item_deleter> max_item, bool is_level_zero_sorted, const C& comparator);
581
542
 
582
543
  // common update code
583
- inline void update_min_max(const T& value);
544
+ inline void update_min_max(const T& item);
584
545
  inline uint32_t internal_update();
585
546
 
586
547
  // The following code is only valid in the special case of exactly reaching capacity while updating.
@@ -591,15 +552,6 @@ class kll_sketch {
591
552
  void add_empty_top_level_to_completely_full_sketch();
592
553
  void sort_level_zero();
593
554
 
594
- template<bool inclusive>
595
- vector_d<A> get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const;
596
- template<bool inclusive>
597
- void increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
598
- const T* split_points, uint32_t size, double* buckets) const;
599
- template<bool inclusive>
600
- void increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
601
- const T* split_points, uint32_t size, double* buckets) const;
602
-
603
555
  template<typename O> void merge_higher_levels(O&& other, uint64_t final_n);
604
556
 
605
557
  template<typename FwdSk>
@@ -616,43 +568,34 @@ class kll_sketch {
616
568
 
617
569
  void check_sorting() const;
618
570
 
619
- // implementations for floating point types
620
571
  template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
621
- static const TT& get_invalid_value() {
622
- static TT value = std::numeric_limits<TT>::quiet_NaN();
623
- return value;
572
+ static inline bool check_update_item(TT item) {
573
+ return !std::isnan(item);
624
574
  }
625
575
 
626
- template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
627
- static inline bool check_update_value(TT value) {
628
- return !std::isnan(value);
629
- }
630
-
631
- // implementations for all other types
632
576
  template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
633
- static const TT& get_invalid_value() {
634
- throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of value");
635
- }
636
-
637
- template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
638
- static inline bool check_update_value(TT) {
577
+ static inline bool check_update_item(TT) {
639
578
  return true;
640
579
  }
641
580
 
642
581
  // for type converting constructor
643
- template<typename TT, typename CC, typename SS, typename AA>
644
- friend class kll_sketch;
582
+ template<typename TT, typename CC, typename AA> friend class kll_sketch;
583
+
584
+ void setup_sorted_view() const; // modifies mutable state
585
+ void reset_sorted_view();
645
586
  };
646
587
 
647
- template<typename T, typename C, typename S, typename A>
648
- class kll_sketch<T, C, S, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
588
+ template<typename T, typename C, typename A>
589
+ class kll_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
649
590
  public:
650
- friend class kll_sketch<T, C, S, A>;
591
+ using value_type = std::pair<const T&, const uint64_t>;
592
+ friend class kll_sketch<T, C, A>;
651
593
  const_iterator& operator++();
652
594
  const_iterator& operator++(int);
653
595
  bool operator==(const const_iterator& other) const;
654
596
  bool operator!=(const const_iterator& other) const;
655
- const std::pair<const T&, const uint64_t> operator*() const;
597
+ const value_type operator*() const;
598
+ const return_value_holder<value_type> operator->() const;
656
599
  private:
657
600
  const T* items;
658
601
  const uint32_t* levels;