datasketches 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  10. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  11. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  12. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  13. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  14. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  15. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  16. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  17. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  18. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  19. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  20. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  21. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  22. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  24. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  25. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  26. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  27. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  28. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  29. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  30. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  31. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  32. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  34. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  35. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  36. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  37. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  38. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  39. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  40. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  41. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  42. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  43. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  44. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  45. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  46. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  47. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  48. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  49. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  50. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  51. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  52. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  53. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  54. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  55. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  56. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  57. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  58. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  59. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  60. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  61. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  62. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  63. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  64. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  65. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  68. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  69. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  70. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  72. data/vendor/datasketches-cpp/setup.py +14 -2
  73. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  74. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  75. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  76. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  78. data/vendor/datasketches-cpp/tox.ini +26 -0
  79. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  80. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  81. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  82. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  83. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  84. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  85. metadata +14 -5
  86. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -24,7 +24,7 @@
24
24
  #include <memory>
25
25
  #include <vector>
26
26
 
27
- #include "quantile_sketch_sorted_view.hpp"
27
+ #include "quantiles_sorted_view.hpp"
28
28
  #include "common_defs.hpp"
29
29
  #include "serde.hpp"
30
30
 
@@ -32,22 +32,21 @@ namespace datasketches {
32
32
 
33
33
  /**
34
34
  * This is a stochastic streaming sketch that enables near-real time analysis of the
35
- * approximate distribution of real values from a very large stream in a single pass.
36
- * The analysis is obtained using a getQuantiles(*) function or its inverse functions the
37
- * Probability Mass Function from getPMF(*) and the Cumulative Distribution Function from getCDF(*).
35
+ * approximate distribution from a very large stream in a single pass.
36
+ * The analysis is obtained using get_rank() and get_quantile() functions,
37
+ * the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF().
38
38
  *
39
39
  * <p>Consider a large stream of one million values such as packet sizes coming into a network node.
40
- * The absolute rank of any specific size value is simply its index in the hypothetical sorted
40
+ * The natural rank of any specific size value is its index in the hypothetical sorted
41
41
  * array of values.
42
- * The normalized rank (or fractional rank) is the absolute rank divided by the stream size,
42
+ * The normalized rank is the natural rank divided by the stream size,
43
43
  * in this case one million.
44
44
  * The value corresponding to the normalized rank of 0.5 represents the 50th percentile or median
45
- * value of the distribution, or getQuantile(0.5). Similarly, the 95th percentile is obtained from
46
- * getQuantile(0.95). Using the getQuantiles(0.0, 1.0) will return the min and max values seen by
47
- * the sketch.</p>
45
+ * value of the distribution, or get_quantile(0.5). Similarly, the 95th percentile is obtained from
46
+ * get_quantile(0.95).</p>
48
47
  *
49
48
  * <p>From the min and max values, for example, 1 and 1000 bytes,
50
- * you can obtain the PMF from getPMF(100, 500, 900) that will result in an array of
49
+ * you can obtain the PMF from get_PMF(100, 500, 900) that will result in an array of
51
50
  * 4 fractional values such as {.4, .3, .2, .1}, which means that
52
51
  * <ul>
53
52
  * <li>40% of the values were &lt; 100,</li>
@@ -55,20 +54,19 @@ namespace datasketches {
55
54
  * <li>20% of the values were &ge; 500 and &lt; 900, and</li>
56
55
  * <li>10% of the values were &ge; 900.</li>
57
56
  * </ul>
58
- * A frequency histogram can be obtained by simply multiplying these fractions by getN(),
57
+ * A frequency histogram can be obtained by multiplying these fractions by get_n(),
59
58
  * which is the total count of values received.
60
- * The getCDF(*) works similarly, but produces the cumulative distribution instead.
59
+ * The get_CDF() works similarly, but produces the cumulative distribution instead.
61
60
  *
62
61
  * <p>As of November 2021, this implementation produces serialized sketches which are binary-compatible
63
62
  * with the equivalent Java implementation only when template parameter T = double
64
63
  * (64-bit double precision values).
65
-
66
64
  *
67
65
  * <p>The accuracy of this sketch is a function of the configured value <i>k</i>, which also affects
68
66
  * the overall size of the sketch. Accuracy of this quantile sketch is always with respect to
69
- * the normalized rank. A <i>k</i> of 128 produces a normalized, rank error of about 1.7%.
70
- * For example, the median value returned from getQuantile(0.5) will be between the actual values
71
- * from the hypothetically sorted array of input values at normalized ranks of 0.483 and 0.517, with
67
+ * the normalized rank. A <i>k</i> of 128 produces a normalized rank error of about 1.7%.
68
+ * For example, the median item returned from get_quantile(0.5) will be between the actual items
69
+ * from the hypothetically sorted array of input items at normalized ranks of 0.483 and 0.517, with
72
70
  * a confidence of about 99%.</p>
73
71
  *
74
72
  * <pre>
@@ -121,17 +119,17 @@ Table Guide for DoublesSketch Size in Bytes and Approximate Error:
121
119
  * by Agarwal, Cormode, Huang, Phillips, Wei, and Yi.
122
120
  * <a href="http://dblp.org/rec/html/journals/tods/AgarwalCHPWY13"></a></p>
123
121
  *
124
- * <p>This algorithm is independent of the distribution of values and
125
- * requires only that the values be comparable.</p
122
+ * <p>This algorithm is independent of the distribution of items and
123
+ * requires only that the items be comparable.</p>
126
124
  *
127
- * <p>This algorithm intentionally inserts randomness into the sampling process for values that
125
+ * <p>This algorithm intentionally inserts randomness into the sampling process for items that
128
126
  * ultimately get retained in the sketch. The results produced by this algorithm are not
129
127
  * deterministic. For example, if the same stream is inserted into two different instances of this
130
128
  * sketch, the answers obtained from the two sketches may not be identical.</p>
131
129
  *
132
- * <p>Similarly, there may be directional inconsistencies. For example, the resulting array of
133
- * values obtained from getQuantiles(fractions[]) input into the reverse directional query
134
- * getPMF(splitPoints[]) may not result in the original fractional values.</p>
130
+ * <p>Similarly, there may be directional inconsistencies. For example, the result
131
+ * obtained from get_quantile(rank) input into the reverse directional query
132
+ * get_rank(item) may not result in the original item.</p>
135
133
  *
136
134
  * @author Kevin Lang
137
135
  * @author Lee Rhodes
@@ -153,9 +151,9 @@ public:
153
151
  using value_type = T;
154
152
  using allocator_type = Allocator;
155
153
  using comparator = Comparator;
156
- using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
157
154
 
158
- explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K, const Allocator& allocator = Allocator());
155
+ explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K,
156
+ const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
159
157
  quantiles_sketch(const quantiles_sketch& other);
160
158
  quantiles_sketch(quantiles_sketch&& other) noexcept;
161
159
  ~quantiles_sketch();
@@ -165,17 +163,19 @@ public:
165
163
  /**
166
164
  * @brief Type converting constructor
167
165
  * @param other quantiles sketch of a different type
166
+ * @param comparator instance of a Comparator
168
167
  * @param allocator instance of an Allocator
169
168
  */
170
169
  template<typename From, typename FC, typename FA>
171
- explicit quantiles_sketch(const quantiles_sketch<From, FC, FA>& other, const Allocator& allocator = Allocator());
170
+ explicit quantiles_sketch(const quantiles_sketch<From, FC, FA>& other,
171
+ const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
172
172
 
173
173
  /**
174
174
  * Updates this sketch with the given data item.
175
- * @param value an item from a stream of items
175
+ * @param item from a stream of items
176
176
  */
177
177
  template<typename FwdT>
178
- void update(FwdT&& value);
178
+ void update(FwdT&& item);
179
179
 
180
180
  /**
181
181
  * Merges another sketch into this one.
@@ -215,20 +215,18 @@ public:
215
215
  bool is_estimation_mode() const;
216
216
 
217
217
  /**
218
- * Returns the min value of the stream.
219
- * For floating point types: if the sketch is empty this returns NaN.
220
- * For other types: if the sketch is empty this throws runtime_error.
221
- * @return the min value of the stream
218
+ * Returns the min item of the stream.
219
+ * If the sketch is empty this throws std::runtime_error.
220
+ * @return the min item of the stream
222
221
  */
223
- const T& get_min_value() const;
222
+ const T& get_min_item() const;
224
223
 
225
224
  /**
226
- * Returns the max value of the stream.
227
- * For floating point types: if the sketch is empty this returns NaN.
228
- * For other types: if the sketch is empty this throws runtime_error.
229
- * @return the max value of the stream
225
+ * Returns the max item of the stream.
226
+ * If the sketch is empty this throws std::runtime_error.
227
+ * @return the max item of the stream
230
228
  */
231
- const T& get_max_value() const;
229
+ const T& get_max_item() const;
232
230
 
233
231
  /**
234
232
  * Returns an instance of the comparator for this sketch.
@@ -243,166 +241,147 @@ public:
243
241
  allocator_type get_allocator() const;
244
242
 
245
243
  /**
246
- * Returns an approximation to the value of the data item
247
- * that would be preceded by the given fraction of a hypothetical sorted
248
- * version of the input stream so far.
249
- * <p>
250
- * Note that this method has a fairly large overhead (microseconds instead of nanoseconds)
251
- * so it should not be called multiple times to get different quantiles from the same
252
- * sketch. Instead use get_quantiles(), which pays the overhead only once.
244
+ * Returns an approximation to the data item associated with the given rank
245
+ * of a hypothetical sorted version of the input stream so far.
253
246
  * <p>
254
- * For floating point types: if the sketch is empty this returns NaN.
255
- * For other types: if the sketch is empty this throws runtime_error.
247
+ * If the sketch is empty this throws std::runtime_error.
256
248
  *
257
- * @param rank the specified fractional position in the hypothetical sorted stream.
258
- * These are also called normalized ranks or fractional ranks.
259
- * If rank = 0.0, the true minimum value of the stream is returned.
260
- * If rank = 1.0, the true maximum value of the stream is returned.
249
+ * @param rank the specified normalized rank in the hypothetical sorted stream.
261
250
  *
262
- * @return the approximation to the value at the given rank
251
+ * @return the approximation to the item at the given rank
263
252
  */
264
- using quantile_return_type = typename quantile_sketch_sorted_view<T, Comparator, Allocator>::quantile_return_type;
265
- template<bool inclusive = false>
266
- quantile_return_type get_quantile(double rank) const;
253
+ using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
254
+ quantile_return_type get_quantile(double rank, bool inclusive = true) const;
267
255
 
268
256
  /**
269
- * This is a more efficient multiple-query version of get_quantile().
257
+ * This is a multiple-query version of get_quantile().
270
258
  * <p>
271
259
  * This returns an array that could have been generated by using get_quantile() for each
272
- * fractional rank separately, but would be very inefficient.
273
- * This method incurs the internal set-up overhead once and obtains multiple quantile values in
274
- * a single query. It is strongly recommend that this method be used instead of multiple calls
275
- * to get_quantile().
260
+ * normalized rank separately.
276
261
  *
277
- * <p>If the sketch is empty this returns an empty vector.
262
+ * <p>If the sketch is empty this throws std::runtime_error.
278
263
  *
279
- * @param fractions given array of fractional positions in the hypothetical sorted stream.
280
- * These are also called normalized ranks or fractional ranks.
281
- * These fractions must be in the interval [0.0, 1.0], inclusive.
264
+ * @param ranks given array of normalized ranks in the hypothetical sorted stream.
265
+ * These ranks must be in the interval [0.0, 1.0], inclusive.
266
+ * @param size the number of ranks in the array
282
267
  *
283
- * @return array of approximations to the given fractions in the same order as given fractions
268
+ * @return array of approximations to items associated with given ranks in the same order as given ranks
284
269
  * in the input array.
270
+ *
271
+ * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
285
272
  */
286
- template<bool inclusive = false>
287
- std::vector<T, Allocator> get_quantiles(const double* fractions, uint32_t size) const;
273
+ std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
288
274
 
289
275
  /**
290
276
  * This is a multiple-query version of get_quantile() that allows the caller to
291
- * specify the number of evenly-spaced fractional ranks.
277
+ * specify the number of evenly-spaced normalized ranks.
278
+ *
279
+ * <p>If the sketch is empty this throws std::runtime_error.
292
280
  *
293
- * <p>If the sketch is empty this returns an empty vector.
281
+ * @param num an integer that specifies the number of evenly-spaced ranks.
282
+ * This must be an integer greater than 0. A value of 1 is equivalent to get_quantiles([0]).
283
+ * A value of 2 is equivalent to get_quantiles([0, 1]). A value of 3 is equivalent to
284
+ * get_quantiles([0, 0.5, 1]), etc.
294
285
  *
295
- * @param num an integer that specifies the number of evenly-spaced fractional ranks.
296
- * This must be an integer greater than 0. A value of 1 will return the min value.
297
- * A value of 2 will return the min and the max value. A value of 3 will return the min,
298
- * the median and the max value, etc.
286
+ * @return array of approximations to items associated with the given number of evenly-spaced normalized ranks.
299
287
  *
300
- * @return array of approximations to the given number of evenly-spaced fractional ranks.
288
+ * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
301
289
  */
302
- template<bool inclusive = false>
303
- std::vector<T, Allocator> get_quantiles(uint32_t num) const;
290
+ std::vector<T, Allocator> get_quantiles(uint32_t num, bool inclusive = true) const;
304
291
 
305
292
  /**
306
- * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
307
- * inclusive. When template parameter <em>inclusive=false</em> (the default), only elements strictly
308
- * less than the provided value are included in the rank estimate. With <em>inclusive=true</em>,
309
- * the rank estimate includes elements less than or equal to the provided value.
293
+ * Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
310
294
  *
311
295
  * <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
312
296
  * get_normalized_rank_error(false) function.
313
297
  *
314
- * <p>If the sketch is empty this returns NaN.
298
+ * <p>If the sketch is empty this throws std::runtime_error.
315
299
  *
316
- * @param value to be ranked
317
- * @return an approximate rank of the given value
300
+ * @param item to be ranked
301
+ * @param inclusive if true the weight of the given item is included into the rank.
302
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
303
+ * according to the comparator C.
304
+ * @return an approximate normalized rank of the given item
318
305
  */
319
- template<bool inclusive = false>
320
- double get_rank(const T& value) const;
306
+ double get_rank(const T& item, bool inclusive = true) const;
321
307
 
322
308
  /**
323
309
  * Returns an approximation to the Probability Mass Function (PMF) of the input stream
324
- * given a set of split points (values).
310
+ * given a set of split points (items).
325
311
  *
326
312
  * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
327
313
  * get_normalized_rank_error(true) function.
328
314
  *
329
- * <p>If the sketch is empty this returns an empty vector.
315
+ * <p>If the sketch is empty this throws std::runtime_error.
330
316
  *
331
- * @param split_points an array of <i>m</i> unique, monotonically increasing values
332
- * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
333
- * If the template parameter <em>inclusive=false</em> (the default), the definition of an "interval"
334
- * is inclusive of the left split point and exclusive of the right
335
- * split point, with the exception that the last interval will include the maximum value.
336
- * If the template parameter <em>inclusive=true</em>, the definition of an "interval" is exclusive of
337
- * the left split point and inclusive of the right split point.
338
- * It is not necessary to include either the min or max values in these split points.
317
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
318
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
319
+ *
320
+ * @param size of the array of split points.
321
+ *
322
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
323
+ * if the sketch contains items equal to a split point, then in PMF such items are
324
+ * included into the interval to the left of split point. Otherwise they are included into the interval
325
+ * to the right of split point.
339
326
  *
340
327
  * @return an array of m+1 doubles each of which is an approximation
341
- * to the fraction of the input stream values (the mass) that fall into one of those intervals.
342
- * When <em>inclusive=false</em> (the default), the definition of an "interval" is inclusive
343
- * of the left split point and exclusive of the right split point, with the exception that the last
344
- * interval will include the maximum value. When <em>inclusive=true</em>,
345
- * an "interval" is exclusive of the left split point and inclusive of the right.
328
+ * to the fraction of the input stream items (the mass) that fall into one of those intervals.
346
329
  */
347
- template<bool inclusive = false>
348
- vector_double get_PMF(const T* split_points, uint32_t size) const;
330
+ using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
331
+ vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
349
332
 
350
333
  /**
351
334
  * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
352
- * cumulative analog of the PMF, of the input stream given a set of split points (values).
335
+ * cumulative analog of the PMF, of the input stream given a set of split points (items).
353
336
  *
354
337
  * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
355
338
  * get_normalized_rank_error(false) function.
356
339
  *
357
- * <p>If the sketch is empty this returns an empty vector.
340
+ * <p>If the sketch is empty this throws std::runtime_error.
358
341
  *
359
- * @param split_points an array of <i>m</i> unique, monotonically increasing values
342
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
360
343
  * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
361
- * If the template parameter <em>inclusive=false</em> (the default), the definition of an "interval" is
362
- * inclusive of the left split point and exclusive of the right
363
- * split point, with the exception that the last interval will include the maximum value.
364
- * If the template parameter <em>inclusive=true</em>, the definition of an "interval" is exclusive of
365
- * the left split point and inclusive of the right split point.
366
- * It is not necessary to include either the min or max values in these split points.
344
+ *
345
+ * @param size of the array of split points.
346
+ *
347
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
348
+ * if the sketch contains items equal to a split point, then in CDF such items are
349
+ * included into the interval to the left of split point. Otherwise they are included into
350
+ * the interval to the right of split point.
367
351
  *
368
352
  * @return an array of m+1 double values, which are a consecutive approximation to the CDF
369
353
  * of the input stream given the split_points. The value at array position j of the returned
370
354
  * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
371
- * array.
372
- * When <em>inclusive=false</em> (the default), the definition of an "interval" is inclusive
373
- * of the left split point and exclusive of the right split point, with the exception that the last
374
- * interval will include the maximum value. When <em>inclusive=true</em>,
375
- * an "interval" is exclusive of the left split point and inclusive of the right.
376
-
355
+ * array. This can be viewed as an array of ranks of the given split points plus one more value
356
+ * that is always 1.
377
357
  */
378
- template<bool inclusive = false>
379
- vector_double get_CDF(const T* split_points, uint32_t size) const;
358
+ vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
380
359
 
381
360
  /**
382
361
  * Computes size needed to serialize the current state of the sketch.
383
362
  * This version is for fixed-size arithmetic types (integral and floating point).
384
- * @param instance of a SerDe
363
+ * @param sd instance of a SerDe
385
364
  * @return size in bytes needed to serialize this sketch
386
365
  */
387
366
  template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
388
- size_t get_serialized_size_bytes(const SerDe& serde = SerDe()) const;
367
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
389
368
 
390
369
  /**
391
370
  * Computes size needed to serialize the current state of the sketch.
392
371
  * This version is for all other types and can be expensive since every item needs to be looked at.
393
- * @param instance of a SerDe
372
+ * @param sd instance of a SerDe
394
373
  * @return size in bytes needed to serialize this sketch
395
374
  */
396
375
  template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
397
- size_t get_serialized_size_bytes(const SerDe& serde = SerDe()) const;
376
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
398
377
 
399
378
  /**
400
379
  * This method serializes the sketch into a given stream in a binary form
401
380
  * @param os output stream
402
- * @param instance of a SerDe
381
+ * @param sd instance of a SerDe
403
382
  */
404
383
  template<typename SerDe = serde<T>>
405
- void serialize(std::ostream& os, const SerDe& serde = SerDe()) const;
384
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
406
385
 
407
386
  // This is a convenience alias for users
408
387
  // The type returned by the following serialize method
@@ -414,32 +393,36 @@ public:
414
393
  * It is a blank space of a given size.
415
394
  * This header is used in Datasketches PostgreSQL extension.
416
395
  * @param header_size_bytes space to reserve in front of the sketch
417
- * @param instance of a SerDe
396
+ * @param sd instance of a SerDe
418
397
  * @return serialized sketch as a vector of bytes
419
398
  */
420
399
  template<typename SerDe = serde<T>>
421
- vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& serde = SerDe()) const;
400
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
422
401
 
423
402
  /**
424
403
  * This method deserializes a sketch from a given stream.
425
404
  * @param is input stream
426
- * @param instance of a SerDe
427
- * @param instance of an Allocator
405
+ * @param sd instance of a SerDe
406
+ * @param comparator instance of a Comparator
407
+ * @param allocator instance of an Allocator
428
408
  * @return an instance of a sketch
429
409
  */
430
410
  template<typename SerDe = serde<T>>
431
- static quantiles_sketch deserialize(std::istream& is, const SerDe& serde = SerDe(), const Allocator& allocator = Allocator());
411
+ static quantiles_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
412
+ const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
432
413
 
433
414
  /**
434
415
  * This method deserializes a sketch from a given array of bytes.
435
416
  * @param bytes pointer to the array of bytes
436
417
  * @param size the size of the array
437
- * @param instance of a SerDe
438
- * @param instance of an Allocator
418
+ * @param sd instance of a SerDe
419
+ * @param comparator instance of a Comparator
420
+ * @param allocator instance of an Allocator
439
421
  * @return an instance of a sketch
440
422
  */
441
423
  template<typename SerDe = serde<T>>
442
- static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& serde = SerDe(), const Allocator& allocator = Allocator());
424
+ static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
425
+ const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
443
426
 
444
427
  /**
445
428
  * Gets the normalized rank error for this sketch. Constants were derived as the best fit to 99 percentile
@@ -471,8 +454,7 @@ public:
471
454
  const_iterator begin() const;
472
455
  const_iterator end() const;
473
456
 
474
- template<bool inclusive = false>
475
- quantile_sketch_sorted_view<T, Comparator, Allocator> get_sorted_view(bool cumulative) const;
457
+ quantiles_sorted_view<T, Comparator, Allocator> get_sorted_view() const;
476
458
 
477
459
  private:
478
460
  using Level = std::vector<T, Allocator>;
@@ -487,7 +469,7 @@ private:
487
469
  * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
488
470
  * 1 ||---------------------------Items Seen Count (N)--------------------------------|
489
471
  *
490
- * Long 3 is the start of data, beginning with serialized min and max values, followed by
472
+ * Long 3 is the start of data, beginning with serialized min and max item, followed by
491
473
  * the sketch data buffers.
492
474
  */
493
475
 
@@ -503,23 +485,28 @@ private:
503
485
  static const uint8_t PREAMBLE_LONGS_FULL = 2;
504
486
  static const size_t DATA_START = 16;
505
487
 
488
+ Comparator comparator_;
506
489
  Allocator allocator_;
490
+ bool is_base_buffer_sorted_;
507
491
  uint16_t k_;
508
492
  uint64_t n_;
509
493
  uint64_t bit_pattern_;
510
494
  Level base_buffer_;
511
495
  VectorLevels levels_;
512
- T* min_value_;
513
- T* max_value_;
514
- bool is_sorted_;
496
+ T* min_item_;
497
+ T* max_item_;
498
+ mutable quantiles_sorted_view<T, Comparator, Allocator>* sorted_view_;
499
+
500
+ void setup_sorted_view() const; // modifies mutable state
501
+ void reset_sorted_view();
515
502
 
516
503
  // for deserialization
517
504
  class item_deleter;
518
505
  class items_deleter;
519
506
  quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
520
507
  Level&& base_buffer, VectorLevels&& levels,
521
- std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value,
522
- bool is_sorted, const Allocator& allocator = Allocator());
508
+ std::unique_ptr<T, item_deleter> min_item, std::unique_ptr<T, item_deleter> max_item,
509
+ bool is_sorted, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
523
510
 
524
511
  void grow_base_buffer();
525
512
  void process_full_base_buffer();
@@ -533,7 +520,7 @@ private:
533
520
  Level& buf_size_2k, bool apply_as_update,
534
521
  quantiles_sketch& sketch);
535
522
  static void zip_buffer(Level& buf_in, Level& buf_out);
536
- static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out);
523
+ static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out, const Comparator& comparator);
537
524
 
538
525
  template<typename SerDe>
539
526
  static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
@@ -549,7 +536,7 @@ private:
549
536
  static uint32_t compute_retained_items(uint16_t k, uint64_t n);
550
537
  static uint32_t compute_base_buffer_items(uint16_t k, uint64_t n);
551
538
  static uint64_t compute_bit_pattern(uint16_t k, uint64_t n);
552
- static uint32_t compute_valid_levels(uint64_t bit_pattern);
539
+ static uint32_t count_valid_levels(uint64_t bit_pattern);
553
540
  static uint8_t compute_levels_needed(uint16_t k, uint64_t n);
554
541
 
555
542
  /**
@@ -580,60 +567,28 @@ private:
580
567
  */
581
568
  static uint8_t lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit);
582
569
 
583
- // implementations for floating point types
584
570
  template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
585
- static const TT& get_invalid_value() {
586
- static TT value = std::numeric_limits<TT>::quiet_NaN();
587
- return value;
571
+ static inline bool check_update_item(TT item) {
572
+ return !std::isnan(item);
588
573
  }
589
574
 
590
- template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
591
- static inline bool check_update_value(TT value) {
592
- return !std::isnan(value);
593
- }
594
-
595
- template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
596
- static inline void check_split_points(const T* values, uint32_t size) {
597
- for (uint32_t i = 0; i < size ; i++) {
598
- if (std::isnan(values[i])) {
599
- throw std::invalid_argument("Values must not be NaN");
600
- }
601
- if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
602
- throw std::invalid_argument("Values must be unique and monotonically increasing");
603
- }
604
- }
605
- }
606
-
607
- // implementations for all other types
608
575
  template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
609
- static const TT& get_invalid_value() {
610
- throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values");
611
- }
612
-
613
- template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
614
- static inline bool check_update_value(TT) {
576
+ static inline bool check_update_item(TT) {
615
577
  return true;
616
578
  }
617
-
618
- template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
619
- static inline void check_split_points(const T* values, uint32_t size) {
620
- for (uint32_t i = 0; i < size ; i++) {
621
- if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
622
- throw std::invalid_argument("Values must be unique and monotonically increasing");
623
- }
624
- }
625
- }
626
579
  };
627
580
 
628
581
 
629
582
  template<typename T, typename C, typename A>
630
583
  class quantiles_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
631
584
  public:
585
+ using value_type = std::pair<const T&, const uint64_t>;
632
586
  const_iterator& operator++();
633
587
  const_iterator& operator++(int);
634
588
  bool operator==(const const_iterator& other) const;
635
589
  bool operator!=(const const_iterator& other) const;
636
- std::pair<const T&, const uint64_t> operator*() const;
590
+ const value_type operator*() const;
591
+ const return_value_holder<value_type> operator->() const;
637
592
  private:
638
593
  friend class quantiles_sketch<T, C, A>;
639
594
  using Level = std::vector<T, A>;