datasketches 0.2.6 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (121) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +9 -0
  3. data/LICENSE +4 -6
  4. data/NOTICE +6 -5
  5. data/ext/datasketches/kll_wrapper.cpp +20 -20
  6. data/ext/datasketches/theta_wrapper.cpp +2 -2
  7. data/lib/datasketches/version.rb +1 -1
  8. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  9. data/vendor/datasketches-cpp/LICENSE +4 -6
  10. data/vendor/datasketches-cpp/MANIFEST.in +21 -4
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  12. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  13. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  14. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  15. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  16. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  17. data/vendor/datasketches-cpp/common/{test/test_runner.cpp → include/version.hpp.in} +15 -8
  18. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +37 -7
  19. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +22 -1
  20. data/vendor/datasketches-cpp/common/test/integration_test.cpp +1 -1
  21. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  22. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +1 -1
  25. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -1
  26. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  27. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  28. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  29. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  30. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +17 -10
  31. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  32. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +55 -42
  33. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -1
  34. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +4 -4
  35. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  36. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  37. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  38. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -1
  39. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +1 -1
  40. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +1 -1
  41. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -1
  42. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +1 -1
  43. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +1 -1
  44. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +1 -1
  45. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -1
  46. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +1 -1
  47. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  48. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  49. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  50. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  51. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +27 -27
  52. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +197 -233
  53. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +42 -32
  54. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  55. data/vendor/datasketches-cpp/pyproject.toml +17 -13
  56. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  57. data/vendor/datasketches-cpp/python/README.md +1 -1
  58. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  59. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  60. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  61. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  62. data/vendor/datasketches-cpp/python/pybind11Path.cmd +19 -1
  63. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  64. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  65. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  66. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  67. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  68. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  69. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  70. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  71. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  72. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  73. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +1 -1
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +20 -19
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +241 -233
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +27 -27
  86. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +117 -104
  87. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  88. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  89. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  91. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  92. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +7 -7
  93. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +3 -3
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +5 -5
  95. data/vendor/datasketches-cpp/setup.py +14 -3
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  97. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  98. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  99. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  100. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +1 -1
  101. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +1 -1
  102. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +1 -1
  103. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +3 -2
  105. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +1 -1
  106. data/vendor/datasketches-cpp/tox.ini +26 -0
  107. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  108. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +41 -35
  109. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  112. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -1
  113. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -1
  114. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  116. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +27 -1
  117. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -1
  118. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  119. metadata +14 -7
  120. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
  121. data/vendor/datasketches-cpp/common/test/catch.hpp +0 -17618
@@ -24,7 +24,7 @@
24
24
  #include <memory>
25
25
  #include <vector>
26
26
 
27
- #include "quantile_sketch_sorted_view.hpp"
27
+ #include "quantiles_sorted_view.hpp"
28
28
  #include "common_defs.hpp"
29
29
  #include "serde.hpp"
30
30
 
@@ -32,22 +32,21 @@ namespace datasketches {
32
32
 
33
33
  /**
34
34
  * This is a stochastic streaming sketch that enables near-real time analysis of the
35
- * approximate distribution of real values from a very large stream in a single pass.
36
- * The analysis is obtained using a getQuantiles(*) function or its inverse functions the
37
- * Probability Mass Function from getPMF(*) and the Cumulative Distribution Function from getCDF(*).
35
+ * approximate distribution from a very large stream in a single pass.
36
+ * The analysis is obtained using get_rank() and get_quantile() functions,
37
+ * the Probability Mass Function from get_PMF() and the Cumulative Distribution Function from get_CDF().
38
38
  *
39
39
  * <p>Consider a large stream of one million values such as packet sizes coming into a network node.
40
- * The absolute rank of any specific size value is simply its index in the hypothetical sorted
40
+ * The natural rank of any specific size value is its index in the hypothetical sorted
41
41
  * array of values.
42
- * The normalized rank (or fractional rank) is the absolute rank divided by the stream size,
42
+ * The normalized rank is the natural rank divided by the stream size,
43
43
  * in this case one million.
44
44
  * The value corresponding to the normalized rank of 0.5 represents the 50th percentile or median
45
- * value of the distribution, or getQuantile(0.5). Similarly, the 95th percentile is obtained from
46
- * getQuantile(0.95). Using the getQuantiles(0.0, 1.0) will return the min and max values seen by
47
- * the sketch.</p>
45
+ * value of the distribution, or get_quantile(0.5). Similarly, the 95th percentile is obtained from
46
+ * get_quantile(0.95).</p>
48
47
  *
49
48
  * <p>From the min and max values, for example, 1 and 1000 bytes,
50
- * you can obtain the PMF from getPMF(100, 500, 900) that will result in an array of
49
+ * you can obtain the PMF from get_PMF(100, 500, 900) that will result in an array of
51
50
  * 4 fractional values such as {.4, .3, .2, .1}, which means that
52
51
  * <ul>
53
52
  * <li>40% of the values were &lt; 100,</li>
@@ -55,20 +54,19 @@ namespace datasketches {
55
54
  * <li>20% of the values were &ge; 500 and &lt; 900, and</li>
56
55
  * <li>10% of the values were &ge; 900.</li>
57
56
  * </ul>
58
- * A frequency histogram can be obtained by simply multiplying these fractions by getN(),
57
+ * A frequency histogram can be obtained by multiplying these fractions by get_n(),
59
58
  * which is the total count of values received.
60
- * The getCDF(*) works similarly, but produces the cumulative distribution instead.
59
+ * The get_CDF() works similarly, but produces the cumulative distribution instead.
61
60
  *
62
61
  * <p>As of November 2021, this implementation produces serialized sketches which are binary-compatible
63
62
  * with the equivalent Java implementation only when template parameter T = double
64
63
  * (64-bit double precision values).
65
-
66
64
  *
67
65
  * <p>The accuracy of this sketch is a function of the configured value <i>k</i>, which also affects
68
66
  * the overall size of the sketch. Accuracy of this quantile sketch is always with respect to
69
- * the normalized rank. A <i>k</i> of 128 produces a normalized, rank error of about 1.7%.
70
- * For example, the median value returned from getQuantile(0.5) will be between the actual values
71
- * from the hypothetically sorted array of input values at normalized ranks of 0.483 and 0.517, with
67
+ * the normalized rank. A <i>k</i> of 128 produces a normalized rank error of about 1.7%.
68
+ * For example, the median item returned from get_quantile(0.5) will be between the actual items
69
+ * from the hypothetically sorted array of input items at normalized ranks of 0.483 and 0.517, with
72
70
  * a confidence of about 99%.</p>
73
71
  *
74
72
  * <pre>
@@ -121,17 +119,17 @@ Table Guide for DoublesSketch Size in Bytes and Approximate Error:
121
119
  * by Agarwal, Cormode, Huang, Phillips, Wei, and Yi.
122
120
  * <a href="http://dblp.org/rec/html/journals/tods/AgarwalCHPWY13"></a></p>
123
121
  *
124
- * <p>This algorithm is independent of the distribution of values and
125
- * requires only that the values be comparable.</p
122
+ * <p>This algorithm is independent of the distribution of items and
123
+ * requires only that the items be comparable.</p>
126
124
  *
127
- * <p>This algorithm intentionally inserts randomness into the sampling process for values that
125
+ * <p>This algorithm intentionally inserts randomness into the sampling process for items that
128
126
  * ultimately get retained in the sketch. The results produced by this algorithm are not
129
127
  * deterministic. For example, if the same stream is inserted into two different instances of this
130
128
  * sketch, the answers obtained from the two sketches may not be identical.</p>
131
129
  *
132
- * <p>Similarly, there may be directional inconsistencies. For example, the resulting array of
133
- * values obtained from getQuantiles(fractions[]) input into the reverse directional query
134
- * getPMF(splitPoints[]) may not result in the original fractional values.</p>
130
+ * <p>Similarly, there may be directional inconsistencies. For example, the result
131
+ * obtained from get_quantile(rank) input into the reverse directional query
132
+ * get_rank(item) may not result in the original item.</p>
135
133
  *
136
134
  * @author Kevin Lang
137
135
  * @author Lee Rhodes
@@ -153,9 +151,9 @@ public:
153
151
  using value_type = T;
154
152
  using allocator_type = Allocator;
155
153
  using comparator = Comparator;
156
- using vector_double = std::vector<double, typename std::allocator_traits<Allocator>::template rebind_alloc<double>>;
157
154
 
158
- explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K, const Allocator& allocator = Allocator());
155
+ explicit quantiles_sketch(uint16_t k = quantiles_constants::DEFAULT_K,
156
+ const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
159
157
  quantiles_sketch(const quantiles_sketch& other);
160
158
  quantiles_sketch(quantiles_sketch&& other) noexcept;
161
159
  ~quantiles_sketch();
@@ -165,17 +163,19 @@ public:
165
163
  /**
166
164
  * @brief Type converting constructor
167
165
  * @param other quantiles sketch of a different type
166
+ * @param comparator instance of a Comparator
168
167
  * @param allocator instance of an Allocator
169
168
  */
170
169
  template<typename From, typename FC, typename FA>
171
- explicit quantiles_sketch(const quantiles_sketch<From, FC, FA>& other, const Allocator& allocator = Allocator());
170
+ explicit quantiles_sketch(const quantiles_sketch<From, FC, FA>& other,
171
+ const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
172
172
 
173
173
  /**
174
174
  * Updates this sketch with the given data item.
175
- * @param value an item from a stream of items
175
+ * @param item from a stream of items
176
176
  */
177
177
  template<typename FwdT>
178
- void update(FwdT&& value);
178
+ void update(FwdT&& item);
179
179
 
180
180
  /**
181
181
  * Merges another sketch into this one.
@@ -215,20 +215,18 @@ public:
215
215
  bool is_estimation_mode() const;
216
216
 
217
217
  /**
218
- * Returns the min value of the stream.
219
- * For floating point types: if the sketch is empty this returns NaN.
220
- * For other types: if the sketch is empty this throws runtime_error.
221
- * @return the min value of the stream
218
+ * Returns the min item of the stream.
219
+ * If the sketch is empty this throws std::runtime_error.
220
+ * @return the min item of the stream
222
221
  */
223
- const T& get_min_value() const;
222
+ const T& get_min_item() const;
224
223
 
225
224
  /**
226
- * Returns the max value of the stream.
227
- * For floating point types: if the sketch is empty this returns NaN.
228
- * For other types: if the sketch is empty this throws runtime_error.
229
- * @return the max value of the stream
225
+ * Returns the max item of the stream.
226
+ * If the sketch is empty this throws std::runtime_error.
227
+ * @return the max item of the stream
230
228
  */
231
- const T& get_max_value() const;
229
+ const T& get_max_item() const;
232
230
 
233
231
  /**
234
232
  * Returns an instance of the comparator for this sketch.
@@ -243,166 +241,147 @@ public:
243
241
  allocator_type get_allocator() const;
244
242
 
245
243
  /**
246
- * Returns an approximation to the value of the data item
247
- * that would be preceded by the given fraction of a hypothetical sorted
248
- * version of the input stream so far.
249
- * <p>
250
- * Note that this method has a fairly large overhead (microseconds instead of nanoseconds)
251
- * so it should not be called multiple times to get different quantiles from the same
252
- * sketch. Instead use get_quantiles(), which pays the overhead only once.
244
+ * Returns an approximation to the data item associated with the given rank
245
+ * of a hypothetical sorted version of the input stream so far.
253
246
  * <p>
254
- * For floating point types: if the sketch is empty this returns NaN.
255
- * For other types: if the sketch is empty this throws runtime_error.
247
+ * If the sketch is empty this throws std::runtime_error.
256
248
  *
257
- * @param rank the specified fractional position in the hypothetical sorted stream.
258
- * These are also called normalized ranks or fractional ranks.
259
- * If rank = 0.0, the true minimum value of the stream is returned.
260
- * If rank = 1.0, the true maximum value of the stream is returned.
249
+ * @param rank the specified normalized rank in the hypothetical sorted stream.
261
250
  *
262
- * @return the approximation to the value at the given rank
251
+ * @return the approximation to the item at the given rank
263
252
  */
264
- using quantile_return_type = typename quantile_sketch_sorted_view<T, Comparator, Allocator>::quantile_return_type;
265
- template<bool inclusive = false>
266
- quantile_return_type get_quantile(double rank) const;
253
+ using quantile_return_type = typename quantiles_sorted_view<T, Comparator, Allocator>::quantile_return_type;
254
+ quantile_return_type get_quantile(double rank, bool inclusive = true) const;
267
255
 
268
256
  /**
269
- * This is a more efficient multiple-query version of get_quantile().
257
+ * This is a multiple-query version of get_quantile().
270
258
  * <p>
271
259
  * This returns an array that could have been generated by using get_quantile() for each
272
- * fractional rank separately, but would be very inefficient.
273
- * This method incurs the internal set-up overhead once and obtains multiple quantile values in
274
- * a single query. It is strongly recommend that this method be used instead of multiple calls
275
- * to get_quantile().
260
+ * normalized rank separately.
276
261
  *
277
- * <p>If the sketch is empty this returns an empty vector.
262
+ * <p>If the sketch is empty this throws std::runtime_error.
278
263
  *
279
- * @param fractions given array of fractional positions in the hypothetical sorted stream.
280
- * These are also called normalized ranks or fractional ranks.
281
- * These fractions must be in the interval [0.0, 1.0], inclusive.
264
+ * @param ranks given array of normalized ranks in the hypothetical sorted stream.
265
+ * These ranks must be in the interval [0.0, 1.0], inclusive.
266
+ * @param size the number of ranks in the array
282
267
  *
283
- * @return array of approximations to the given fractions in the same order as given fractions
268
+ * @return array of approximations to items associated with given ranks in the same order as given ranks
284
269
  * in the input array.
270
+ *
271
+ * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
285
272
  */
286
- template<bool inclusive = false>
287
- std::vector<T, Allocator> get_quantiles(const double* fractions, uint32_t size) const;
273
+ std::vector<T, Allocator> get_quantiles(const double* ranks, uint32_t size, bool inclusive = true) const;
288
274
 
289
275
  /**
290
276
  * This is a multiple-query version of get_quantile() that allows the caller to
291
- * specify the number of evenly-spaced fractional ranks.
277
+ * specify the number of evenly-spaced normalized ranks.
278
+ *
279
+ * <p>If the sketch is empty this throws std::runtime_error.
292
280
  *
293
- * <p>If the sketch is empty this returns an empty vector.
281
+ * @param num an integer that specifies the number of evenly-spaced ranks.
282
+ * This must be an integer greater than 0. A value of 1 is equivalent to get_quantiles([0]).
283
+ * A value of 2 is equivalent to get_quantiles([0, 1]). A value of 3 is equivalent to
284
+ * get_quantiles([0, 0.5, 1]), etc.
294
285
  *
295
- * @param num an integer that specifies the number of evenly-spaced fractional ranks.
296
- * This must be an integer greater than 0. A value of 1 will return the min value.
297
- * A value of 2 will return the min and the max value. A value of 3 will return the min,
298
- * the median and the max value, etc.
286
+ * @return array of approximations to items associated with the given number of evenly-spaced normalized ranks.
299
287
  *
300
- * @return array of approximations to the given number of evenly-spaced fractional ranks.
288
+ * Deprecated. Will be removed in the next major version. Use get_quantile() instead.
301
289
  */
302
- template<bool inclusive = false>
303
- std::vector<T, Allocator> get_quantiles(uint32_t num) const;
290
+ std::vector<T, Allocator> get_quantiles(uint32_t num, bool inclusive = true) const;
304
291
 
305
292
  /**
306
- * Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1,
307
- * inclusive. When template parameter <em>inclusive=false</em> (the default), only elements strictly
308
- * less than the provided value are included in the rank estimate. With <em>inclusive=true</em>,
309
- * the rank estimate includes elements less than or equal to the provided value.
293
+ * Returns an approximation to the normalized rank of the given item from 0 to 1, inclusive.
310
294
  *
311
295
  * <p>The resulting approximation has a probabilistic guarantee that can be obtained from the
312
296
  * get_normalized_rank_error(false) function.
313
297
  *
314
- * <p>If the sketch is empty this returns NaN.
298
+ * <p>If the sketch is empty this throws std::runtime_error.
315
299
  *
316
- * @param value to be ranked
317
- * @return an approximate rank of the given value
300
+ * @param item to be ranked
301
+ * @param inclusive if true the weight of the given item is included into the rank.
302
+ * Otherwise the rank equals the sum of the weights of all items that are less than the given item
303
+ * according to the comparator C.
304
+ * @return an approximate normalized rank of the given item
318
305
  */
319
- template<bool inclusive = false>
320
- double get_rank(const T& value) const;
306
+ double get_rank(const T& item, bool inclusive = true) const;
321
307
 
322
308
  /**
323
309
  * Returns an approximation to the Probability Mass Function (PMF) of the input stream
324
- * given a set of split points (values).
310
+ * given a set of split points (items).
325
311
  *
326
312
  * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
327
313
  * get_normalized_rank_error(true) function.
328
314
  *
329
- * <p>If the sketch is empty this returns an empty vector.
315
+ * <p>If the sketch is empty this throws std::runtime_error.
330
316
  *
331
- * @param split_points an array of <i>m</i> unique, monotonically increasing values
332
- * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
333
- * If the template parameter <em>inclusive=false</em> (the default), the definition of an "interval"
334
- * is inclusive of the left split point and exclusive of the right
335
- * split point, with the exception that the last interval will include the maximum value.
336
- * If the template parameter <em>inclusive=true</em>, the definition of an "interval" is exclusive of
337
- * the left split point and inclusive of the right split point.
338
- * It is not necessary to include either the min or max values in these split points.
317
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
318
+ * that divide the input domain into <i>m+1</i> consecutive disjoint intervals (bins).
319
+ *
320
+ * @param size of the array of split points.
321
+ *
322
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
323
+ * if the sketch contains items equal to a slit point, then in PMF such items are
324
+ * included into the interval to the left of split point. Otherwise they are included into the interval
325
+ * to the right of split point.
339
326
  *
340
327
  * @return an array of m+1 doubles each of which is an approximation
341
- * to the fraction of the input stream values (the mass) that fall into one of those intervals.
342
- * When <em>inclusive=false</em> (the default), the definition of an "interval" is inclusive
343
- * of the left split point and exclusive of the right split point, with the exception that the last
344
- * interval will include the maximum value. When <em>inclusive=true</em>,
345
- * an "interval" is exclusive of the left split point and inclusive of the right.
328
+ * to the fraction of the input stream items (the mass) that fall into one of those intervals.
346
329
  */
347
- template<bool inclusive = false>
348
- vector_double get_PMF(const T* split_points, uint32_t size) const;
330
+ using vector_double = typename quantiles_sorted_view<T, Comparator, Allocator>::vector_double;
331
+ vector_double get_PMF(const T* split_points, uint32_t size, bool inclusive = true) const;
349
332
 
350
333
  /**
351
334
  * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
352
- * cumulative analog of the PMF, of the input stream given a set of split points (values).
335
+ * cumulative analog of the PMF, of the input stream given a set of split points (items).
353
336
  *
354
337
  * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
355
338
  * get_normalized_rank_error(false) function.
356
339
  *
357
- * <p>If the sketch is empty this returns an empty vector.
340
+ * <p>If the sketch is empty this throws std::runtime_error.
358
341
  *
359
- * @param split_points an array of <i>m</i> unique, monotonically increasing values
342
+ * @param split_points an array of <i>m</i> unique, monotonically increasing items
360
343
  * that divide the input domain into <i>m+1</i> consecutive disjoint intervals.
361
- * If the template parameter <em>inclusive=false</em> (the default), the definition of an "interval" is
362
- * inclusive of the left split point and exclusive of the right
363
- * split point, with the exception that the last interval will include the maximum value.
364
- * If the template parameter <em>inclusive=true</em>, the definition of an "interval" is exclusive of
365
- * the left split point and inclusive of the right split point.
366
- * It is not necessary to include either the min or max values in these split points.
344
+ *
345
+ * @param size of the array of split points.
346
+ *
347
+ * @param inclusive if true the rank of an item includes its own weight, and therefore
348
+ * if the sketch contains items equal to a split point, then in CDF such items are
349
+ * included into the interval to the left of split point. Otherwise they are included into
350
+ * the interval to the right of split point.
367
351
  *
368
352
  * @return an array of m+1 double values, which are a consecutive approximation to the CDF
369
353
  * of the input stream given the split_points. The value at array position j of the returned
370
354
  * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
371
- * array.
372
- * When <em>inclusive=false</em> (the default), the definition of an "interval" is inclusive
373
- * of the left split point and exclusive of the right split point, with the exception that the last
374
- * interval will include the maximum value. When <em>inclusive=true</em>,
375
- * an "interval" is exclusive of the left split point and inclusive of the right.
376
-
355
+ * array. This can be viewed as an array of ranks of the given split points plus one more value
356
+ * that is always 1.
377
357
  */
378
- template<bool inclusive = false>
379
- vector_double get_CDF(const T* split_points, uint32_t size) const;
358
+ vector_double get_CDF(const T* split_points, uint32_t size, bool inclusive = true) const;
380
359
 
381
360
  /**
382
361
  * Computes size needed to serialize the current state of the sketch.
383
362
  * This version is for fixed-size arithmetic types (integral and floating point).
384
- * @param instance of a SerDe
363
+ * @param sd instance of a SerDe
385
364
  * @return size in bytes needed to serialize this sketch
386
365
  */
387
366
  template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type = 0>
388
- size_t get_serialized_size_bytes(const SerDe& serde = SerDe()) const;
367
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
389
368
 
390
369
  /**
391
370
  * Computes size needed to serialize the current state of the sketch.
392
371
  * This version is for all other types and can be expensive since every item needs to be looked at.
393
- * @param instance of a SerDe
372
+ * @param sd instance of a SerDe
394
373
  * @return size in bytes needed to serialize this sketch
395
374
  */
396
375
  template<typename SerDe = serde<T>, typename TT = T, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type = 0>
397
- size_t get_serialized_size_bytes(const SerDe& serde = SerDe()) const;
376
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
398
377
 
399
378
  /**
400
379
  * This method serializes the sketch into a given stream in a binary form
401
380
  * @param os output stream
402
- * @param instance of a SerDe
381
+ * @param sd instance of a SerDe
403
382
  */
404
383
  template<typename SerDe = serde<T>>
405
- void serialize(std::ostream& os, const SerDe& serde = SerDe()) const;
384
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
406
385
 
407
386
  // This is a convenience alias for users
408
387
  // The type returned by the following serialize method
@@ -414,32 +393,36 @@ public:
414
393
  * It is a blank space of a given size.
415
394
  * This header is used in Datasketches PostgreSQL extension.
416
395
  * @param header_size_bytes space to reserve in front of the sketch
417
- * @param instance of a SerDe
396
+ * @param sd instance of a SerDe
418
397
  * @return serialized sketch as a vector of bytes
419
398
  */
420
399
  template<typename SerDe = serde<T>>
421
- vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& serde = SerDe()) const;
400
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
422
401
 
423
402
  /**
424
403
  * This method deserializes a sketch from a given stream.
425
404
  * @param is input stream
426
- * @param instance of a SerDe
427
- * @param instance of an Allocator
405
+ * @param sd instance of a SerDe
406
+ * @param comparator instance of a Comparator
407
+ * @param allocator instance of an Allocator
428
408
  * @return an instance of a sketch
429
409
  */
430
410
  template<typename SerDe = serde<T>>
431
- static quantiles_sketch deserialize(std::istream& is, const SerDe& serde = SerDe(), const Allocator& allocator = Allocator());
411
+ static quantiles_sketch deserialize(std::istream& is, const SerDe& sd = SerDe(),
412
+ const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
432
413
 
433
414
  /**
434
415
  * This method deserializes a sketch from a given array of bytes.
435
416
  * @param bytes pointer to the array of bytes
436
417
  * @param size the size of the array
437
- * @param instance of a SerDe
438
- * @param instance of an Allocator
418
+ * @param sd instance of a SerDe
419
+ * @param comparator instance of a Comparator
420
+ * @param allocator instance of an Allocator
439
421
  * @return an instance of a sketch
440
422
  */
441
423
  template<typename SerDe = serde<T>>
442
- static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& serde = SerDe(), const Allocator& allocator = Allocator());
424
+ static quantiles_sketch deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(),
425
+ const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
443
426
 
444
427
  /**
445
428
  * Gets the normalized rank error for this sketch. Constants were derived as the best fit to 99 percentile
@@ -471,8 +454,7 @@ public:
471
454
  const_iterator begin() const;
472
455
  const_iterator end() const;
473
456
 
474
- template<bool inclusive = false>
475
- quantile_sketch_sorted_view<T, Comparator, Allocator> get_sorted_view(bool cumulative) const;
457
+ quantiles_sorted_view<T, Comparator, Allocator> get_sorted_view() const;
476
458
 
477
459
  private:
478
460
  using Level = std::vector<T, Allocator>;
@@ -487,7 +469,7 @@ private:
487
469
  * || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
488
470
  * 1 ||---------------------------Items Seen Count (N)--------------------------------|
489
471
  *
490
- * Long 3 is the start of data, beginning with serialized min and max values, followed by
472
+ * Long 3 is the start of data, beginning with serialized min and max item, followed by
491
473
  * the sketch data buffers.
492
474
  */
493
475
 
@@ -503,23 +485,28 @@ private:
503
485
  static const uint8_t PREAMBLE_LONGS_FULL = 2;
504
486
  static const size_t DATA_START = 16;
505
487
 
488
+ Comparator comparator_;
506
489
  Allocator allocator_;
490
+ bool is_base_buffer_sorted_;
507
491
  uint16_t k_;
508
492
  uint64_t n_;
509
493
  uint64_t bit_pattern_;
510
494
  Level base_buffer_;
511
495
  VectorLevels levels_;
512
- T* min_value_;
513
- T* max_value_;
514
- bool is_sorted_;
496
+ T* min_item_;
497
+ T* max_item_;
498
+ mutable quantiles_sorted_view<T, Comparator, Allocator>* sorted_view_;
499
+
500
+ void setup_sorted_view() const; // modifies mutable state
501
+ void reset_sorted_view();
515
502
 
516
503
  // for deserialization
517
504
  class item_deleter;
518
505
  class items_deleter;
519
506
  quantiles_sketch(uint16_t k, uint64_t n, uint64_t bit_pattern,
520
507
  Level&& base_buffer, VectorLevels&& levels,
521
- std::unique_ptr<T, item_deleter> min_value, std::unique_ptr<T, item_deleter> max_value,
522
- bool is_sorted, const Allocator& allocator = Allocator());
508
+ std::unique_ptr<T, item_deleter> min_item, std::unique_ptr<T, item_deleter> max_item,
509
+ bool is_sorted, const Comparator& comparator = Comparator(), const Allocator& allocator = Allocator());
523
510
 
524
511
  void grow_base_buffer();
525
512
  void process_full_base_buffer();
@@ -533,7 +520,7 @@ private:
533
520
  Level& buf_size_2k, bool apply_as_update,
534
521
  quantiles_sketch& sketch);
535
522
  static void zip_buffer(Level& buf_in, Level& buf_out);
536
- static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out);
523
+ static void merge_two_size_k_buffers(Level& arr_in_1, Level& arr_in_2, Level& arr_out, const Comparator& comparator);
537
524
 
538
525
  template<typename SerDe>
539
526
  static Level deserialize_array(std::istream& is, uint32_t num_items, uint32_t capcacity, const SerDe& serde, const Allocator& allocator);
@@ -549,7 +536,7 @@ private:
549
536
  static uint32_t compute_retained_items(uint16_t k, uint64_t n);
550
537
  static uint32_t compute_base_buffer_items(uint16_t k, uint64_t n);
551
538
  static uint64_t compute_bit_pattern(uint16_t k, uint64_t n);
552
- static uint32_t compute_valid_levels(uint64_t bit_pattern);
539
+ static uint32_t count_valid_levels(uint64_t bit_pattern);
553
540
  static uint8_t compute_levels_needed(uint16_t k, uint64_t n);
554
541
 
555
542
  /**
@@ -580,60 +567,28 @@ private:
580
567
  */
581
568
  static uint8_t lowest_zero_bit_starting_at(uint64_t bits, uint8_t starting_bit);
582
569
 
583
- // implementations for floating point types
584
570
  template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
585
- static const TT& get_invalid_value() {
586
- static TT value = std::numeric_limits<TT>::quiet_NaN();
587
- return value;
571
+ static inline bool check_update_item(TT item) {
572
+ return !std::isnan(item);
588
573
  }
589
574
 
590
- template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
591
- static inline bool check_update_value(TT value) {
592
- return !std::isnan(value);
593
- }
594
-
595
- template<typename TT = T, typename std::enable_if<std::is_floating_point<TT>::value, int>::type = 0>
596
- static inline void check_split_points(const T* values, uint32_t size) {
597
- for (uint32_t i = 0; i < size ; i++) {
598
- if (std::isnan(values[i])) {
599
- throw std::invalid_argument("Values must not be NaN");
600
- }
601
- if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
602
- throw std::invalid_argument("Values must be unique and monotonically increasing");
603
- }
604
- }
605
- }
606
-
607
- // implementations for all other types
608
575
  template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
609
- static const TT& get_invalid_value() {
610
- throw std::runtime_error("getting quantiles from empty sketch is not supported for this type of values");
611
- }
612
-
613
- template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
614
- static inline bool check_update_value(TT) {
576
+ static inline bool check_update_item(TT) {
615
577
  return true;
616
578
  }
617
-
618
- template<typename TT = T, typename std::enable_if<!std::is_floating_point<TT>::value, int>::type = 0>
619
- static inline void check_split_points(const T* values, uint32_t size) {
620
- for (uint32_t i = 0; i < size ; i++) {
621
- if ((i < (size - 1)) && !(Comparator()(values[i], values[i + 1]))) {
622
- throw std::invalid_argument("Values must be unique and monotonically increasing");
623
- }
624
- }
625
- }
626
579
  };
627
580
 
628
581
 
629
582
  template<typename T, typename C, typename A>
630
583
  class quantiles_sketch<T, C, A>::const_iterator: public std::iterator<std::input_iterator_tag, T> {
631
584
  public:
585
+ using value_type = std::pair<const T&, const uint64_t>;
632
586
  const_iterator& operator++();
633
587
  const_iterator& operator++(int);
634
588
  bool operator==(const const_iterator& other) const;
635
589
  bool operator!=(const const_iterator& other) const;
636
- std::pair<const T&, const uint64_t> operator*() const;
590
+ const value_type operator*() const;
591
+ const return_value_holder<value_type> operator->() const;
637
592
  private:
638
593
  friend class quantiles_sketch<T, C, A>;
639
594
  using Level = std::vector<T, A>;