datasketches 0.2.7 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/ext/datasketches/kll_wrapper.cpp +20 -20
  4. data/ext/datasketches/theta_wrapper.cpp +2 -2
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  7. data/vendor/datasketches-cpp/MANIFEST.in +21 -2
  8. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  10. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  11. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  12. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  13. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  14. data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
  15. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
  16. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  17. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  18. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  19. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  20. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  21. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
  22. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
  24. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
  25. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  26. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  27. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  28. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  29. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  30. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  31. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
  32. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
  33. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
  34. data/vendor/datasketches-cpp/pyproject.toml +17 -12
  35. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  36. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  37. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  38. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  39. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  40. data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
  41. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  42. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  43. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  44. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  45. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  46. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  47. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  48. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  49. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  50. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  51. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  52. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  53. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  54. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  55. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
  56. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
  57. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  58. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  59. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  60. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  61. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  62. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
  63. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
  64. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  65. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  66. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  67. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  68. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  69. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
  70. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
  71. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
  72. data/vendor/datasketches-cpp/setup.py +14 -2
  73. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  74. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  75. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  76. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
  78. data/vendor/datasketches-cpp/tox.ini +26 -0
  79. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  80. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
  81. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  82. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  83. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
  84. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  85. metadata +14 -5
  86. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
@@ -51,39 +51,17 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
51
51
  return kll_sketch<T>::get_normalized_rank_error(k, pmf);
52
52
  }
53
53
 
54
- template<typename T>
55
- double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
56
- if (inclusive)
57
- return sk.template get_rank<true>(item);
58
- else
59
- return sk.template get_rank<false>(item);
60
- }
61
-
62
- template<typename T>
63
- T kll_sketch_get_quantile(const kll_sketch<T>& sk,
64
- double rank,
65
- bool inclusive) {
66
- if (inclusive)
67
- return T(sk.template get_quantile<true>(rank));
68
- else
69
- return T(sk.template get_quantile<false>(rank));
70
- }
71
-
72
54
  template<typename T>
73
55
  py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
74
- std::vector<double>& fractions,
56
+ std::vector<double>& ranks,
75
57
  bool inclusive) {
76
- size_t nQuantiles = fractions.size();
77
- auto result = inclusive ?
78
- sk.template get_quantiles<true>(fractions.data(), nQuantiles)
79
- : sk.template get_quantiles<false>(fractions.data(), nQuantiles);
80
-
58
+ size_t nQuantiles = ranks.size();
59
+ auto result = sk.get_quantiles(ranks.data(), nQuantiles, inclusive);
81
60
  // returning as std::vector<> would copy values to a list anyway
82
61
  py::list list(nQuantiles);
83
62
  for (size_t i = 0; i < nQuantiles; ++i) {
84
63
  list[i] = result[i];
85
64
  }
86
-
87
65
  return list;
88
66
  }
89
67
 
@@ -92,15 +70,11 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
92
70
  std::vector<T>& split_points,
93
71
  bool inclusive) {
94
72
  size_t nPoints = split_points.size();
95
- auto result = inclusive ?
96
- sk.template get_PMF<true>(split_points.data(), nPoints)
97
- : sk.template get_PMF<false>(split_points.data(), nPoints);
98
-
73
+ auto result = sk.get_PMF(split_points.data(), nPoints, inclusive);
99
74
  py::list list(nPoints + 1);
100
75
  for (size_t i = 0; i <= nPoints; ++i) {
101
76
  list[i] = result[i];
102
77
  }
103
-
104
78
  return list;
105
79
  }
106
80
 
@@ -109,15 +83,11 @@ py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
109
83
  std::vector<T>& split_points,
110
84
  bool inclusive) {
111
85
  size_t nPoints = split_points.size();
112
- auto result = inclusive ?
113
- sk.template get_CDF<true>(split_points.data(), nPoints)
114
- : sk.template get_CDF<false>(split_points.data(), nPoints);
115
-
86
+ auto result = sk.get_CDF(split_points.data(), nPoints, inclusive);
116
87
  py::list list(nPoints + 1);
117
88
  for (size_t i = 0; i <= nPoints; ++i) {
118
89
  list[i] = result[i];
119
90
  }
120
-
121
91
  return list;
122
92
  }
123
93
 
@@ -166,29 +136,23 @@ void bind_kll_sketch(py::module &m, const char* name) {
166
136
  "Returns the number of retained items (samples) in the sketch")
167
137
  .def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode,
168
138
  "Returns True if the sketch is in estimation mode, otherwise False")
169
- .def("get_min_value", &kll_sketch<T>::get_min_value,
170
- "Returns the minimum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
171
- .def("get_max_value", &kll_sketch<T>::get_max_value,
172
- "Returns the maximum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
173
- .def("get_quantile", &dspy::kll_sketch_get_quantile<T>, py::arg("fraction"), py::arg("inclusive")=false,
174
- "Returns an approximation to the value of the data item "
175
- "that would be preceded by the given fraction of a hypothetical sorted "
139
+ .def("get_min_value", &kll_sketch<T>::get_min_item,
140
+ "Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
141
+ .def("get_max_value", &kll_sketch<T>::get_max_item,
142
+ "Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
143
+ .def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
144
+ "Returns an approximation to the data value "
145
+ "associated with the given normalized rank in a hypothetical sorted "
176
146
  "version of the input stream so far.\n"
177
- "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
178
- "so it should not be called multiple times to get different quantiles from the same "
179
- "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
180
147
  "For kll_floats_sketch: if the sketch is empty this returns nan. "
181
148
  "For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
182
- .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"), py::arg("inclusive")=false,
183
- "This is a more efficient multiple-query version of get_quantile().\n"
149
+ .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
184
150
  "This returns an array that could have been generated by using get_quantile() for each "
185
- "fractional rank separately, but would be very inefficient. "
186
- "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
187
- "a single query. It is strongly recommend that this method be used instead of multiple calls "
188
- "to get_quantile().\n"
189
- "If the sketch is empty this returns an empty vector.")
190
- .def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
191
- "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
151
+ "normalized rank separately.\n"
152
+ "If the sketch is empty this returns an empty vector.\n"
153
+ "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
154
+ .def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
155
+ "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
192
156
  "The resulting approximation has a probabilistic guarantee that can be obtained from the "
193
157
  "get_normalized_rank_error(False) function.\n"
194
158
  "With the parameter inclusive=true the weight of the given value is included into the rank."
@@ -0,0 +1,111 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <cstring>
21
+ #include "memory_operations.hpp"
22
+
23
+ #include "py_serde.hpp"
24
+
25
+ #include <pybind11/pybind11.h>
26
+
27
+ namespace py = pybind11;
28
+
29
+ void init_serde(py::module& m) {
30
+ py::class_<datasketches::py_object_serde, datasketches::PyObjectSerDe /* <--- trampoline*/>(m, "PyObjectSerDe")
31
+ .def(py::init<>())
32
+ .def("get_size", &datasketches::py_object_serde::get_size, py::arg("item"),
33
+ "Returns the size in bytes of an item")
34
+ .def("to_bytes", &datasketches::py_object_serde::to_bytes, py::arg("item"),
35
+ "Retuns a bytes object with a serialized version of an item")
36
+ .def("from_bytes", &datasketches::py_object_serde::from_bytes, py::arg("data"), py::arg("offset"),
37
+ "Reads a bytes object starting from the given offest and returns a tuple of the reconstructed "
38
+ "object and the number of additional bytes read")
39
+ ;
40
+ }
41
+
42
+ namespace datasketches {
43
+ size_t py_object_serde::size_of_item(const py::object& item) const {
44
+ return get_size(item);
45
+ }
46
+
47
+ size_t py_object_serde::serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const {
48
+ size_t bytes_written = 0;
49
+ py::gil_scoped_acquire acquire;
50
+ for (unsigned i = 0; i < num; ++i) {
51
+ std::string bytes = to_bytes(items[i]); // implicit cast from py::bytes
52
+ check_memory_size(bytes_written + bytes.size(), capacity);
53
+ memcpy(ptr, bytes.c_str(), bytes.size());
54
+ ptr = static_cast<char*>(ptr) + bytes.size();
55
+ bytes_written += bytes.size();
56
+ }
57
+ py::gil_scoped_release release;
58
+ return bytes_written;
59
+ }
60
+
61
+ size_t py_object_serde::deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const {
62
+ size_t bytes_read = 0;
63
+ unsigned i = 0;
64
+ bool failure = false;
65
+ bool error_from_python = false;
66
+ py::gil_scoped_acquire acquire;
67
+
68
+ // copy data into bytes only once
69
+ py::bytes bytes(static_cast<const char*>(ptr), capacity);
70
+ for (; i < num && !failure; ++i) {
71
+ py::tuple bytes_and_len;
72
+ try {
73
+ bytes_and_len = from_bytes(bytes, bytes_read);
74
+ } catch (py::error_already_set &e) {
75
+ failure = true;
76
+ error_from_python = true;
77
+ break;
78
+ }
79
+
80
+ size_t length = py::cast<size_t>(bytes_and_len[1]);
81
+ if (bytes_read + length > capacity) {
82
+ bytes_read += length; // use this value to report the error
83
+ failure = true;
84
+ break;
85
+ }
86
+
87
+ new (&items[i]) py::object(py::cast<py::object>(bytes_and_len[0]));
88
+ ptr = static_cast<const char*>(ptr) + length;
89
+ bytes_read += length;
90
+ }
91
+
92
+ if (failure) {
93
+ // clean up what we've allocated
94
+ for (unsigned j = 0; j < i; ++j) {
95
+ items[j].dec_ref();
96
+ }
97
+
98
+ if (error_from_python) {
99
+ throw py::value_error("Error reading value in from_bytes");
100
+ } else {
101
+ // this next call will throw
102
+ check_memory_size(bytes_read, capacity);
103
+ }
104
+ }
105
+
106
+ py::gil_scoped_release release;
107
+ return bytes_read;
108
+ }
109
+
110
+
111
+ } // namespace datasketches
@@ -49,41 +49,17 @@ double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
49
49
  return quantiles_sketch<T>::get_normalized_rank_error(k, pmf);
50
50
  }
51
51
 
52
- template<typename T>
53
- double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
54
- const T& item,
55
- bool inclusive) {
56
- if (inclusive)
57
- return sk.template get_rank<true>(item);
58
- else
59
- return sk.template get_rank<false>(item);
60
- }
61
-
62
- template<typename T>
63
- T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
64
- double rank,
65
- bool inclusive) {
66
- if (inclusive)
67
- return T(sk.template get_quantile<true>(rank));
68
- else
69
- return T(sk.template get_quantile<false>(rank));
70
- }
71
-
72
52
  template<typename T>
73
53
  py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
74
- std::vector<double>& fractions,
54
+ std::vector<double>& ranks,
75
55
  bool inclusive) {
76
- size_t n_quantiles = fractions.size();
77
- auto result = inclusive
78
- ? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
79
- : sk.template get_quantiles<false>(&fractions[0], static_cast<uint32_t>(n_quantiles));
80
-
56
+ size_t n_quantiles = ranks.size();
57
+ auto result = sk.get_quantiles(ranks.data(), static_cast<uint32_t>(n_quantiles), inclusive);
81
58
  // returning as std::vector<> would copy values to a list anyway
82
59
  py::list list(n_quantiles);
83
60
  for (size_t i = 0; i < n_quantiles; ++i) {
84
61
  list[i] = result[i];
85
62
  }
86
-
87
63
  return list;
88
64
  }
89
65
 
@@ -92,15 +68,11 @@ py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
92
68
  std::vector<T>& split_points,
93
69
  bool inclusive) {
94
70
  size_t n_points = split_points.size();
95
- auto result = inclusive
96
- ? sk.template get_PMF<true>(&split_points[0], n_points)
97
- : sk.template get_PMF<false>(&split_points[0], n_points);
98
-
71
+ auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
99
72
  py::list list(n_points + 1);
100
73
  for (size_t i = 0; i <= n_points; ++i) {
101
74
  list[i] = result[i];
102
75
  }
103
-
104
76
  return list;
105
77
  }
106
78
 
@@ -109,15 +81,11 @@ py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
109
81
  std::vector<T>& split_points,
110
82
  bool inclusive) {
111
83
  size_t n_points = split_points.size();
112
- auto result = inclusive
113
- ? sk.template get_CDF<true>(&split_points[0], n_points)
114
- : sk.template get_CDF<false>(&split_points[0], n_points);
115
-
84
+ auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
116
85
  py::list list(n_points + 1);
117
86
  for (size_t i = 0; i <= n_points; ++i) {
118
87
  list[i] = result[i];
119
88
  }
120
-
121
89
  return list;
122
90
  }
123
91
 
@@ -166,31 +134,27 @@ void bind_quantiles_sketch(py::module &m, const char* name) {
166
134
  "Returns the number of retained items (samples) in the sketch")
167
135
  .def("is_estimation_mode", &quantiles_sketch<T>::is_estimation_mode,
168
136
  "Returns True if the sketch is in estimation mode, otherwise False")
169
- .def("get_min_value", &quantiles_sketch<T>::get_min_value,
137
+ .def("get_min_value", &quantiles_sketch<T>::get_min_item,
170
138
  "Returns the minimum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
171
- .def("get_max_value", &quantiles_sketch<T>::get_max_value,
139
+ .def("get_max_value", &quantiles_sketch<T>::get_max_item,
172
140
  "Returns the maximum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
173
- .def("get_quantile", &dspy::quantiles_sketch_get_quantile<T>, py::arg("rank"), py::arg("inclusive")=false,
174
- "Returns an approximation to the value of the data item "
175
- "that would be preceded by the given fraction of a hypothetical sorted "
141
+ .def("get_quantile", &quantiles_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
142
+ "Returns an approximation to the data value "
143
+ "associated with the given rank in a hypothetical sorted "
176
144
  "version of the input stream so far.\n"
177
- "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
178
- "so it should not be called multiple times to get different quantiles from the same "
179
- "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
180
145
  "For quantiles_floats_sketch: if the sketch is empty this returns nan. "
181
146
  "For quantiles_ints_sketch: if the sketch is empty this throws a RuntimeError.")
182
147
  .def("get_quantiles", &dspy::quantiles_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
183
- "This is a more efficient multiple-query version of get_quantile().\n"
184
148
  "This returns an array that could have been generated by using get_quantile() for each "
185
- "fractional rank separately, but would be very inefficient. "
186
- "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
187
- "a single query. It is strongly recommend that this method be used instead of multiple calls "
188
- "to get_quantile().\n"
189
- "If the sketch is empty this returns an empty vector.")
190
- .def("get_rank", &dspy::quantiles_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
191
- "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
149
+ "normalized rank separately.\n"
150
+ "If the sketch is empty this returns an empty vector.\n"
151
+ "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
152
+ .def("get_rank", &quantiles_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
153
+ "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
192
154
  "The resulting approximation has a probabilistic guarantee that can be obtained from the "
193
155
  "get_normalized_rank_error(False) function.\n"
156
+ "With the parameter inclusive=true the weight of the given value is included into the rank."
157
+ "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
194
158
  "If the sketch is empty this returns nan.")
195
159
  .def("get_pmf", &dspy::quantiles_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
196
160
  "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
@@ -51,41 +51,17 @@ double req_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
51
51
  return req_sketch<T>::get_normalized_rank_error(k, pmf);
52
52
  }
53
53
 
54
- template<typename T>
55
- double req_sketch_get_rank(const req_sketch<T>& sk,
56
- const T& item,
57
- bool inclusive) {
58
- if (inclusive)
59
- return sk.template get_rank<true>(item);
60
- else
61
- return sk.template get_rank<false>(item);
62
- }
63
-
64
- template<typename T>
65
- T req_sketch_get_quantile(const req_sketch<T>& sk,
66
- double rank,
67
- bool inclusive) {
68
- if (inclusive)
69
- return T(sk.template get_quantile<true>(rank));
70
- else
71
- return T(sk.template get_quantile<false>(rank));
72
- }
73
-
74
54
  template<typename T>
75
55
  py::list req_sketch_get_quantiles(const req_sketch<T>& sk,
76
- std::vector<double>& fractions,
56
+ std::vector<double>& ranks,
77
57
  bool inclusive) {
78
- size_t n_quantiles = fractions.size();
79
- auto result = inclusive
80
- ? sk.template get_quantiles<true>(&fractions[0], n_quantiles)
81
- : sk.template get_quantiles<false>(&fractions[0], n_quantiles);
82
-
58
+ size_t n_quantiles = ranks.size();
59
+ auto result = sk.get_quantiles(ranks.data(), n_quantiles, inclusive);
83
60
  // returning as std::vector<> would copy values to a list anyway
84
61
  py::list list(n_quantiles);
85
62
  for (size_t i = 0; i < n_quantiles; ++i) {
86
63
  list[i] = result[i];
87
64
  }
88
-
89
65
  return list;
90
66
  }
91
67
 
@@ -94,15 +70,11 @@ py::list req_sketch_get_pmf(const req_sketch<T>& sk,
94
70
  std::vector<T>& split_points,
95
71
  bool inclusive) {
96
72
  size_t n_points = split_points.size();
97
- auto result = inclusive
98
- ? sk.template get_PMF<true>(&split_points[0], n_points)
99
- : sk.template get_PMF<false>(&split_points[0], n_points);
100
-
73
+ auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
101
74
  py::list list(n_points + 1);
102
75
  for (size_t i = 0; i <= n_points; ++i) {
103
76
  list[i] = result[i];
104
77
  }
105
-
106
78
  return list;
107
79
  }
108
80
 
@@ -111,15 +83,11 @@ py::list req_sketch_get_cdf(const req_sketch<T>& sk,
111
83
  std::vector<T>& split_points,
112
84
  bool inclusive) {
113
85
  size_t n_points = split_points.size();
114
- auto result = inclusive
115
- ? sk.template get_CDF<true>(&split_points[0], n_points)
116
- : sk.template get_CDF<false>(&split_points[0], n_points);
117
-
86
+ auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
118
87
  py::list list(n_points + 1);
119
88
  for (size_t i = 0; i <= n_points; ++i) {
120
89
  list[i] = result[i];
121
90
  }
122
-
123
91
  return list;
124
92
  }
125
93
 
@@ -170,33 +138,27 @@ void bind_req_sketch(py::module &m, const char* name) {
170
138
  "Returns the number of retained items (samples) in the sketch")
171
139
  .def("is_estimation_mode", &req_sketch<T>::is_estimation_mode,
172
140
  "Returns True if the sketch is in estimation mode, otherwise False")
173
- .def("get_min_value", &req_sketch<T>::get_min_value,
141
+ .def("get_min_value", &req_sketch<T>::get_min_item,
174
142
  "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
175
- .def("get_max_value", &req_sketch<T>::get_max_value,
143
+ .def("get_max_value", &req_sketch<T>::get_max_item,
176
144
  "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
177
- .def("get_quantile", &dspy::req_sketch_get_quantile<T>, py::arg("rank"), py::arg("inclusive")=false,
178
- "Returns an approximation to the value of the data item "
179
- "that would be preceded by the given fraction of a hypothetical sorted "
145
+ .def("get_quantile", &req_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
146
+ "Returns an approximation to the data value "
147
+ "associated with the given normalized rank in a hypothetical sorted "
180
148
  "version of the input stream so far.\n"
181
- "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
182
- "so it should not be called multiple times to get different quantiles from the same "
183
- "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
184
149
  "For req_floats_sketch: if the sketch is empty this returns nan. "
185
150
  "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
186
151
  .def("get_quantiles", &dspy::req_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
187
- "This is a more efficient multiple-query version of get_quantile().\n"
188
152
  "This returns an array that could have been generated by using get_quantile() for each "
189
- "fractional rank separately, but would be very inefficient. "
190
- "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
191
- "a single query. It is strongly recommend that this method be used instead of multiple calls "
192
- "to get_quantile().\n"
193
- "If the sketch is empty this returns an empty vector.")
194
- .def("get_rank", &dspy::req_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
195
- "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
153
+ "normalized rank separately.\n"
154
+ "If the sketch is empty this returns an empty vector.\n"
155
+ "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
156
+ .def("get_rank", &req_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
157
+ "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
196
158
  "The resulting approximation has a probabilistic guarantee that can be obtained from the "
197
159
  "get_normalized_rank_error(False) function.\n"
198
- "With the parameter inclusive=true the weight of the given item is included into the rank."
199
- "Otherwise the rank equals the sum of the weights of items less than the given item.\n"
160
+ "With the parameter inclusive=true the weight of the given value is included into the rank."
161
+ "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
200
162
  "If the sketch is empty this returns nan.")
201
163
  .def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
202
164
  "Returns an approximation to the Probability Mass Function (PMF) of the input stream "