datasketches 0.2.6 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (121) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +9 -0
  3. data/LICENSE +4 -6
  4. data/NOTICE +6 -5
  5. data/ext/datasketches/kll_wrapper.cpp +20 -20
  6. data/ext/datasketches/theta_wrapper.cpp +2 -2
  7. data/lib/datasketches/version.rb +1 -1
  8. data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
  9. data/vendor/datasketches-cpp/LICENSE +4 -6
  10. data/vendor/datasketches-cpp/MANIFEST.in +21 -4
  11. data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
  12. data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
  13. data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
  14. data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
  15. data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
  16. data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
  17. data/vendor/datasketches-cpp/common/{test/test_runner.cpp → include/version.hpp.in} +15 -8
  18. data/vendor/datasketches-cpp/common/test/CMakeLists.txt +37 -7
  19. data/vendor/datasketches-cpp/common/test/catch_runner.cpp +22 -1
  20. data/vendor/datasketches-cpp/common/test/integration_test.cpp +1 -1
  21. data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
  22. data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
  23. data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +1 -1
  24. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +1 -1
  25. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -1
  26. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
  27. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
  28. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
  29. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
  30. data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +17 -10
  31. data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
  32. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +55 -42
  33. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -1
  34. data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +4 -4
  35. data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
  36. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
  37. data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
  38. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -1
  39. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +1 -1
  40. data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +1 -1
  41. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -1
  42. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +1 -1
  43. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +1 -1
  44. data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +1 -1
  45. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -1
  46. data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +1 -1
  47. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
  48. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
  49. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
  50. data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
  51. data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +27 -27
  52. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +197 -233
  53. data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +42 -32
  54. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  55. data/vendor/datasketches-cpp/pyproject.toml +17 -13
  56. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  57. data/vendor/datasketches-cpp/python/README.md +1 -1
  58. data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
  59. data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
  60. data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
  61. data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
  62. data/vendor/datasketches-cpp/python/pybind11Path.cmd +19 -1
  63. data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
  64. data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
  65. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
  66. data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
  67. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
  68. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
  69. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
  70. data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
  71. data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
  72. data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
  73. data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
  74. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
  75. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
  76. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
  77. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +1 -1
  78. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +20 -19
  79. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +241 -233
  80. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
  81. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
  82. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
  83. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
  84. data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
  85. data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +27 -27
  86. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +117 -104
  87. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
  88. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
  89. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
  91. data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
  92. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +7 -7
  93. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +3 -3
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +5 -5
  95. data/vendor/datasketches-cpp/setup.py +14 -3
  96. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
  97. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
  98. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
  99. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
  100. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +1 -1
  101. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +1 -1
  102. data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +1 -1
  103. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -1
  104. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +3 -2
  105. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +1 -1
  106. data/vendor/datasketches-cpp/tox.ini +26 -0
  107. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
  108. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +41 -35
  109. data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
  110. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
  111. data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
  112. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -1
  113. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -1
  114. data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +1 -1
  115. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
  116. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +27 -1
  117. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -1
  118. data/vendor/datasketches-cpp/version.cfg.in +1 -0
  119. metadata +14 -7
  120. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
  121. data/vendor/datasketches-cpp/common/test/catch.hpp +0 -17618
@@ -51,39 +51,17 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
51
51
  return kll_sketch<T>::get_normalized_rank_error(k, pmf);
52
52
  }
53
53
 
54
- template<typename T>
55
- double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
56
- if (inclusive)
57
- return sk.template get_rank<true>(item);
58
- else
59
- return sk.template get_rank<false>(item);
60
- }
61
-
62
- template<typename T>
63
- T kll_sketch_get_quantile(const kll_sketch<T>& sk,
64
- double rank,
65
- bool inclusive) {
66
- if (inclusive)
67
- return T(sk.template get_quantile<true>(rank));
68
- else
69
- return T(sk.template get_quantile<false>(rank));
70
- }
71
-
72
54
  template<typename T>
73
55
  py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
74
- std::vector<double>& fractions,
56
+ std::vector<double>& ranks,
75
57
  bool inclusive) {
76
- size_t nQuantiles = fractions.size();
77
- auto result = inclusive ?
78
- sk.template get_quantiles<true>(fractions.data(), nQuantiles)
79
- : sk.template get_quantiles<false>(fractions.data(), nQuantiles);
80
-
58
+ size_t nQuantiles = ranks.size();
59
+ auto result = sk.get_quantiles(ranks.data(), nQuantiles, inclusive);
81
60
  // returning as std::vector<> would copy values to a list anyway
82
61
  py::list list(nQuantiles);
83
62
  for (size_t i = 0; i < nQuantiles; ++i) {
84
63
  list[i] = result[i];
85
64
  }
86
-
87
65
  return list;
88
66
  }
89
67
 
@@ -92,15 +70,11 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
92
70
  std::vector<T>& split_points,
93
71
  bool inclusive) {
94
72
  size_t nPoints = split_points.size();
95
- auto result = inclusive ?
96
- sk.template get_PMF<true>(split_points.data(), nPoints)
97
- : sk.template get_PMF<false>(split_points.data(), nPoints);
98
-
73
+ auto result = sk.get_PMF(split_points.data(), nPoints, inclusive);
99
74
  py::list list(nPoints + 1);
100
75
  for (size_t i = 0; i <= nPoints; ++i) {
101
76
  list[i] = result[i];
102
77
  }
103
-
104
78
  return list;
105
79
  }
106
80
 
@@ -109,15 +83,11 @@ py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
109
83
  std::vector<T>& split_points,
110
84
  bool inclusive) {
111
85
  size_t nPoints = split_points.size();
112
- auto result = inclusive ?
113
- sk.template get_CDF<true>(split_points.data(), nPoints)
114
- : sk.template get_CDF<false>(split_points.data(), nPoints);
115
-
86
+ auto result = sk.get_CDF(split_points.data(), nPoints, inclusive);
116
87
  py::list list(nPoints + 1);
117
88
  for (size_t i = 0; i <= nPoints; ++i) {
118
89
  list[i] = result[i];
119
90
  }
120
-
121
91
  return list;
122
92
  }
123
93
 
@@ -166,29 +136,23 @@ void bind_kll_sketch(py::module &m, const char* name) {
166
136
  "Returns the number of retained items (samples) in the sketch")
167
137
  .def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode,
168
138
  "Returns True if the sketch is in estimation mode, otherwise False")
169
- .def("get_min_value", &kll_sketch<T>::get_min_value,
170
- "Returns the minimum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
171
- .def("get_max_value", &kll_sketch<T>::get_max_value,
172
- "Returns the maximum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
173
- .def("get_quantile", &dspy::kll_sketch_get_quantile<T>, py::arg("fraction"), py::arg("inclusive")=false,
174
- "Returns an approximation to the value of the data item "
175
- "that would be preceded by the given fraction of a hypothetical sorted "
139
+ .def("get_min_value", &kll_sketch<T>::get_min_item,
140
+ "Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
141
+ .def("get_max_value", &kll_sketch<T>::get_max_item,
142
+ "Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
143
+ .def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
144
+ "Returns an approximation to the data value "
145
+ "associated with the given normalized rank in a hypothetical sorted "
176
146
  "version of the input stream so far.\n"
177
- "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
178
- "so it should not be called multiple times to get different quantiles from the same "
179
- "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
180
147
  "For kll_floats_sketch: if the sketch is empty this returns nan. "
181
148
  "For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
182
- .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"), py::arg("inclusive")=false,
183
- "This is a more efficient multiple-query version of get_quantile().\n"
149
+ .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
184
150
  "This returns an array that could have been generated by using get_quantile() for each "
185
- "fractional rank separately, but would be very inefficient. "
186
- "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
187
- "a single query. It is strongly recommend that this method be used instead of multiple calls "
188
- "to get_quantile().\n"
189
- "If the sketch is empty this returns an empty vector.")
190
- .def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
191
- "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
151
+ "normalized rank separately.\n"
152
+ "If the sketch is empty this returns an empty vector.\n"
153
+ "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
154
+ .def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
155
+ "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
192
156
  "The resulting approximation has a probabilistic guarantee that can be obtained from the "
193
157
  "get_normalized_rank_error(False) function.\n"
194
158
  "With the parameter inclusive=true the weight of the given value is included into the rank."
@@ -0,0 +1,111 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include <cstring>
21
+ #include "memory_operations.hpp"
22
+
23
+ #include "py_serde.hpp"
24
+
25
+ #include <pybind11/pybind11.h>
26
+
27
+ namespace py = pybind11;
28
+
29
+ void init_serde(py::module& m) {
30
+ py::class_<datasketches::py_object_serde, datasketches::PyObjectSerDe /* <--- trampoline*/>(m, "PyObjectSerDe")
31
+ .def(py::init<>())
32
+ .def("get_size", &datasketches::py_object_serde::get_size, py::arg("item"),
33
+ "Returns the size in bytes of an item")
34
+ .def("to_bytes", &datasketches::py_object_serde::to_bytes, py::arg("item"),
35
+ "Returns a bytes object with a serialized version of an item")
36
+ .def("from_bytes", &datasketches::py_object_serde::from_bytes, py::arg("data"), py::arg("offset"),
37
+ "Reads a bytes object starting from the given offset and returns a tuple of the reconstructed "
38
+ "object and the number of additional bytes read")
39
+ ;
40
+ }
41
+
42
+ namespace datasketches {
43
+ size_t py_object_serde::size_of_item(const py::object& item) const {
44
+ return get_size(item);
45
+ }
46
+
47
+ size_t py_object_serde::serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const {
48
+ size_t bytes_written = 0;
49
+ py::gil_scoped_acquire acquire;
50
+ for (unsigned i = 0; i < num; ++i) {
51
+ std::string bytes = to_bytes(items[i]); // implicit cast from py::bytes
52
+ check_memory_size(bytes_written + bytes.size(), capacity);
53
+ memcpy(ptr, bytes.c_str(), bytes.size());
54
+ ptr = static_cast<char*>(ptr) + bytes.size();
55
+ bytes_written += bytes.size();
56
+ }
57
+ py::gil_scoped_release release;
58
+ return bytes_written;
59
+ }
60
+
61
+ size_t py_object_serde::deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const {
62
+ size_t bytes_read = 0;
63
+ unsigned i = 0;
64
+ bool failure = false;
65
+ bool error_from_python = false;
66
+ py::gil_scoped_acquire acquire;
67
+
68
+ // copy data into bytes only once
69
+ py::bytes bytes(static_cast<const char*>(ptr), capacity);
70
+ for (; i < num && !failure; ++i) {
71
+ py::tuple bytes_and_len;
72
+ try {
73
+ bytes_and_len = from_bytes(bytes, bytes_read);
74
+ } catch (py::error_already_set &e) {
75
+ failure = true;
76
+ error_from_python = true;
77
+ break;
78
+ }
79
+
80
+ size_t length = py::cast<size_t>(bytes_and_len[1]);
81
+ if (bytes_read + length > capacity) {
82
+ bytes_read += length; // use this value to report the error
83
+ failure = true;
84
+ break;
85
+ }
86
+
87
+ new (&items[i]) py::object(py::cast<py::object>(bytes_and_len[0]));
88
+ ptr = static_cast<const char*>(ptr) + length;
89
+ bytes_read += length;
90
+ }
91
+
92
+ if (failure) {
93
+ // clean up what we've allocated
94
+ for (unsigned j = 0; j < i; ++j) {
95
+ items[j].dec_ref();
96
+ }
97
+
98
+ if (error_from_python) {
99
+ throw py::value_error("Error reading value in from_bytes");
100
+ } else {
101
+ // this next call will throw
102
+ check_memory_size(bytes_read, capacity);
103
+ }
104
+ }
105
+
106
+ py::gil_scoped_release release;
107
+ return bytes_read;
108
+ }
109
+
110
+
111
+ } // namespace datasketches
@@ -49,41 +49,17 @@ double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
49
49
  return quantiles_sketch<T>::get_normalized_rank_error(k, pmf);
50
50
  }
51
51
 
52
- template<typename T>
53
- double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
54
- const T& item,
55
- bool inclusive) {
56
- if (inclusive)
57
- return sk.template get_rank<true>(item);
58
- else
59
- return sk.template get_rank<false>(item);
60
- }
61
-
62
- template<typename T>
63
- T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
64
- double rank,
65
- bool inclusive) {
66
- if (inclusive)
67
- return T(sk.template get_quantile<true>(rank));
68
- else
69
- return T(sk.template get_quantile<false>(rank));
70
- }
71
-
72
52
  template<typename T>
73
53
  py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
74
- std::vector<double>& fractions,
54
+ std::vector<double>& ranks,
75
55
  bool inclusive) {
76
- size_t n_quantiles = fractions.size();
77
- auto result = inclusive
78
- ? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
79
- : sk.template get_quantiles<false>(&fractions[0], static_cast<uint32_t>(n_quantiles));
80
-
56
+ size_t n_quantiles = ranks.size();
57
+ auto result = sk.get_quantiles(ranks.data(), static_cast<uint32_t>(n_quantiles), inclusive);
81
58
  // returning as std::vector<> would copy values to a list anyway
82
59
  py::list list(n_quantiles);
83
60
  for (size_t i = 0; i < n_quantiles; ++i) {
84
61
  list[i] = result[i];
85
62
  }
86
-
87
63
  return list;
88
64
  }
89
65
 
@@ -92,15 +68,11 @@ py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
92
68
  std::vector<T>& split_points,
93
69
  bool inclusive) {
94
70
  size_t n_points = split_points.size();
95
- auto result = inclusive
96
- ? sk.template get_PMF<true>(&split_points[0], n_points)
97
- : sk.template get_PMF<false>(&split_points[0], n_points);
98
-
71
+ auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
99
72
  py::list list(n_points + 1);
100
73
  for (size_t i = 0; i <= n_points; ++i) {
101
74
  list[i] = result[i];
102
75
  }
103
-
104
76
  return list;
105
77
  }
106
78
 
@@ -109,15 +81,11 @@ py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
109
81
  std::vector<T>& split_points,
110
82
  bool inclusive) {
111
83
  size_t n_points = split_points.size();
112
- auto result = inclusive
113
- ? sk.template get_CDF<true>(&split_points[0], n_points)
114
- : sk.template get_CDF<false>(&split_points[0], n_points);
115
-
84
+ auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
116
85
  py::list list(n_points + 1);
117
86
  for (size_t i = 0; i <= n_points; ++i) {
118
87
  list[i] = result[i];
119
88
  }
120
-
121
89
  return list;
122
90
  }
123
91
 
@@ -166,31 +134,27 @@ void bind_quantiles_sketch(py::module &m, const char* name) {
166
134
  "Returns the number of retained items (samples) in the sketch")
167
135
  .def("is_estimation_mode", &quantiles_sketch<T>::is_estimation_mode,
168
136
  "Returns True if the sketch is in estimation mode, otherwise False")
169
- .def("get_min_value", &quantiles_sketch<T>::get_min_value,
137
+ .def("get_min_value", &quantiles_sketch<T>::get_min_item,
170
138
  "Returns the minimum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
171
- .def("get_max_value", &quantiles_sketch<T>::get_max_value,
139
+ .def("get_max_value", &quantiles_sketch<T>::get_max_item,
172
140
  "Returns the maximum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
173
- .def("get_quantile", &dspy::quantiles_sketch_get_quantile<T>, py::arg("rank"), py::arg("inclusive")=false,
174
- "Returns an approximation to the value of the data item "
175
- "that would be preceded by the given fraction of a hypothetical sorted "
141
+ .def("get_quantile", &quantiles_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
142
+ "Returns an approximation to the data value "
143
+ "associated with the given rank in a hypothetical sorted "
176
144
  "version of the input stream so far.\n"
177
- "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
178
- "so it should not be called multiple times to get different quantiles from the same "
179
- "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
180
145
  "For quantiles_floats_sketch: if the sketch is empty this returns nan. "
181
146
  "For quantiles_ints_sketch: if the sketch is empty this throws a RuntimeError.")
182
147
  .def("get_quantiles", &dspy::quantiles_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
183
- "This is a more efficient multiple-query version of get_quantile().\n"
184
148
  "This returns an array that could have been generated by using get_quantile() for each "
185
- "fractional rank separately, but would be very inefficient. "
186
- "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
187
- "a single query. It is strongly recommend that this method be used instead of multiple calls "
188
- "to get_quantile().\n"
189
- "If the sketch is empty this returns an empty vector.")
190
- .def("get_rank", &dspy::quantiles_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
191
- "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
149
+ "normalized rank separately.\n"
150
+ "If the sketch is empty this returns an empty vector.\n"
151
+ "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
152
+ .def("get_rank", &quantiles_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
153
+ "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
192
154
  "The resulting approximation has a probabilistic guarantee that can be obtained from the "
193
155
  "get_normalized_rank_error(False) function.\n"
156
+ "With the parameter inclusive=true the weight of the given value is included into the rank. "
157
+ "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
194
158
  "If the sketch is empty this returns nan.")
195
159
  .def("get_pmf", &dspy::quantiles_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
196
160
  "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
@@ -51,41 +51,17 @@ double req_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
51
51
  return req_sketch<T>::get_normalized_rank_error(k, pmf);
52
52
  }
53
53
 
54
- template<typename T>
55
- double req_sketch_get_rank(const req_sketch<T>& sk,
56
- const T& item,
57
- bool inclusive) {
58
- if (inclusive)
59
- return sk.template get_rank<true>(item);
60
- else
61
- return sk.template get_rank<false>(item);
62
- }
63
-
64
- template<typename T>
65
- T req_sketch_get_quantile(const req_sketch<T>& sk,
66
- double rank,
67
- bool inclusive) {
68
- if (inclusive)
69
- return T(sk.template get_quantile<true>(rank));
70
- else
71
- return T(sk.template get_quantile<false>(rank));
72
- }
73
-
74
54
  template<typename T>
75
55
  py::list req_sketch_get_quantiles(const req_sketch<T>& sk,
76
- std::vector<double>& fractions,
56
+ std::vector<double>& ranks,
77
57
  bool inclusive) {
78
- size_t n_quantiles = fractions.size();
79
- auto result = inclusive
80
- ? sk.template get_quantiles<true>(&fractions[0], n_quantiles)
81
- : sk.template get_quantiles<false>(&fractions[0], n_quantiles);
82
-
58
+ size_t n_quantiles = ranks.size();
59
+ auto result = sk.get_quantiles(ranks.data(), n_quantiles, inclusive);
83
60
  // returning as std::vector<> would copy values to a list anyway
84
61
  py::list list(n_quantiles);
85
62
  for (size_t i = 0; i < n_quantiles; ++i) {
86
63
  list[i] = result[i];
87
64
  }
88
-
89
65
  return list;
90
66
  }
91
67
 
@@ -94,15 +70,11 @@ py::list req_sketch_get_pmf(const req_sketch<T>& sk,
94
70
  std::vector<T>& split_points,
95
71
  bool inclusive) {
96
72
  size_t n_points = split_points.size();
97
- auto result = inclusive
98
- ? sk.template get_PMF<true>(&split_points[0], n_points)
99
- : sk.template get_PMF<false>(&split_points[0], n_points);
100
-
73
+ auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
101
74
  py::list list(n_points + 1);
102
75
  for (size_t i = 0; i <= n_points; ++i) {
103
76
  list[i] = result[i];
104
77
  }
105
-
106
78
  return list;
107
79
  }
108
80
 
@@ -111,15 +83,11 @@ py::list req_sketch_get_cdf(const req_sketch<T>& sk,
111
83
  std::vector<T>& split_points,
112
84
  bool inclusive) {
113
85
  size_t n_points = split_points.size();
114
- auto result = inclusive
115
- ? sk.template get_CDF<true>(&split_points[0], n_points)
116
- : sk.template get_CDF<false>(&split_points[0], n_points);
117
-
86
+ auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
118
87
  py::list list(n_points + 1);
119
88
  for (size_t i = 0; i <= n_points; ++i) {
120
89
  list[i] = result[i];
121
90
  }
122
-
123
91
  return list;
124
92
  }
125
93
 
@@ -170,33 +138,27 @@ void bind_req_sketch(py::module &m, const char* name) {
170
138
  "Returns the number of retained items (samples) in the sketch")
171
139
  .def("is_estimation_mode", &req_sketch<T>::is_estimation_mode,
172
140
  "Returns True if the sketch is in estimation mode, otherwise False")
173
- .def("get_min_value", &req_sketch<T>::get_min_value,
141
+ .def("get_min_value", &req_sketch<T>::get_min_item,
174
142
  "Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
175
- .def("get_max_value", &req_sketch<T>::get_max_value,
143
+ .def("get_max_value", &req_sketch<T>::get_max_item,
176
144
  "Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
177
- .def("get_quantile", &dspy::req_sketch_get_quantile<T>, py::arg("rank"), py::arg("inclusive")=false,
178
- "Returns an approximation to the value of the data item "
179
- "that would be preceded by the given fraction of a hypothetical sorted "
145
+ .def("get_quantile", &req_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
146
+ "Returns an approximation to the data value "
147
+ "associated with the given normalized rank in a hypothetical sorted "
180
148
  "version of the input stream so far.\n"
181
- "Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
182
- "so it should not be called multiple times to get different quantiles from the same "
183
- "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
184
149
  "For req_floats_sketch: if the sketch is empty this returns nan. "
185
150
  "For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
186
151
  .def("get_quantiles", &dspy::req_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
187
- "This is a more efficient multiple-query version of get_quantile().\n"
188
152
  "This returns an array that could have been generated by using get_quantile() for each "
189
- "fractional rank separately, but would be very inefficient. "
190
- "This method incurs the internal set-up overhead once and obtains multiple quantile values in "
191
- "a single query. It is strongly recommend that this method be used instead of multiple calls "
192
- "to get_quantile().\n"
193
- "If the sketch is empty this returns an empty vector.")
194
- .def("get_rank", &dspy::req_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
195
- "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
153
+ "normalized rank separately.\n"
154
+ "If the sketch is empty this returns an empty vector.\n"
155
+ "Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
156
+ .def("get_rank", &req_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
157
+ "Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
196
158
  "The resulting approximation has a probabilistic guarantee that can be obtained from the "
197
159
  "get_normalized_rank_error(False) function.\n"
198
- "With the parameter inclusive=true the weight of the given item is included into the rank."
199
- "Otherwise the rank equals the sum of the weights of items less than the given item.\n"
160
+ "With the parameter inclusive=true the weight of the given value is included into the rank. "
161
+ "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
200
162
  "If the sketch is empty this returns nan.")
201
163
  .def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
202
164
  "Returns an approximation to the Probability Mass Function (PMF) of the input stream "