datasketches 0.2.7 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/MANIFEST.in +21 -2
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/include/version.hpp.in +36 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +25 -6
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +16 -9
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +54 -41
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +3 -3
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +196 -232
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +41 -31
- data/vendor/datasketches-cpp/pyproject.toml +17 -12
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +18 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +19 -18
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +240 -232
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +26 -26
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +116 -103
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +6 -6
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/setup.py +14 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +2 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +16 -4
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +26 -0
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -5
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
|
@@ -51,39 +51,17 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
|
51
51
|
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
-
template<typename T>
|
|
55
|
-
double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
|
|
56
|
-
if (inclusive)
|
|
57
|
-
return sk.template get_rank<true>(item);
|
|
58
|
-
else
|
|
59
|
-
return sk.template get_rank<false>(item);
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
template<typename T>
|
|
63
|
-
T kll_sketch_get_quantile(const kll_sketch<T>& sk,
|
|
64
|
-
double rank,
|
|
65
|
-
bool inclusive) {
|
|
66
|
-
if (inclusive)
|
|
67
|
-
return T(sk.template get_quantile<true>(rank));
|
|
68
|
-
else
|
|
69
|
-
return T(sk.template get_quantile<false>(rank));
|
|
70
|
-
}
|
|
71
|
-
|
|
72
54
|
template<typename T>
|
|
73
55
|
py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
|
74
|
-
std::vector<double>&
|
|
56
|
+
std::vector<double>& ranks,
|
|
75
57
|
bool inclusive) {
|
|
76
|
-
size_t nQuantiles =
|
|
77
|
-
auto result = inclusive
|
|
78
|
-
sk.template get_quantiles<true>(fractions.data(), nQuantiles)
|
|
79
|
-
: sk.template get_quantiles<false>(fractions.data(), nQuantiles);
|
|
80
|
-
|
|
58
|
+
size_t nQuantiles = ranks.size();
|
|
59
|
+
auto result = sk.get_quantiles(ranks.data(), nQuantiles, inclusive);
|
|
81
60
|
// returning as std::vector<> would copy values to a list anyway
|
|
82
61
|
py::list list(nQuantiles);
|
|
83
62
|
for (size_t i = 0; i < nQuantiles; ++i) {
|
|
84
63
|
list[i] = result[i];
|
|
85
64
|
}
|
|
86
|
-
|
|
87
65
|
return list;
|
|
88
66
|
}
|
|
89
67
|
|
|
@@ -92,15 +70,11 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
|
92
70
|
std::vector<T>& split_points,
|
|
93
71
|
bool inclusive) {
|
|
94
72
|
size_t nPoints = split_points.size();
|
|
95
|
-
auto result = inclusive
|
|
96
|
-
sk.template get_PMF<true>(split_points.data(), nPoints)
|
|
97
|
-
: sk.template get_PMF<false>(split_points.data(), nPoints);
|
|
98
|
-
|
|
73
|
+
auto result = sk.get_PMF(split_points.data(), nPoints, inclusive);
|
|
99
74
|
py::list list(nPoints + 1);
|
|
100
75
|
for (size_t i = 0; i <= nPoints; ++i) {
|
|
101
76
|
list[i] = result[i];
|
|
102
77
|
}
|
|
103
|
-
|
|
104
78
|
return list;
|
|
105
79
|
}
|
|
106
80
|
|
|
@@ -109,15 +83,11 @@ py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
|
|
|
109
83
|
std::vector<T>& split_points,
|
|
110
84
|
bool inclusive) {
|
|
111
85
|
size_t nPoints = split_points.size();
|
|
112
|
-
auto result = inclusive
|
|
113
|
-
sk.template get_CDF<true>(split_points.data(), nPoints)
|
|
114
|
-
: sk.template get_CDF<false>(split_points.data(), nPoints);
|
|
115
|
-
|
|
86
|
+
auto result = sk.get_CDF(split_points.data(), nPoints, inclusive);
|
|
116
87
|
py::list list(nPoints + 1);
|
|
117
88
|
for (size_t i = 0; i <= nPoints; ++i) {
|
|
118
89
|
list[i] = result[i];
|
|
119
90
|
}
|
|
120
|
-
|
|
121
91
|
return list;
|
|
122
92
|
}
|
|
123
93
|
|
|
@@ -166,29 +136,23 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
166
136
|
"Returns the number of retained items (samples) in the sketch")
|
|
167
137
|
.def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode,
|
|
168
138
|
"Returns True if the sketch is in estimation mode, otherwise False")
|
|
169
|
-
.def("get_min_value", &kll_sketch<T>::
|
|
170
|
-
"Returns the minimum value from the stream. If empty, kll_floats_sketch
|
|
171
|
-
.def("get_max_value", &kll_sketch<T>::
|
|
172
|
-
"Returns the maximum value from the stream. If empty, kll_floats_sketch
|
|
173
|
-
.def("get_quantile", &
|
|
174
|
-
"Returns an approximation to the
|
|
175
|
-
"
|
|
139
|
+
.def("get_min_value", &kll_sketch<T>::get_min_item,
|
|
140
|
+
"Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
|
|
141
|
+
.def("get_max_value", &kll_sketch<T>::get_max_item,
|
|
142
|
+
"Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
|
|
143
|
+
.def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
|
|
144
|
+
"Returns an approximation to the data value "
|
|
145
|
+
"associated with the given normalized rank in a hypothetical sorted "
|
|
176
146
|
"version of the input stream so far.\n"
|
|
177
|
-
"Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
|
|
178
|
-
"so it should not be called multiple times to get different quantiles from the same "
|
|
179
|
-
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
|
180
147
|
"For kll_floats_sketch: if the sketch is empty this returns nan. "
|
|
181
148
|
"For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
|
182
|
-
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("
|
|
183
|
-
"This is a more efficient multiple-query version of get_quantile().\n"
|
|
149
|
+
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
|
|
184
150
|
"This returns an array that could have been generated by using get_quantile() for each "
|
|
185
|
-
"
|
|
186
|
-
"
|
|
187
|
-
"
|
|
188
|
-
|
|
189
|
-
"
|
|
190
|
-
.def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
|
|
191
|
-
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
|
151
|
+
"normalized rank separately.\n"
|
|
152
|
+
"If the sketch is empty this returns an empty vector.\n"
|
|
153
|
+
"Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
|
|
154
|
+
.def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
|
|
155
|
+
"Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
|
|
192
156
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
|
193
157
|
"get_normalized_rank_error(False) function.\n"
|
|
194
158
|
"With the parameter inclusive=true the weight of the given value is included into the rank."
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <cstring>
|
|
21
|
+
#include "memory_operations.hpp"
|
|
22
|
+
|
|
23
|
+
#include "py_serde.hpp"
|
|
24
|
+
|
|
25
|
+
#include <pybind11/pybind11.h>
|
|
26
|
+
|
|
27
|
+
namespace py = pybind11;
|
|
28
|
+
|
|
29
|
+
void init_serde(py::module& m) {
|
|
30
|
+
py::class_<datasketches::py_object_serde, datasketches::PyObjectSerDe /* <--- trampoline*/>(m, "PyObjectSerDe")
|
|
31
|
+
.def(py::init<>())
|
|
32
|
+
.def("get_size", &datasketches::py_object_serde::get_size, py::arg("item"),
|
|
33
|
+
"Returns the size in bytes of an item")
|
|
34
|
+
.def("to_bytes", &datasketches::py_object_serde::to_bytes, py::arg("item"),
|
|
35
|
+
"Retuns a bytes object with a serialized version of an item")
|
|
36
|
+
.def("from_bytes", &datasketches::py_object_serde::from_bytes, py::arg("data"), py::arg("offset"),
|
|
37
|
+
"Reads a bytes object starting from the given offest and returns a tuple of the reconstructed "
|
|
38
|
+
"object and the number of additional bytes read")
|
|
39
|
+
;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
namespace datasketches {
|
|
43
|
+
size_t py_object_serde::size_of_item(const py::object& item) const {
|
|
44
|
+
return get_size(item);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
size_t py_object_serde::serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const {
|
|
48
|
+
size_t bytes_written = 0;
|
|
49
|
+
py::gil_scoped_acquire acquire;
|
|
50
|
+
for (unsigned i = 0; i < num; ++i) {
|
|
51
|
+
std::string bytes = to_bytes(items[i]); // implicit cast from py::bytes
|
|
52
|
+
check_memory_size(bytes_written + bytes.size(), capacity);
|
|
53
|
+
memcpy(ptr, bytes.c_str(), bytes.size());
|
|
54
|
+
ptr = static_cast<char*>(ptr) + bytes.size();
|
|
55
|
+
bytes_written += bytes.size();
|
|
56
|
+
}
|
|
57
|
+
py::gil_scoped_release release;
|
|
58
|
+
return bytes_written;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
size_t py_object_serde::deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const {
|
|
62
|
+
size_t bytes_read = 0;
|
|
63
|
+
unsigned i = 0;
|
|
64
|
+
bool failure = false;
|
|
65
|
+
bool error_from_python = false;
|
|
66
|
+
py::gil_scoped_acquire acquire;
|
|
67
|
+
|
|
68
|
+
// copy data into bytes only once
|
|
69
|
+
py::bytes bytes(static_cast<const char*>(ptr), capacity);
|
|
70
|
+
for (; i < num && !failure; ++i) {
|
|
71
|
+
py::tuple bytes_and_len;
|
|
72
|
+
try {
|
|
73
|
+
bytes_and_len = from_bytes(bytes, bytes_read);
|
|
74
|
+
} catch (py::error_already_set &e) {
|
|
75
|
+
failure = true;
|
|
76
|
+
error_from_python = true;
|
|
77
|
+
break;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
size_t length = py::cast<size_t>(bytes_and_len[1]);
|
|
81
|
+
if (bytes_read + length > capacity) {
|
|
82
|
+
bytes_read += length; // use this value to report the error
|
|
83
|
+
failure = true;
|
|
84
|
+
break;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
new (&items[i]) py::object(py::cast<py::object>(bytes_and_len[0]));
|
|
88
|
+
ptr = static_cast<const char*>(ptr) + length;
|
|
89
|
+
bytes_read += length;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
if (failure) {
|
|
93
|
+
// clean up what we've allocated
|
|
94
|
+
for (unsigned j = 0; j < i; ++j) {
|
|
95
|
+
items[j].dec_ref();
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
if (error_from_python) {
|
|
99
|
+
throw py::value_error("Error reading value in from_bytes");
|
|
100
|
+
} else {
|
|
101
|
+
// this next call will throw
|
|
102
|
+
check_memory_size(bytes_read, capacity);
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
py::gil_scoped_release release;
|
|
107
|
+
return bytes_read;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
} // namespace datasketches
|
|
@@ -49,41 +49,17 @@ double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
|
49
49
|
return quantiles_sketch<T>::get_normalized_rank_error(k, pmf);
|
|
50
50
|
}
|
|
51
51
|
|
|
52
|
-
template<typename T>
|
|
53
|
-
double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
|
|
54
|
-
const T& item,
|
|
55
|
-
bool inclusive) {
|
|
56
|
-
if (inclusive)
|
|
57
|
-
return sk.template get_rank<true>(item);
|
|
58
|
-
else
|
|
59
|
-
return sk.template get_rank<false>(item);
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
template<typename T>
|
|
63
|
-
T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
|
|
64
|
-
double rank,
|
|
65
|
-
bool inclusive) {
|
|
66
|
-
if (inclusive)
|
|
67
|
-
return T(sk.template get_quantile<true>(rank));
|
|
68
|
-
else
|
|
69
|
-
return T(sk.template get_quantile<false>(rank));
|
|
70
|
-
}
|
|
71
|
-
|
|
72
52
|
template<typename T>
|
|
73
53
|
py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
|
|
74
|
-
std::vector<double>&
|
|
54
|
+
std::vector<double>& ranks,
|
|
75
55
|
bool inclusive) {
|
|
76
|
-
size_t n_quantiles =
|
|
77
|
-
auto result = inclusive
|
|
78
|
-
? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
|
|
79
|
-
: sk.template get_quantiles<false>(&fractions[0], static_cast<uint32_t>(n_quantiles));
|
|
80
|
-
|
|
56
|
+
size_t n_quantiles = ranks.size();
|
|
57
|
+
auto result = sk.get_quantiles(ranks.data(), static_cast<uint32_t>(n_quantiles), inclusive);
|
|
81
58
|
// returning as std::vector<> would copy values to a list anyway
|
|
82
59
|
py::list list(n_quantiles);
|
|
83
60
|
for (size_t i = 0; i < n_quantiles; ++i) {
|
|
84
61
|
list[i] = result[i];
|
|
85
62
|
}
|
|
86
|
-
|
|
87
63
|
return list;
|
|
88
64
|
}
|
|
89
65
|
|
|
@@ -92,15 +68,11 @@ py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
|
|
|
92
68
|
std::vector<T>& split_points,
|
|
93
69
|
bool inclusive) {
|
|
94
70
|
size_t n_points = split_points.size();
|
|
95
|
-
auto result = inclusive
|
|
96
|
-
? sk.template get_PMF<true>(&split_points[0], n_points)
|
|
97
|
-
: sk.template get_PMF<false>(&split_points[0], n_points);
|
|
98
|
-
|
|
71
|
+
auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
|
|
99
72
|
py::list list(n_points + 1);
|
|
100
73
|
for (size_t i = 0; i <= n_points; ++i) {
|
|
101
74
|
list[i] = result[i];
|
|
102
75
|
}
|
|
103
|
-
|
|
104
76
|
return list;
|
|
105
77
|
}
|
|
106
78
|
|
|
@@ -109,15 +81,11 @@ py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
|
|
|
109
81
|
std::vector<T>& split_points,
|
|
110
82
|
bool inclusive) {
|
|
111
83
|
size_t n_points = split_points.size();
|
|
112
|
-
auto result = inclusive
|
|
113
|
-
? sk.template get_CDF<true>(&split_points[0], n_points)
|
|
114
|
-
: sk.template get_CDF<false>(&split_points[0], n_points);
|
|
115
|
-
|
|
84
|
+
auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
|
|
116
85
|
py::list list(n_points + 1);
|
|
117
86
|
for (size_t i = 0; i <= n_points; ++i) {
|
|
118
87
|
list[i] = result[i];
|
|
119
88
|
}
|
|
120
|
-
|
|
121
89
|
return list;
|
|
122
90
|
}
|
|
123
91
|
|
|
@@ -166,31 +134,27 @@ void bind_quantiles_sketch(py::module &m, const char* name) {
|
|
|
166
134
|
"Returns the number of retained items (samples) in the sketch")
|
|
167
135
|
.def("is_estimation_mode", &quantiles_sketch<T>::is_estimation_mode,
|
|
168
136
|
"Returns True if the sketch is in estimation mode, otherwise False")
|
|
169
|
-
.def("get_min_value", &quantiles_sketch<T>::
|
|
137
|
+
.def("get_min_value", &quantiles_sketch<T>::get_min_item,
|
|
170
138
|
"Returns the minimum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
|
|
171
|
-
.def("get_max_value", &quantiles_sketch<T>::
|
|
139
|
+
.def("get_max_value", &quantiles_sketch<T>::get_max_item,
|
|
172
140
|
"Returns the maximum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
|
|
173
|
-
.def("get_quantile", &
|
|
174
|
-
"Returns an approximation to the
|
|
175
|
-
"
|
|
141
|
+
.def("get_quantile", &quantiles_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
|
|
142
|
+
"Returns an approximation to the data value "
|
|
143
|
+
"associated with the given rank in a hypothetical sorted "
|
|
176
144
|
"version of the input stream so far.\n"
|
|
177
|
-
"Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
|
|
178
|
-
"so it should not be called multiple times to get different quantiles from the same "
|
|
179
|
-
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
|
180
145
|
"For quantiles_floats_sketch: if the sketch is empty this returns nan. "
|
|
181
146
|
"For quantiles_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
|
182
147
|
.def("get_quantiles", &dspy::quantiles_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
|
|
183
|
-
"This is a more efficient multiple-query version of get_quantile().\n"
|
|
184
148
|
"This returns an array that could have been generated by using get_quantile() for each "
|
|
185
|
-
"
|
|
186
|
-
"
|
|
187
|
-
"
|
|
188
|
-
|
|
189
|
-
"
|
|
190
|
-
.def("get_rank", &dspy::quantiles_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
|
|
191
|
-
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
|
149
|
+
"normalized rank separately.\n"
|
|
150
|
+
"If the sketch is empty this returns an empty vector.\n"
|
|
151
|
+
"Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
|
|
152
|
+
.def("get_rank", &quantiles_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
|
|
153
|
+
"Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
|
|
192
154
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
|
193
155
|
"get_normalized_rank_error(False) function.\n"
|
|
156
|
+
"With the parameter inclusive=true the weight of the given value is included into the rank."
|
|
157
|
+
"Otherwise the rank equals the sum of the weights of values less than the given value.\n"
|
|
194
158
|
"If the sketch is empty this returns nan.")
|
|
195
159
|
.def("get_pmf", &dspy::quantiles_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
196
160
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
|
@@ -51,41 +51,17 @@ double req_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
|
51
51
|
return req_sketch<T>::get_normalized_rank_error(k, pmf);
|
|
52
52
|
}
|
|
53
53
|
|
|
54
|
-
template<typename T>
|
|
55
|
-
double req_sketch_get_rank(const req_sketch<T>& sk,
|
|
56
|
-
const T& item,
|
|
57
|
-
bool inclusive) {
|
|
58
|
-
if (inclusive)
|
|
59
|
-
return sk.template get_rank<true>(item);
|
|
60
|
-
else
|
|
61
|
-
return sk.template get_rank<false>(item);
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
template<typename T>
|
|
65
|
-
T req_sketch_get_quantile(const req_sketch<T>& sk,
|
|
66
|
-
double rank,
|
|
67
|
-
bool inclusive) {
|
|
68
|
-
if (inclusive)
|
|
69
|
-
return T(sk.template get_quantile<true>(rank));
|
|
70
|
-
else
|
|
71
|
-
return T(sk.template get_quantile<false>(rank));
|
|
72
|
-
}
|
|
73
|
-
|
|
74
54
|
template<typename T>
|
|
75
55
|
py::list req_sketch_get_quantiles(const req_sketch<T>& sk,
|
|
76
|
-
std::vector<double>&
|
|
56
|
+
std::vector<double>& ranks,
|
|
77
57
|
bool inclusive) {
|
|
78
|
-
size_t n_quantiles =
|
|
79
|
-
auto result = inclusive
|
|
80
|
-
? sk.template get_quantiles<true>(&fractions[0], n_quantiles)
|
|
81
|
-
: sk.template get_quantiles<false>(&fractions[0], n_quantiles);
|
|
82
|
-
|
|
58
|
+
size_t n_quantiles = ranks.size();
|
|
59
|
+
auto result = sk.get_quantiles(ranks.data(), n_quantiles, inclusive);
|
|
83
60
|
// returning as std::vector<> would copy values to a list anyway
|
|
84
61
|
py::list list(n_quantiles);
|
|
85
62
|
for (size_t i = 0; i < n_quantiles; ++i) {
|
|
86
63
|
list[i] = result[i];
|
|
87
64
|
}
|
|
88
|
-
|
|
89
65
|
return list;
|
|
90
66
|
}
|
|
91
67
|
|
|
@@ -94,15 +70,11 @@ py::list req_sketch_get_pmf(const req_sketch<T>& sk,
|
|
|
94
70
|
std::vector<T>& split_points,
|
|
95
71
|
bool inclusive) {
|
|
96
72
|
size_t n_points = split_points.size();
|
|
97
|
-
auto result = inclusive
|
|
98
|
-
? sk.template get_PMF<true>(&split_points[0], n_points)
|
|
99
|
-
: sk.template get_PMF<false>(&split_points[0], n_points);
|
|
100
|
-
|
|
73
|
+
auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
|
|
101
74
|
py::list list(n_points + 1);
|
|
102
75
|
for (size_t i = 0; i <= n_points; ++i) {
|
|
103
76
|
list[i] = result[i];
|
|
104
77
|
}
|
|
105
|
-
|
|
106
78
|
return list;
|
|
107
79
|
}
|
|
108
80
|
|
|
@@ -111,15 +83,11 @@ py::list req_sketch_get_cdf(const req_sketch<T>& sk,
|
|
|
111
83
|
std::vector<T>& split_points,
|
|
112
84
|
bool inclusive) {
|
|
113
85
|
size_t n_points = split_points.size();
|
|
114
|
-
auto result = inclusive
|
|
115
|
-
? sk.template get_CDF<true>(&split_points[0], n_points)
|
|
116
|
-
: sk.template get_CDF<false>(&split_points[0], n_points);
|
|
117
|
-
|
|
86
|
+
auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
|
|
118
87
|
py::list list(n_points + 1);
|
|
119
88
|
for (size_t i = 0; i <= n_points; ++i) {
|
|
120
89
|
list[i] = result[i];
|
|
121
90
|
}
|
|
122
|
-
|
|
123
91
|
return list;
|
|
124
92
|
}
|
|
125
93
|
|
|
@@ -170,33 +138,27 @@ void bind_req_sketch(py::module &m, const char* name) {
|
|
|
170
138
|
"Returns the number of retained items (samples) in the sketch")
|
|
171
139
|
.def("is_estimation_mode", &req_sketch<T>::is_estimation_mode,
|
|
172
140
|
"Returns True if the sketch is in estimation mode, otherwise False")
|
|
173
|
-
.def("get_min_value", &req_sketch<T>::
|
|
141
|
+
.def("get_min_value", &req_sketch<T>::get_min_item,
|
|
174
142
|
"Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
|
|
175
|
-
.def("get_max_value", &req_sketch<T>::
|
|
143
|
+
.def("get_max_value", &req_sketch<T>::get_max_item,
|
|
176
144
|
"Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
|
|
177
|
-
.def("get_quantile", &
|
|
178
|
-
"Returns an approximation to the
|
|
179
|
-
"
|
|
145
|
+
.def("get_quantile", &req_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
|
|
146
|
+
"Returns an approximation to the data value "
|
|
147
|
+
"associated with the given normalized rank in a hypothetical sorted "
|
|
180
148
|
"version of the input stream so far.\n"
|
|
181
|
-
"Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
|
|
182
|
-
"so it should not be called multiple times to get different quantiles from the same "
|
|
183
|
-
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
|
184
149
|
"For req_floats_sketch: if the sketch is empty this returns nan. "
|
|
185
150
|
"For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
|
186
151
|
.def("get_quantiles", &dspy::req_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
|
|
187
|
-
"This is a more efficient multiple-query version of get_quantile().\n"
|
|
188
152
|
"This returns an array that could have been generated by using get_quantile() for each "
|
|
189
|
-
"
|
|
190
|
-
"
|
|
191
|
-
"
|
|
192
|
-
|
|
193
|
-
"
|
|
194
|
-
.def("get_rank", &dspy::req_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
|
|
195
|
-
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
|
153
|
+
"normalized rank separately.\n"
|
|
154
|
+
"If the sketch is empty this returns an empty vector.\n"
|
|
155
|
+
"Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
|
|
156
|
+
.def("get_rank", &req_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
|
|
157
|
+
"Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
|
|
196
158
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
|
197
159
|
"get_normalized_rank_error(False) function.\n"
|
|
198
|
-
"With the parameter inclusive=true the weight of the given
|
|
199
|
-
"Otherwise the rank equals the sum of the weights of
|
|
160
|
+
"With the parameter inclusive=true the weight of the given value is included into the rank."
|
|
161
|
+
"Otherwise the rank equals the sum of the weights of values less than the given value.\n"
|
|
200
162
|
"If the sketch is empty this returns nan.")
|
|
201
163
|
.def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
202
164
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|