datasketches 0.2.6 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/LICENSE +4 -6
- data/NOTICE +6 -5
- data/ext/datasketches/kll_wrapper.cpp +20 -20
- data/ext/datasketches/theta_wrapper.cpp +2 -2
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +9 -1
- data/vendor/datasketches-cpp/LICENSE +4 -6
- data/vendor/datasketches-cpp/MANIFEST.in +21 -4
- data/vendor/datasketches-cpp/common/CMakeLists.txt +5 -2
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +10 -0
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov_impl.hpp +6 -6
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/{quantile_sketch_sorted_view.hpp → quantiles_sorted_view.hpp} +60 -25
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +125 -0
- data/vendor/datasketches-cpp/common/{test/test_runner.cpp → include/version.hpp.in} +15 -8
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +37 -7
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +22 -1
- data/vendor/datasketches-cpp/common/test/integration_test.cpp +1 -1
- data/vendor/datasketches-cpp/common/test/quantiles_sorted_view_test.cpp +459 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +28 -44
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +70 -78
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +11 -4
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +17 -10
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +55 -42
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +4 -4
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +1 -1
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -32
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +176 -233
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +337 -395
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +27 -27
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +197 -233
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +42 -32
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +17 -13
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +1 -1
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +104 -0
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +22 -0
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +113 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +31 -24
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +19 -1
- data/vendor/datasketches-cpp/python/src/__init__.py +17 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +9 -3
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +18 -54
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +111 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +17 -53
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +17 -55
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +62 -67
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +47 -14
- data/vendor/datasketches-cpp/python/tests/__init__.py +16 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +1 -1
- data/vendor/datasketches-cpp/python/tests/vo_test.py +25 -1
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +135 -180
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +205 -210
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +20 -19
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +241 -233
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +15 -9
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +35 -19
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +126 -147
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +265 -245
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_custom_type_test.cpp +27 -27
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +117 -104
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +22 -46
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +180 -207
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +18 -39
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +75 -85
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +7 -7
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +3 -3
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +5 -5
- data/vendor/datasketches-cpp/setup.py +14 -3
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +15 -25
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +0 -9
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_jaccard_similarity_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +3 -2
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/tox.ini +26 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +36 -12
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +41 -35
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +2 -1
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +299 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +1 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +27 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -0
- metadata +14 -7
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +0 -91
- data/vendor/datasketches-cpp/common/test/catch.hpp +0 -17618
@@ -51,39 +51,17 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
51
51
|
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
52
52
|
}
|
53
53
|
|
54
|
-
template<typename T>
|
55
|
-
double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
|
56
|
-
if (inclusive)
|
57
|
-
return sk.template get_rank<true>(item);
|
58
|
-
else
|
59
|
-
return sk.template get_rank<false>(item);
|
60
|
-
}
|
61
|
-
|
62
|
-
template<typename T>
|
63
|
-
T kll_sketch_get_quantile(const kll_sketch<T>& sk,
|
64
|
-
double rank,
|
65
|
-
bool inclusive) {
|
66
|
-
if (inclusive)
|
67
|
-
return T(sk.template get_quantile<true>(rank));
|
68
|
-
else
|
69
|
-
return T(sk.template get_quantile<false>(rank));
|
70
|
-
}
|
71
|
-
|
72
54
|
template<typename T>
|
73
55
|
py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
74
|
-
std::vector<double>&
|
56
|
+
std::vector<double>& ranks,
|
75
57
|
bool inclusive) {
|
76
|
-
size_t nQuantiles =
|
77
|
-
auto result = inclusive
|
78
|
-
sk.template get_quantiles<true>(fractions.data(), nQuantiles)
|
79
|
-
: sk.template get_quantiles<false>(fractions.data(), nQuantiles);
|
80
|
-
|
58
|
+
size_t nQuantiles = ranks.size();
|
59
|
+
auto result = sk.get_quantiles(ranks.data(), nQuantiles, inclusive);
|
81
60
|
// returning as std::vector<> would copy values to a list anyway
|
82
61
|
py::list list(nQuantiles);
|
83
62
|
for (size_t i = 0; i < nQuantiles; ++i) {
|
84
63
|
list[i] = result[i];
|
85
64
|
}
|
86
|
-
|
87
65
|
return list;
|
88
66
|
}
|
89
67
|
|
@@ -92,15 +70,11 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
92
70
|
std::vector<T>& split_points,
|
93
71
|
bool inclusive) {
|
94
72
|
size_t nPoints = split_points.size();
|
95
|
-
auto result = inclusive
|
96
|
-
sk.template get_PMF<true>(split_points.data(), nPoints)
|
97
|
-
: sk.template get_PMF<false>(split_points.data(), nPoints);
|
98
|
-
|
73
|
+
auto result = sk.get_PMF(split_points.data(), nPoints, inclusive);
|
99
74
|
py::list list(nPoints + 1);
|
100
75
|
for (size_t i = 0; i <= nPoints; ++i) {
|
101
76
|
list[i] = result[i];
|
102
77
|
}
|
103
|
-
|
104
78
|
return list;
|
105
79
|
}
|
106
80
|
|
@@ -109,15 +83,11 @@ py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
|
|
109
83
|
std::vector<T>& split_points,
|
110
84
|
bool inclusive) {
|
111
85
|
size_t nPoints = split_points.size();
|
112
|
-
auto result = inclusive
|
113
|
-
sk.template get_CDF<true>(split_points.data(), nPoints)
|
114
|
-
: sk.template get_CDF<false>(split_points.data(), nPoints);
|
115
|
-
|
86
|
+
auto result = sk.get_CDF(split_points.data(), nPoints, inclusive);
|
116
87
|
py::list list(nPoints + 1);
|
117
88
|
for (size_t i = 0; i <= nPoints; ++i) {
|
118
89
|
list[i] = result[i];
|
119
90
|
}
|
120
|
-
|
121
91
|
return list;
|
122
92
|
}
|
123
93
|
|
@@ -166,29 +136,23 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
166
136
|
"Returns the number of retained items (samples) in the sketch")
|
167
137
|
.def("is_estimation_mode", &kll_sketch<T>::is_estimation_mode,
|
168
138
|
"Returns True if the sketch is in estimation mode, otherwise False")
|
169
|
-
.def("get_min_value", &kll_sketch<T>::
|
170
|
-
"Returns the minimum value from the stream. If empty, kll_floats_sketch
|
171
|
-
.def("get_max_value", &kll_sketch<T>::
|
172
|
-
"Returns the maximum value from the stream. If empty, kll_floats_sketch
|
173
|
-
.def("get_quantile", &
|
174
|
-
"Returns an approximation to the
|
175
|
-
"
|
139
|
+
.def("get_min_value", &kll_sketch<T>::get_min_item,
|
140
|
+
"Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
|
141
|
+
.def("get_max_value", &kll_sketch<T>::get_max_item,
|
142
|
+
"Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
|
143
|
+
.def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
|
144
|
+
"Returns an approximation to the data value "
|
145
|
+
"associated with the given normalized rank in a hypothetical sorted "
|
176
146
|
"version of the input stream so far.\n"
|
177
|
-
"Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
|
178
|
-
"so it should not be called multiple times to get different quantiles from the same "
|
179
|
-
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
180
147
|
"For kll_floats_sketch: if the sketch is empty this returns nan. "
|
181
148
|
"For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
182
|
-
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("
|
183
|
-
"This is a more efficient multiple-query version of get_quantile().\n"
|
149
|
+
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
|
184
150
|
"This returns an array that could have been generated by using get_quantile() for each "
|
185
|
-
"
|
186
|
-
"
|
187
|
-
"
|
188
|
-
|
189
|
-
"
|
190
|
-
.def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
|
191
|
-
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
151
|
+
"normalized rank separately.\n"
|
152
|
+
"If the sketch is empty this returns an empty vector.\n"
|
153
|
+
"Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
|
154
|
+
.def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
|
155
|
+
"Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
|
192
156
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
193
157
|
"get_normalized_rank_error(False) function.\n"
|
194
158
|
"With the parameter inclusive=true the weight of the given value is included into the rank."
|
@@ -0,0 +1,111 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include <cstring>
|
21
|
+
#include "memory_operations.hpp"
|
22
|
+
|
23
|
+
#include "py_serde.hpp"
|
24
|
+
|
25
|
+
#include <pybind11/pybind11.h>
|
26
|
+
|
27
|
+
namespace py = pybind11;
|
28
|
+
|
29
|
+
/**
 * Registers the Python-visible "PyObjectSerDe" class on the given module.
 *
 * The class is bound through the datasketches::PyObjectSerDe trampoline and
 * exposes a default constructor plus three methods: get_size, to_bytes and
 * from_bytes. The docstrings below are shown to Python users via help().
 *
 * @param m pybind11 module that receives the class binding
 */
void init_serde(py::module& m) {
  py::class_<datasketches::py_object_serde, datasketches::PyObjectSerDe /* <--- trampoline*/>(m, "PyObjectSerDe")
    .def(py::init<>())
    .def("get_size", &datasketches::py_object_serde::get_size, py::arg("item"),
         "Returns the size in bytes of an item")
    .def("to_bytes", &datasketches::py_object_serde::to_bytes, py::arg("item"),
         "Returns a bytes object with a serialized version of an item")
    .def("from_bytes", &datasketches::py_object_serde::from_bytes, py::arg("data"), py::arg("offset"),
         "Reads a bytes object starting from the given offset and returns a tuple of the reconstructed "
         "object and the number of additional bytes read")
    ;
}
|
41
|
+
|
42
|
+
namespace datasketches {
|
43
|
+
// Reports the serialized size of a single item by forwarding to get_size().
size_t py_object_serde::size_of_item(const py::object& item) const {
  const size_t item_size = get_size(item);
  return item_size;
}
|
46
|
+
|
47
|
+
/**
 * Serializes num items back-to-back into the buffer starting at ptr.
 *
 * Each item is converted with to_bytes() and its payload is appended to the
 * output. Before every copy check_memory_size() is given the running total and
 * the capacity, so a payload that would not fit is reported before any byte of
 * it is written.
 *
 * @param ptr      start of the destination buffer
 * @param capacity number of bytes available at ptr
 * @param items    array of at least num objects to serialize
 * @param num      number of items to serialize
 * @return total number of bytes written
 */
size_t py_object_serde::serialize(void* ptr, size_t capacity, const py::object* items, unsigned num) const {
  size_t bytes_written = 0;
  char* const out = static_cast<char*>(ptr);
  // The guard holds the GIL for the whole function and restores the previous
  // state on every exit path, including exceptions. No explicit release is needed.
  py::gil_scoped_acquire acquire;
  for (unsigned i = 0; i < num; ++i) {
    const std::string bytes = to_bytes(items[i]); // implicit cast from py::bytes
    check_memory_size(bytes_written + bytes.size(), capacity);
    if (!bytes.empty()) { // memcpy must not be handed a null pointer, even for 0 bytes
      std::memcpy(out + bytes_written, bytes.data(), bytes.size());
    }
    bytes_written += bytes.size();
  }
  return bytes_written;
}
|
60
|
+
|
61
|
+
size_t py_object_serde::deserialize(const void* ptr, size_t capacity, py::object* items, unsigned num) const {
|
62
|
+
size_t bytes_read = 0;
|
63
|
+
unsigned i = 0;
|
64
|
+
bool failure = false;
|
65
|
+
bool error_from_python = false;
|
66
|
+
py::gil_scoped_acquire acquire;
|
67
|
+
|
68
|
+
// copy data into bytes only once
|
69
|
+
py::bytes bytes(static_cast<const char*>(ptr), capacity);
|
70
|
+
for (; i < num && !failure; ++i) {
|
71
|
+
py::tuple bytes_and_len;
|
72
|
+
try {
|
73
|
+
bytes_and_len = from_bytes(bytes, bytes_read);
|
74
|
+
} catch (py::error_already_set &e) {
|
75
|
+
failure = true;
|
76
|
+
error_from_python = true;
|
77
|
+
break;
|
78
|
+
}
|
79
|
+
|
80
|
+
size_t length = py::cast<size_t>(bytes_and_len[1]);
|
81
|
+
if (bytes_read + length > capacity) {
|
82
|
+
bytes_read += length; // use this value to report the error
|
83
|
+
failure = true;
|
84
|
+
break;
|
85
|
+
}
|
86
|
+
|
87
|
+
new (&items[i]) py::object(py::cast<py::object>(bytes_and_len[0]));
|
88
|
+
ptr = static_cast<const char*>(ptr) + length;
|
89
|
+
bytes_read += length;
|
90
|
+
}
|
91
|
+
|
92
|
+
if (failure) {
|
93
|
+
// clean up what we've allocated
|
94
|
+
for (unsigned j = 0; j < i; ++j) {
|
95
|
+
items[j].dec_ref();
|
96
|
+
}
|
97
|
+
|
98
|
+
if (error_from_python) {
|
99
|
+
throw py::value_error("Error reading value in from_bytes");
|
100
|
+
} else {
|
101
|
+
// this next call will throw
|
102
|
+
check_memory_size(bytes_read, capacity);
|
103
|
+
}
|
104
|
+
}
|
105
|
+
|
106
|
+
py::gil_scoped_release release;
|
107
|
+
return bytes_read;
|
108
|
+
}
|
109
|
+
|
110
|
+
|
111
|
+
} // namespace datasketches
|
@@ -49,41 +49,17 @@ double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
49
49
|
return quantiles_sketch<T>::get_normalized_rank_error(k, pmf);
|
50
50
|
}
|
51
51
|
|
52
|
-
template<typename T>
|
53
|
-
double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
|
54
|
-
const T& item,
|
55
|
-
bool inclusive) {
|
56
|
-
if (inclusive)
|
57
|
-
return sk.template get_rank<true>(item);
|
58
|
-
else
|
59
|
-
return sk.template get_rank<false>(item);
|
60
|
-
}
|
61
|
-
|
62
|
-
template<typename T>
|
63
|
-
T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
|
64
|
-
double rank,
|
65
|
-
bool inclusive) {
|
66
|
-
if (inclusive)
|
67
|
-
return T(sk.template get_quantile<true>(rank));
|
68
|
-
else
|
69
|
-
return T(sk.template get_quantile<false>(rank));
|
70
|
-
}
|
71
|
-
|
72
52
|
template<typename T>
|
73
53
|
py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
|
74
|
-
std::vector<double>&
|
54
|
+
std::vector<double>& ranks,
|
75
55
|
bool inclusive) {
|
76
|
-
size_t n_quantiles =
|
77
|
-
auto result = inclusive
|
78
|
-
? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
|
79
|
-
: sk.template get_quantiles<false>(&fractions[0], static_cast<uint32_t>(n_quantiles));
|
80
|
-
|
56
|
+
size_t n_quantiles = ranks.size();
|
57
|
+
auto result = sk.get_quantiles(ranks.data(), static_cast<uint32_t>(n_quantiles), inclusive);
|
81
58
|
// returning as std::vector<> would copy values to a list anyway
|
82
59
|
py::list list(n_quantiles);
|
83
60
|
for (size_t i = 0; i < n_quantiles; ++i) {
|
84
61
|
list[i] = result[i];
|
85
62
|
}
|
86
|
-
|
87
63
|
return list;
|
88
64
|
}
|
89
65
|
|
@@ -92,15 +68,11 @@ py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
|
|
92
68
|
std::vector<T>& split_points,
|
93
69
|
bool inclusive) {
|
94
70
|
size_t n_points = split_points.size();
|
95
|
-
auto result = inclusive
|
96
|
-
? sk.template get_PMF<true>(&split_points[0], n_points)
|
97
|
-
: sk.template get_PMF<false>(&split_points[0], n_points);
|
98
|
-
|
71
|
+
auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
|
99
72
|
py::list list(n_points + 1);
|
100
73
|
for (size_t i = 0; i <= n_points; ++i) {
|
101
74
|
list[i] = result[i];
|
102
75
|
}
|
103
|
-
|
104
76
|
return list;
|
105
77
|
}
|
106
78
|
|
@@ -109,15 +81,11 @@ py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
|
|
109
81
|
std::vector<T>& split_points,
|
110
82
|
bool inclusive) {
|
111
83
|
size_t n_points = split_points.size();
|
112
|
-
auto result = inclusive
|
113
|
-
? sk.template get_CDF<true>(&split_points[0], n_points)
|
114
|
-
: sk.template get_CDF<false>(&split_points[0], n_points);
|
115
|
-
|
84
|
+
auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
|
116
85
|
py::list list(n_points + 1);
|
117
86
|
for (size_t i = 0; i <= n_points; ++i) {
|
118
87
|
list[i] = result[i];
|
119
88
|
}
|
120
|
-
|
121
89
|
return list;
|
122
90
|
}
|
123
91
|
|
@@ -166,31 +134,27 @@ void bind_quantiles_sketch(py::module &m, const char* name) {
|
|
166
134
|
"Returns the number of retained items (samples) in the sketch")
|
167
135
|
.def("is_estimation_mode", &quantiles_sketch<T>::is_estimation_mode,
|
168
136
|
"Returns True if the sketch is in estimation mode, otherwise False")
|
169
|
-
.def("get_min_value", &quantiles_sketch<T>::
|
137
|
+
.def("get_min_value", &quantiles_sketch<T>::get_min_item,
|
170
138
|
"Returns the minimum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
|
171
|
-
.def("get_max_value", &quantiles_sketch<T>::
|
139
|
+
.def("get_max_value", &quantiles_sketch<T>::get_max_item,
|
172
140
|
"Returns the maximum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
|
173
|
-
.def("get_quantile", &
|
174
|
-
"Returns an approximation to the
|
175
|
-
"
|
141
|
+
.def("get_quantile", &quantiles_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
|
142
|
+
"Returns an approximation to the data value "
|
143
|
+
"associated with the given rank in a hypothetical sorted "
|
176
144
|
"version of the input stream so far.\n"
|
177
|
-
"Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
|
178
|
-
"so it should not be called multiple times to get different quantiles from the same "
|
179
|
-
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
180
145
|
"For quantiles_floats_sketch: if the sketch is empty this returns nan. "
|
181
146
|
"For quantiles_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
182
147
|
.def("get_quantiles", &dspy::quantiles_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
|
183
|
-
"This is a more efficient multiple-query version of get_quantile().\n"
|
184
148
|
"This returns an array that could have been generated by using get_quantile() for each "
|
185
|
-
"
|
186
|
-
"
|
187
|
-
"
|
188
|
-
|
189
|
-
"
|
190
|
-
.def("get_rank", &dspy::quantiles_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
|
191
|
-
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
149
|
+
"normalized rank separately.\n"
|
150
|
+
"If the sketch is empty this returns an empty vector.\n"
|
151
|
+
"Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
|
152
|
+
.def("get_rank", &quantiles_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
|
153
|
+
"Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
|
192
154
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
193
155
|
"get_normalized_rank_error(False) function.\n"
|
156
|
+
"With the parameter inclusive=true the weight of the given value is included into the rank."
|
157
|
+
"Otherwise the rank equals the sum of the weights of values less than the given value.\n"
|
194
158
|
"If the sketch is empty this returns nan.")
|
195
159
|
.def("get_pmf", &dspy::quantiles_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
196
160
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
@@ -51,41 +51,17 @@ double req_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
51
51
|
return req_sketch<T>::get_normalized_rank_error(k, pmf);
|
52
52
|
}
|
53
53
|
|
54
|
-
template<typename T>
|
55
|
-
double req_sketch_get_rank(const req_sketch<T>& sk,
|
56
|
-
const T& item,
|
57
|
-
bool inclusive) {
|
58
|
-
if (inclusive)
|
59
|
-
return sk.template get_rank<true>(item);
|
60
|
-
else
|
61
|
-
return sk.template get_rank<false>(item);
|
62
|
-
}
|
63
|
-
|
64
|
-
template<typename T>
|
65
|
-
T req_sketch_get_quantile(const req_sketch<T>& sk,
|
66
|
-
double rank,
|
67
|
-
bool inclusive) {
|
68
|
-
if (inclusive)
|
69
|
-
return T(sk.template get_quantile<true>(rank));
|
70
|
-
else
|
71
|
-
return T(sk.template get_quantile<false>(rank));
|
72
|
-
}
|
73
|
-
|
74
54
|
template<typename T>
|
75
55
|
py::list req_sketch_get_quantiles(const req_sketch<T>& sk,
|
76
|
-
std::vector<double>&
|
56
|
+
std::vector<double>& ranks,
|
77
57
|
bool inclusive) {
|
78
|
-
size_t n_quantiles =
|
79
|
-
auto result = inclusive
|
80
|
-
? sk.template get_quantiles<true>(&fractions[0], n_quantiles)
|
81
|
-
: sk.template get_quantiles<false>(&fractions[0], n_quantiles);
|
82
|
-
|
58
|
+
size_t n_quantiles = ranks.size();
|
59
|
+
auto result = sk.get_quantiles(ranks.data(), n_quantiles, inclusive);
|
83
60
|
// returning as std::vector<> would copy values to a list anyway
|
84
61
|
py::list list(n_quantiles);
|
85
62
|
for (size_t i = 0; i < n_quantiles; ++i) {
|
86
63
|
list[i] = result[i];
|
87
64
|
}
|
88
|
-
|
89
65
|
return list;
|
90
66
|
}
|
91
67
|
|
@@ -94,15 +70,11 @@ py::list req_sketch_get_pmf(const req_sketch<T>& sk,
|
|
94
70
|
std::vector<T>& split_points,
|
95
71
|
bool inclusive) {
|
96
72
|
size_t n_points = split_points.size();
|
97
|
-
auto result = inclusive
|
98
|
-
? sk.template get_PMF<true>(&split_points[0], n_points)
|
99
|
-
: sk.template get_PMF<false>(&split_points[0], n_points);
|
100
|
-
|
73
|
+
auto result = sk.get_PMF(split_points.data(), n_points, inclusive);
|
101
74
|
py::list list(n_points + 1);
|
102
75
|
for (size_t i = 0; i <= n_points; ++i) {
|
103
76
|
list[i] = result[i];
|
104
77
|
}
|
105
|
-
|
106
78
|
return list;
|
107
79
|
}
|
108
80
|
|
@@ -111,15 +83,11 @@ py::list req_sketch_get_cdf(const req_sketch<T>& sk,
|
|
111
83
|
std::vector<T>& split_points,
|
112
84
|
bool inclusive) {
|
113
85
|
size_t n_points = split_points.size();
|
114
|
-
auto result = inclusive
|
115
|
-
? sk.template get_CDF<true>(&split_points[0], n_points)
|
116
|
-
: sk.template get_CDF<false>(&split_points[0], n_points);
|
117
|
-
|
86
|
+
auto result = sk.get_CDF(split_points.data(), n_points, inclusive);
|
118
87
|
py::list list(n_points + 1);
|
119
88
|
for (size_t i = 0; i <= n_points; ++i) {
|
120
89
|
list[i] = result[i];
|
121
90
|
}
|
122
|
-
|
123
91
|
return list;
|
124
92
|
}
|
125
93
|
|
@@ -170,33 +138,27 @@ void bind_req_sketch(py::module &m, const char* name) {
|
|
170
138
|
"Returns the number of retained items (samples) in the sketch")
|
171
139
|
.def("is_estimation_mode", &req_sketch<T>::is_estimation_mode,
|
172
140
|
"Returns True if the sketch is in estimation mode, otherwise False")
|
173
|
-
.def("get_min_value", &req_sketch<T>::
|
141
|
+
.def("get_min_value", &req_sketch<T>::get_min_item,
|
174
142
|
"Returns the minimum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
|
175
|
-
.def("get_max_value", &req_sketch<T>::
|
143
|
+
.def("get_max_value", &req_sketch<T>::get_max_item,
|
176
144
|
"Returns the maximum value from the stream. If empty, req_floats_sketch returns nan; req_ints_sketch throws a RuntimeError")
|
177
|
-
.def("get_quantile", &
|
178
|
-
"Returns an approximation to the
|
179
|
-
"
|
145
|
+
.def("get_quantile", &req_sketch<T>::get_quantile, py::arg("rank"), py::arg("inclusive")=false,
|
146
|
+
"Returns an approximation to the data value "
|
147
|
+
"associated with the given normalized rank in a hypothetical sorted "
|
180
148
|
"version of the input stream so far.\n"
|
181
|
-
"Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
|
182
|
-
"so it should not be called multiple times to get different quantiles from the same "
|
183
|
-
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
184
149
|
"For req_floats_sketch: if the sketch is empty this returns nan. "
|
185
150
|
"For req_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
186
151
|
.def("get_quantiles", &dspy::req_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
|
187
|
-
"This is a more efficient multiple-query version of get_quantile().\n"
|
188
152
|
"This returns an array that could have been generated by using get_quantile() for each "
|
189
|
-
"
|
190
|
-
"
|
191
|
-
"
|
192
|
-
|
193
|
-
"
|
194
|
-
.def("get_rank", &dspy::req_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
|
195
|
-
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
153
|
+
"normalized rank separately.\n"
|
154
|
+
"If the sketch is empty this returns an empty vector.\n"
|
155
|
+
"Deprecated. Will be removed in the next major version. Use get_quantile() instead.")
|
156
|
+
.def("get_rank", &req_sketch<T>::get_rank, py::arg("value"), py::arg("inclusive")=false,
|
157
|
+
"Returns an approximation to the normalized rank of the given value from 0 to 1, inclusive.\n"
|
196
158
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
197
159
|
"get_normalized_rank_error(False) function.\n"
|
198
|
-
"With the parameter inclusive=true the weight of the given
|
199
|
-
"Otherwise the rank equals the sum of the weights of
|
160
|
+
"With the parameter inclusive=true the weight of the given value is included into the rank."
|
161
|
+
"Otherwise the rank equals the sum of the weights of values less than the given value.\n"
|
200
162
|
"If the sketch is empty this returns nan.")
|
201
163
|
.def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
202
164
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|