datasketches 0.2.2 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +8 -8
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +27 -5
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +16 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +30 -12
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +103 -44
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +110 -130
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +156 -23
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +17 -6
- data/vendor/datasketches-cpp/python/README.md +57 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +49 -14
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +12 -5
- data/vendor/datasketches-cpp/python/tests/kll_test.py +12 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +66 -61
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +54 -12
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +45 -34
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +9 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +39 -10
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
- metadata +34 -12
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
#include <pybind11/numpy.h>
|
|
25
25
|
#include <sstream>
|
|
26
26
|
#include <vector>
|
|
27
|
+
#include <stdexcept>
|
|
27
28
|
|
|
28
29
|
namespace py = pybind11;
|
|
29
30
|
|
|
@@ -50,11 +51,32 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
|
50
51
|
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
|
51
52
|
}
|
|
52
53
|
|
|
54
|
+
template<typename T>
|
|
55
|
+
double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
|
|
56
|
+
if (inclusive)
|
|
57
|
+
return sk.template get_rank<true>(item);
|
|
58
|
+
else
|
|
59
|
+
return sk.template get_rank<false>(item);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
template<typename T>
|
|
63
|
+
T kll_sketch_get_quantile(const kll_sketch<T>& sk,
|
|
64
|
+
double rank,
|
|
65
|
+
bool inclusive) {
|
|
66
|
+
if (inclusive)
|
|
67
|
+
return T(sk.template get_quantile<true>(rank));
|
|
68
|
+
else
|
|
69
|
+
return T(sk.template get_quantile<false>(rank));
|
|
70
|
+
}
|
|
71
|
+
|
|
53
72
|
template<typename T>
|
|
54
73
|
py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
|
55
|
-
std::vector<double>& fractions
|
|
74
|
+
std::vector<double>& fractions,
|
|
75
|
+
bool inclusive) {
|
|
56
76
|
size_t nQuantiles = fractions.size();
|
|
57
|
-
auto result =
|
|
77
|
+
auto result = inclusive ?
|
|
78
|
+
sk.template get_quantiles<true>(fractions.data(), nQuantiles)
|
|
79
|
+
: sk.template get_quantiles<false>(fractions.data(), nQuantiles);
|
|
58
80
|
|
|
59
81
|
// returning as std::vector<> would copy values to a list anyway
|
|
60
82
|
py::list list(nQuantiles);
|
|
@@ -67,9 +89,12 @@ py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
|
|
67
89
|
|
|
68
90
|
template<typename T>
|
|
69
91
|
py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
70
|
-
std::vector<T>& split_points
|
|
92
|
+
std::vector<T>& split_points,
|
|
93
|
+
bool inclusive) {
|
|
71
94
|
size_t nPoints = split_points.size();
|
|
72
|
-
auto result =
|
|
95
|
+
auto result = inclusive ?
|
|
96
|
+
sk.template get_PMF<true>(split_points.data(), nPoints)
|
|
97
|
+
: sk.template get_PMF<false>(split_points.data(), nPoints);
|
|
73
98
|
|
|
74
99
|
py::list list(nPoints + 1);
|
|
75
100
|
for (size_t i = 0; i <= nPoints; ++i) {
|
|
@@ -81,9 +106,12 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
|
81
106
|
|
|
82
107
|
template<typename T>
|
|
83
108
|
py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
|
|
84
|
-
std::vector<T>& split_points
|
|
109
|
+
std::vector<T>& split_points,
|
|
110
|
+
bool inclusive) {
|
|
85
111
|
size_t nPoints = split_points.size();
|
|
86
|
-
auto result =
|
|
112
|
+
auto result = inclusive ?
|
|
113
|
+
sk.template get_CDF<true>(split_points.data(), nPoints)
|
|
114
|
+
: sk.template get_CDF<false>(split_points.data(), nPoints);
|
|
87
115
|
|
|
88
116
|
py::list list(nPoints + 1);
|
|
89
117
|
for (size_t i = 0; i <= nPoints; ++i) {
|
|
@@ -116,7 +144,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
116
144
|
using namespace datasketches;
|
|
117
145
|
|
|
118
146
|
py::class_<kll_sketch<T>>(m, name)
|
|
119
|
-
.def(py::init<uint16_t>(), py::arg("k")=
|
|
147
|
+
.def(py::init<uint16_t>(), py::arg("k")=kll_constants::DEFAULT_K)
|
|
120
148
|
.def(py::init<const kll_sketch<T>&>())
|
|
121
149
|
.def("update", (void (kll_sketch<T>::*)(const T&)) &kll_sketch<T>::update, py::arg("item"),
|
|
122
150
|
"Updates the sketch with the given value")
|
|
@@ -142,7 +170,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
142
170
|
"Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
|
|
143
171
|
.def("get_max_value", &kll_sketch<T>::get_max_value,
|
|
144
172
|
"Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
|
|
145
|
-
.def("get_quantile", &
|
|
173
|
+
.def("get_quantile", &dspy::kll_sketch_get_quantile<T>, py::arg("fraction"), py::arg("inclusive")=false,
|
|
146
174
|
"Returns an approximation to the value of the data item "
|
|
147
175
|
"that would be preceded by the given fraction of a hypothetical sorted "
|
|
148
176
|
"version of the input stream so far.\n"
|
|
@@ -151,7 +179,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
151
179
|
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
|
152
180
|
"For kll_floats_sketch: if the sketch is empty this returns nan. "
|
|
153
181
|
"For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
|
154
|
-
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"),
|
|
182
|
+
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"), py::arg("inclusive")=false,
|
|
155
183
|
"This is a more efficient multiple-query version of get_quantile().\n"
|
|
156
184
|
"This returns an array that could have been generated by using get_quantile() for each "
|
|
157
185
|
"fractional rank separately, but would be very inefficient. "
|
|
@@ -159,12 +187,14 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
159
187
|
"a single query. It is strongly recommended that this method be used instead of multiple calls "
|
|
160
188
|
"to get_quantile().\n"
|
|
161
189
|
"If the sketch is empty this returns an empty vector.")
|
|
162
|
-
.def("get_rank", &
|
|
190
|
+
.def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
|
|
163
191
|
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
|
164
192
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
|
165
193
|
"get_normalized_rank_error(False) function.\n"
|
|
194
|
+
"With the parameter inclusive=true the weight of the given value is included into the rank. "
|
|
195
|
+
"Otherwise the rank equals the sum of the weights of values less than the given value.\n"
|
|
166
196
|
"If the sketch is empty this returns nan.")
|
|
167
|
-
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"),
|
|
197
|
+
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
168
198
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
|
169
199
|
"given a set of split points (values).\n"
|
|
170
200
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
|
@@ -172,11 +202,13 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
172
202
|
"If the sketch is empty this returns an empty vector.\n"
|
|
173
203
|
"split_points is an array of m unique, monotonically increasing float values "
|
|
174
204
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
175
|
-
"
|
|
205
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
176
206
|
"exclusive of the right split point, with the exception that the last interval will include "
|
|
177
207
|
"the maximum value.\n"
|
|
208
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
|
209
|
+
"inclusive of the right split point.\n"
|
|
178
210
|
"It is not necessary to include either the min or max values in these split points.")
|
|
179
|
-
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"),
|
|
211
|
+
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
180
212
|
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
|
181
213
|
"cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
|
|
182
214
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
|
@@ -184,9 +216,11 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
184
216
|
"If the sketch is empty this returns an empty vector.\n"
|
|
185
217
|
"split_points is an array of m unique, monotonically increasing float values "
|
|
186
218
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
187
|
-
"
|
|
219
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
188
220
|
"exclusive of the right split point, with the exception that the last interval will include "
|
|
189
221
|
"the maximum value.\n"
|
|
222
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
|
223
|
+
"inclusive of the right split point.\n"
|
|
190
224
|
"It is not necessary to include either the min or max values in these split points.")
|
|
191
225
|
.def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error,
|
|
192
226
|
py::arg("as_pmf"),
|
|
@@ -208,4 +242,5 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
208
242
|
void init_kll(py::module &m) {
|
|
209
243
|
bind_kll_sketch<int>(m, "kll_ints_sketch");
|
|
210
244
|
bind_kll_sketch<float>(m, "kll_floats_sketch");
|
|
245
|
+
bind_kll_sketch<double>(m, "kll_doubles_sketch");
|
|
211
246
|
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "kolmogorov_smirnov.hpp"
|
|
21
|
+
#include "kll_sketch.hpp"
|
|
22
|
+
#include "quantiles_sketch.hpp"
|
|
23
|
+
|
|
24
|
+
#include <pybind11/pybind11.h>
|
|
25
|
+
|
|
26
|
+
namespace py = pybind11;
|
|
27
|
+
|
|
28
|
+
void init_kolmogorov_smirnov(py::module &m) {
|
|
29
|
+
using namespace datasketches;
|
|
30
|
+
|
|
31
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
32
|
+
"Performs the Kolmogorov-Smirnov Test between kll_ints_sketches.\n"
|
|
33
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
34
|
+
"this will return false.\n"
|
|
35
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
36
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
37
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
38
|
+
"Performs the Kolmogorov-Smirnov Test between kll_floats_sketches.\n"
|
|
39
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
40
|
+
"this will return false.\n"
|
|
41
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
42
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
43
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
44
|
+
"Performs the Kolmogorov-Smirnov Test between kll_doubles_sketches.\n"
|
|
45
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
46
|
+
"this will return false.\n"
|
|
47
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
48
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
49
|
+
|
|
50
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
51
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_ints_sketches.\n"
|
|
52
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
53
|
+
"this will return false.\n"
|
|
54
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
55
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
56
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
57
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_floats_sketches.\n"
|
|
58
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
59
|
+
"this will return false.\n"
|
|
60
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
61
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
62
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
63
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_doubles_sketches.\n"
|
|
64
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
65
|
+
"this will return false.\n"
|
|
66
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
67
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
68
|
+
}
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "quantiles_sketch.hpp"
|
|
21
|
+
|
|
22
|
+
#include <pybind11/pybind11.h>
|
|
23
|
+
#include <pybind11/stl.h>
|
|
24
|
+
#include <pybind11/numpy.h>
|
|
25
|
+
#include <vector>
|
|
26
|
+
|
|
27
|
+
namespace py = pybind11;
|
|
28
|
+
|
|
29
|
+
namespace datasketches {
|
|
30
|
+
|
|
31
|
+
namespace python {
|
|
32
|
+
|
|
33
|
+
template<typename T>
|
|
34
|
+
quantiles_sketch<T> quantiles_sketch_deserialize(py::bytes sk_bytes) {
|
|
35
|
+
std::string sk_str = sk_bytes; // implicit cast
|
|
36
|
+
return quantiles_sketch<T>::deserialize(sk_str.c_str(), sk_str.length());
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
template<typename T>
|
|
40
|
+
py::object quantiles_sketch_serialize(const quantiles_sketch<T>& sk) {
|
|
41
|
+
auto ser_result = sk.serialize();
|
|
42
|
+
return py::bytes((char*)ser_result.data(), ser_result.size());
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// maybe possible to disambiguate the static vs method rank error calls, but
|
|
46
|
+
// this is easier for now
|
|
47
|
+
template<typename T>
|
|
48
|
+
double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
49
|
+
return quantiles_sketch<T>::get_normalized_rank_error(k, pmf);
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
template<typename T>
|
|
53
|
+
double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
|
|
54
|
+
const T& item,
|
|
55
|
+
bool inclusive) {
|
|
56
|
+
if (inclusive)
|
|
57
|
+
return sk.template get_rank<true>(item);
|
|
58
|
+
else
|
|
59
|
+
return sk.template get_rank<false>(item);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
template<typename T>
|
|
63
|
+
T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
|
|
64
|
+
double rank,
|
|
65
|
+
bool inclusive) {
|
|
66
|
+
if (inclusive)
|
|
67
|
+
return T(sk.template get_quantile<true>(rank));
|
|
68
|
+
else
|
|
69
|
+
return T(sk.template get_quantile<false>(rank));
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
template<typename T>
|
|
73
|
+
py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
|
|
74
|
+
std::vector<double>& fractions,
|
|
75
|
+
bool inclusive) {
|
|
76
|
+
size_t n_quantiles = fractions.size();
|
|
77
|
+
auto result = inclusive
|
|
78
|
+
? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
|
|
79
|
+
: sk.template get_quantiles<false>(&fractions[0], static_cast<uint32_t>(n_quantiles));
|
|
80
|
+
|
|
81
|
+
// returning as std::vector<> would copy values to a list anyway
|
|
82
|
+
py::list list(n_quantiles);
|
|
83
|
+
for (size_t i = 0; i < n_quantiles; ++i) {
|
|
84
|
+
list[i] = result[i];
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
return list;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
template<typename T>
|
|
91
|
+
py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
|
|
92
|
+
std::vector<T>& split_points,
|
|
93
|
+
bool inclusive) {
|
|
94
|
+
size_t n_points = split_points.size();
|
|
95
|
+
auto result = inclusive
|
|
96
|
+
? sk.template get_PMF<true>(&split_points[0], n_points)
|
|
97
|
+
: sk.template get_PMF<false>(&split_points[0], n_points);
|
|
98
|
+
|
|
99
|
+
py::list list(n_points + 1);
|
|
100
|
+
for (size_t i = 0; i <= n_points; ++i) {
|
|
101
|
+
list[i] = result[i];
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
return list;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
template<typename T>
|
|
108
|
+
py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
|
|
109
|
+
std::vector<T>& split_points,
|
|
110
|
+
bool inclusive) {
|
|
111
|
+
size_t n_points = split_points.size();
|
|
112
|
+
auto result = inclusive
|
|
113
|
+
? sk.template get_CDF<true>(&split_points[0], n_points)
|
|
114
|
+
: sk.template get_CDF<false>(&split_points[0], n_points);
|
|
115
|
+
|
|
116
|
+
py::list list(n_points + 1);
|
|
117
|
+
for (size_t i = 0; i <= n_points; ++i) {
|
|
118
|
+
list[i] = result[i];
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
return list;
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
template<typename T>
|
|
125
|
+
void quantiles_sketch_update(quantiles_sketch<T>& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
|
|
126
|
+
if (items.ndim() != 1) {
|
|
127
|
+
throw std::invalid_argument("input data must have only one dimension. Found: "
|
|
128
|
+
+ std::to_string(items.ndim()));
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
auto data = items.template unchecked<1>();
|
|
132
|
+
for (uint32_t i = 0; i < data.size(); ++i) {
|
|
133
|
+
sk.update(data(i));
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
namespace dspy = datasketches::python;
|
|
141
|
+
|
|
142
|
+
template<typename T>
|
|
143
|
+
void bind_quantiles_sketch(py::module &m, const char* name) {
|
|
144
|
+
using namespace datasketches;
|
|
145
|
+
|
|
146
|
+
py::class_<quantiles_sketch<T>>(m, name)
|
|
147
|
+
.def(py::init<uint16_t>(), py::arg("k")=quantiles_constants::DEFAULT_K)
|
|
148
|
+
.def(py::init<const quantiles_sketch<T>&>())
|
|
149
|
+
.def("update", (void (quantiles_sketch<T>::*)(const T&)) &quantiles_sketch<T>::update, py::arg("item"),
|
|
150
|
+
"Updates the sketch with the given value")
|
|
151
|
+
.def("update", &dspy::quantiles_sketch_update<T>, py::arg("array"),
|
|
152
|
+
"Updates the sketch with the values in the given array")
|
|
153
|
+
.def("merge", (void (quantiles_sketch<T>::*)(const quantiles_sketch<T>&)) &quantiles_sketch<T>::merge, py::arg("sketch"),
|
|
154
|
+
"Merges the provided sketch into the this one")
|
|
155
|
+
.def("__str__", &quantiles_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
|
156
|
+
"Produces a string summary of the sketch")
|
|
157
|
+
.def("to_string", &quantiles_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
|
158
|
+
"Produces a string summary of the sketch")
|
|
159
|
+
.def("is_empty", &quantiles_sketch<T>::is_empty,
|
|
160
|
+
"Returns True if the sketch is empty, otherwise False")
|
|
161
|
+
.def("get_k", &quantiles_sketch<T>::get_k,
|
|
162
|
+
"Returns the configured parameter k")
|
|
163
|
+
.def("get_n", &quantiles_sketch<T>::get_n,
|
|
164
|
+
"Returns the length of the input stream")
|
|
165
|
+
.def("get_num_retained", &quantiles_sketch<T>::get_num_retained,
|
|
166
|
+
"Returns the number of retained items (samples) in the sketch")
|
|
167
|
+
.def("is_estimation_mode", &quantiles_sketch<T>::is_estimation_mode,
|
|
168
|
+
"Returns True if the sketch is in estimation mode, otherwise False")
|
|
169
|
+
.def("get_min_value", &quantiles_sketch<T>::get_min_value,
|
|
170
|
+
"Returns the minimum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
|
|
171
|
+
.def("get_max_value", &quantiles_sketch<T>::get_max_value,
|
|
172
|
+
"Returns the maximum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
|
|
173
|
+
.def("get_quantile", &dspy::quantiles_sketch_get_quantile<T>, py::arg("rank"), py::arg("inclusive")=false,
|
|
174
|
+
"Returns an approximation to the value of the data item "
|
|
175
|
+
"that would be preceded by the given fraction of a hypothetical sorted "
|
|
176
|
+
"version of the input stream so far.\n"
|
|
177
|
+
"Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
|
|
178
|
+
"so it should not be called multiple times to get different quantiles from the same "
|
|
179
|
+
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
|
180
|
+
"For quantiles_floats_sketch: if the sketch is empty this returns nan. "
|
|
181
|
+
"For quantiles_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
|
182
|
+
.def("get_quantiles", &dspy::quantiles_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
|
|
183
|
+
"This is a more efficient multiple-query version of get_quantile().\n"
|
|
184
|
+
"This returns an array that could have been generated by using get_quantile() for each "
|
|
185
|
+
"fractional rank separately, but would be very inefficient. "
|
|
186
|
+
"This method incurs the internal set-up overhead once and obtains multiple quantile values in "
|
|
187
|
+
"a single query. It is strongly recommend that this method be used instead of multiple calls "
|
|
188
|
+
"to get_quantile().\n"
|
|
189
|
+
"If the sketch is empty this returns an empty vector.")
|
|
190
|
+
.def("get_rank", &dspy::quantiles_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
|
|
191
|
+
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
|
192
|
+
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
|
193
|
+
"get_normalized_rank_error(False) function.\n"
|
|
194
|
+
"If the sketch is empty this returns nan.")
|
|
195
|
+
.def("get_pmf", &dspy::quantiles_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
196
|
+
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
|
197
|
+
"given a set of split points (values).\n"
|
|
198
|
+
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
|
199
|
+
"get_normalized_rank_error(True) function.\n"
|
|
200
|
+
"If the sketch is empty this returns an empty vector.\n"
|
|
201
|
+
"split_points is an array of m unique, monotonically increasing float values "
|
|
202
|
+
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
203
|
+
"The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
204
|
+
"exclusive of the right split point, with the exception that the last interval will include "
|
|
205
|
+
"the maximum value.\n"
|
|
206
|
+
"It is not necessary to include either the min or max values in these split points.")
|
|
207
|
+
.def("get_cdf", &dspy::quantiles_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
208
|
+
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
|
209
|
+
"cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
|
|
210
|
+
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
|
211
|
+
"get_normalized_rank_error(True) function.\n"
|
|
212
|
+
"If the sketch is empty this returns an empty vector.\n"
|
|
213
|
+
"split_points is an array of m unique, monotonically increasing float values "
|
|
214
|
+
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
215
|
+
"The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
216
|
+
"exclusive of the right split point, with the exception that the last interval will include "
|
|
217
|
+
"the maximum value.\n"
|
|
218
|
+
"It is not necessary to include either the min or max values in these split points.")
|
|
219
|
+
.def("normalized_rank_error", (double (quantiles_sketch<T>::*)(bool) const) &quantiles_sketch<T>::get_normalized_rank_error,
|
|
220
|
+
py::arg("as_pmf"),
|
|
221
|
+
"Gets the normalized rank error for this sketch.\n"
|
|
222
|
+
"If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
|
|
223
|
+
"Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
|
|
224
|
+
"Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
|
|
225
|
+
.def_static("get_normalized_rank_error", &dspy::quantiles_sketch_generic_normalized_rank_error<T>,
|
|
226
|
+
py::arg("k"), py::arg("as_pmf"),
|
|
227
|
+
"Gets the normalized rank error given parameters k and the pmf flag.\n"
|
|
228
|
+
"If pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
|
|
229
|
+
"Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
|
|
230
|
+
"Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
|
|
231
|
+
.def("serialize", &dspy::quantiles_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
|
232
|
+
.def_static("deserialize", &dspy::quantiles_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
|
|
233
|
+
;
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
/**
 * Adds the quantiles sketch classes to module m: one Python class per
 * supported item type (int, float, double).
 */
void init_quantiles(py::module &m) {
  // Integer items.
  bind_quantiles_sketch<int>(m, "quantiles_ints_sketch");
  // Single- and double-precision floating point items.
  bind_quantiles_sketch<float>(m, "quantiles_floats_sketch");
  bind_quantiles_sketch<double>(m, "quantiles_doubles_sketch");
}
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
#include <pybind11/numpy.h>
|
|
25
25
|
#include <sstream>
|
|
26
26
|
#include <vector>
|
|
27
|
+
#include <stdexcept>
|
|
27
28
|
|
|
28
29
|
namespace py = pybind11;
|
|
29
30
|
|
|
@@ -194,6 +195,8 @@ void bind_req_sketch(py::module &m, const char* name) {
|
|
|
194
195
|
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
|
195
196
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
|
196
197
|
"get_normalized_rank_error(False) function.\n"
|
|
198
|
+
"With the parameter inclusive=true the weight of the given item is included into the rank."
|
|
199
|
+
"Otherwise the rank equals the sum of the weights of items less than the given item.\n"
|
|
197
200
|
"If the sketch is empty this returns nan.")
|
|
198
201
|
.def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
199
202
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
|
@@ -203,9 +206,11 @@ void bind_req_sketch(py::module &m, const char* name) {
|
|
|
203
206
|
"If the sketch is empty this returns an empty vector.\n"
|
|
204
207
|
"split_points is an array of m unique, monotonically increasing float values "
|
|
205
208
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
206
|
-
"
|
|
209
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
207
210
|
"exclusive of the right split point, with the exception that the last interval will include "
|
|
208
211
|
"the maximum value.\n"
|
|
212
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
|
213
|
+
"inclusive of the right split point.\n"
|
|
209
214
|
"It is not necessary to include either the min or max values in these split points.")
|
|
210
215
|
.def("get_cdf", &dspy::req_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
211
216
|
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
|
@@ -215,9 +220,11 @@ void bind_req_sketch(py::module &m, const char* name) {
|
|
|
215
220
|
"If the sketch is empty this returns an empty vector.\n"
|
|
216
221
|
"split_points is an array of m unique, monotonically increasing float values "
|
|
217
222
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
218
|
-
"
|
|
223
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
219
224
|
"exclusive of the right split point, with the exception that the last interval will include "
|
|
220
225
|
"the maximum value.\n"
|
|
226
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
|
227
|
+
"inclusive of the right split point.\n"
|
|
221
228
|
"It is not necessary to include either the min or max values in these split points.")
|
|
222
229
|
.def("get_rank_lower_bound", &req_sketch<T>::get_rank_lower_bound, py::arg("rank"), py::arg("num_std_dev"),
|
|
223
230
|
"Returns an approximate lower bound on the given normalized rank.\n"
|
|
@@ -103,7 +103,7 @@ void init_theta(py::module &m) {
|
|
|
103
103
|
|
|
104
104
|
py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
|
|
105
105
|
.def(py::init(&dspy::update_theta_sketch_factory),
|
|
106
|
-
py::arg("lg_k")=
|
|
106
|
+
py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
|
|
107
107
|
.def(py::init<const update_theta_sketch&>())
|
|
108
108
|
.def("update", (void (update_theta_sketch::*)(int64_t)) &update_theta_sketch::update, py::arg("datum"),
|
|
109
109
|
"Updates the sketch with the given integral value")
|
|
@@ -127,7 +127,7 @@ void init_theta(py::module &m) {
|
|
|
127
127
|
|
|
128
128
|
py::class_<theta_union>(m, "theta_union")
|
|
129
129
|
.def(py::init(&dspy::theta_union_factory),
|
|
130
|
-
py::arg("lg_k")=
|
|
130
|
+
py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
|
|
131
131
|
.def("update", &theta_union::update<const theta_sketch&>, py::arg("sketch"),
|
|
132
132
|
"Updates the union with the given sketch")
|
|
133
133
|
.def("get_result", &theta_union::get_result, py::arg("ordered")=true,
|
|
@@ -24,19 +24,26 @@
|
|
|
24
24
|
#include <pybind11/numpy.h>
|
|
25
25
|
#include <sstream>
|
|
26
26
|
#include <vector>
|
|
27
|
+
#include <stdexcept>
|
|
27
28
|
|
|
28
29
|
namespace py = pybind11;
|
|
29
30
|
|
|
30
31
|
namespace datasketches {
|
|
31
32
|
|
|
33
|
+
namespace vector_of_kll_constants {
|
|
34
|
+
static const uint32_t DEFAULT_K = kll_constants::DEFAULT_K;
|
|
35
|
+
static const uint32_t DEFAULT_D = 1;
|
|
36
|
+
}
|
|
37
|
+
|
|
32
38
|
// Wrapper class for Numpy compatibility
|
|
33
39
|
template <typename T, typename C = std::less<T>, typename S = serde<T>>
|
|
34
40
|
class vector_of_kll_sketches {
|
|
35
41
|
public:
|
|
36
|
-
|
|
37
|
-
static const uint32_t
|
|
42
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
|
43
|
+
static const uint32_t DEFAULT_K = vector_of_kll_constants::DEFAULT_K;
|
|
44
|
+
static const uint32_t DEFAULT_D = vector_of_kll_constants::DEFAULT_D;
|
|
38
45
|
|
|
39
|
-
explicit vector_of_kll_sketches(uint32_t k = DEFAULT_K, uint32_t d = DEFAULT_D);
|
|
46
|
+
explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
|
|
40
47
|
vector_of_kll_sketches(const vector_of_kll_sketches& other);
|
|
41
48
|
vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
|
|
42
49
|
vector_of_kll_sketches<T,C,S>& operator=(const vector_of_kll_sketches& other);
|
|
@@ -432,8 +439,8 @@ void bind_vector_of_kll_sketches(py::module &m, const char* name) {
|
|
|
432
439
|
using namespace datasketches;
|
|
433
440
|
|
|
434
441
|
py::class_<vector_of_kll_sketches<T>>(m, name)
|
|
435
|
-
.def(py::init<uint32_t, uint32_t>(), py::arg("k")=
|
|
436
|
-
py::arg("d")=
|
|
442
|
+
.def(py::init<uint32_t, uint32_t>(), py::arg("k")=vector_of_kll_constants::DEFAULT_K,
|
|
443
|
+
py::arg("d")=vector_of_kll_constants::DEFAULT_D)
|
|
437
444
|
.def(py::init<const vector_of_kll_sketches<T>&>())
|
|
438
445
|
// allow user to retrieve k or d, in case it's instantiated w/ defaults
|
|
439
446
|
.def("get_k", &vector_of_kll_sketches<T>::get_k,
|
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
# under the License.
|
|
17
17
|
|
|
18
18
|
import unittest
|
|
19
|
-
from datasketches import kll_ints_sketch, kll_floats_sketch
|
|
19
|
+
from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch, ks_test
|
|
20
20
|
import numpy as np
|
|
21
21
|
|
|
22
22
|
class KllTest(unittest.TestCase):
|
|
@@ -30,10 +30,10 @@ class KllTest(unittest.TestCase):
|
|
|
30
30
|
kll.update(0.0)
|
|
31
31
|
|
|
32
32
|
# 0 should be near the median
|
|
33
|
-
self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.
|
|
33
|
+
self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.035)
|
|
34
34
|
|
|
35
35
|
# the median should be near 0
|
|
36
|
-
self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.
|
|
36
|
+
self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.035)
|
|
37
37
|
|
|
38
38
|
# we also track the min/max independently from the rest of the data
|
|
39
39
|
# which lets us know the full observed data range
|
|
@@ -73,6 +73,12 @@ class KllTest(unittest.TestCase):
|
|
|
73
73
|
self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7))
|
|
74
74
|
self.assertEqual(kll.get_rank(0.0), new_kll.get_rank(0.0))
|
|
75
75
|
|
|
76
|
+
# A Kolmogorov-Smirnov Test of kll and new_kll should match, even for
|
|
77
|
+
# a fairly small p-value -- cannot reject the null hypothesis that
|
|
78
|
+
# they come from the same distribution (since they do)
|
|
79
|
+
self.assertFalse(ks_test(kll, new_kll, 0.001))
|
|
80
|
+
|
|
81
|
+
|
|
76
82
|
def test_kll_ints_sketch(self):
|
|
77
83
|
k = 100
|
|
78
84
|
n = 10
|
|
@@ -109,10 +115,10 @@ class KllTest(unittest.TestCase):
|
|
|
109
115
|
sk_bytes = kll.serialize()
|
|
110
116
|
self.assertTrue(isinstance(kll_ints_sketch.deserialize(sk_bytes), kll_ints_sketch))
|
|
111
117
|
|
|
112
|
-
def
|
|
113
|
-
# already tested ints and it's templatized, so just make sure it instantiates properly
|
|
118
|
+
def test_kll_doubles_sketch(self):
|
|
119
|
+
# already tested float and ints and it's templatized, so just make sure it instantiates properly
|
|
114
120
|
k = 75
|
|
115
|
-
kll =
|
|
121
|
+
kll = kll_doubles_sketch(k)
|
|
116
122
|
self.assertTrue(kll.is_empty())
|
|
117
123
|
|
|
118
124
|
if __name__ == '__main__':
|