datasketches 0.2.2 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +8 -8
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +27 -5
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +16 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +30 -12
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +103 -44
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +110 -130
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +156 -23
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +17 -6
- data/vendor/datasketches-cpp/python/README.md +57 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +49 -14
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +12 -5
- data/vendor/datasketches-cpp/python/tests/kll_test.py +12 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +66 -61
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +54 -12
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +45 -34
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +9 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +39 -10
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
- metadata +34 -12
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -24,6 +24,7 @@
|
|
24
24
|
#include <pybind11/numpy.h>
|
25
25
|
#include <sstream>
|
26
26
|
#include <vector>
|
27
|
+
#include <stdexcept>
|
27
28
|
|
28
29
|
namespace py = pybind11;
|
29
30
|
|
@@ -50,11 +51,32 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
50
51
|
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
51
52
|
}
|
52
53
|
|
54
|
+
template<typename T>
|
55
|
+
double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
|
56
|
+
if (inclusive)
|
57
|
+
return sk.template get_rank<true>(item);
|
58
|
+
else
|
59
|
+
return sk.template get_rank<false>(item);
|
60
|
+
}
|
61
|
+
|
62
|
+
template<typename T>
|
63
|
+
T kll_sketch_get_quantile(const kll_sketch<T>& sk,
|
64
|
+
double rank,
|
65
|
+
bool inclusive) {
|
66
|
+
if (inclusive)
|
67
|
+
return T(sk.template get_quantile<true>(rank));
|
68
|
+
else
|
69
|
+
return T(sk.template get_quantile<false>(rank));
|
70
|
+
}
|
71
|
+
|
53
72
|
template<typename T>
|
54
73
|
py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
55
|
-
std::vector<double>& fractions
|
74
|
+
std::vector<double>& fractions,
|
75
|
+
bool inclusive) {
|
56
76
|
size_t nQuantiles = fractions.size();
|
57
|
-
auto result =
|
77
|
+
auto result = inclusive ?
|
78
|
+
sk.template get_quantiles<true>(fractions.data(), nQuantiles)
|
79
|
+
: sk.template get_quantiles<false>(fractions.data(), nQuantiles);
|
58
80
|
|
59
81
|
// returning as std::vector<> would copy values to a list anyway
|
60
82
|
py::list list(nQuantiles);
|
@@ -67,9 +89,12 @@ py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
|
67
89
|
|
68
90
|
template<typename T>
|
69
91
|
py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
70
|
-
std::vector<T>& split_points
|
92
|
+
std::vector<T>& split_points,
|
93
|
+
bool inclusive) {
|
71
94
|
size_t nPoints = split_points.size();
|
72
|
-
auto result =
|
95
|
+
auto result = inclusive ?
|
96
|
+
sk.template get_PMF<true>(split_points.data(), nPoints)
|
97
|
+
: sk.template get_PMF<false>(split_points.data(), nPoints);
|
73
98
|
|
74
99
|
py::list list(nPoints + 1);
|
75
100
|
for (size_t i = 0; i <= nPoints; ++i) {
|
@@ -81,9 +106,12 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
81
106
|
|
82
107
|
template<typename T>
|
83
108
|
py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
|
84
|
-
std::vector<T>& split_points
|
109
|
+
std::vector<T>& split_points,
|
110
|
+
bool inclusive) {
|
85
111
|
size_t nPoints = split_points.size();
|
86
|
-
auto result =
|
112
|
+
auto result = inclusive ?
|
113
|
+
sk.template get_CDF<true>(split_points.data(), nPoints)
|
114
|
+
: sk.template get_CDF<false>(split_points.data(), nPoints);
|
87
115
|
|
88
116
|
py::list list(nPoints + 1);
|
89
117
|
for (size_t i = 0; i <= nPoints; ++i) {
|
@@ -116,7 +144,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
116
144
|
using namespace datasketches;
|
117
145
|
|
118
146
|
py::class_<kll_sketch<T>>(m, name)
|
119
|
-
.def(py::init<uint16_t>(), py::arg("k")=
|
147
|
+
.def(py::init<uint16_t>(), py::arg("k")=kll_constants::DEFAULT_K)
|
120
148
|
.def(py::init<const kll_sketch<T>&>())
|
121
149
|
.def("update", (void (kll_sketch<T>::*)(const T&)) &kll_sketch<T>::update, py::arg("item"),
|
122
150
|
"Updates the sketch with the given value")
|
@@ -142,7 +170,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
142
170
|
"Returns the minimum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
|
143
171
|
.def("get_max_value", &kll_sketch<T>::get_max_value,
|
144
172
|
"Returns the maximum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
|
145
|
-
.def("get_quantile", &
|
173
|
+
.def("get_quantile", &dspy::kll_sketch_get_quantile<T>, py::arg("fraction"), py::arg("inclusive")=false,
|
146
174
|
"Returns an approximation to the value of the data item "
|
147
175
|
"that would be preceded by the given fraction of a hypothetical sorted "
|
148
176
|
"version of the input stream so far.\n"
|
@@ -151,7 +179,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
151
179
|
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
152
180
|
"For kll_floats_sketch: if the sketch is empty this returns nan. "
|
153
181
|
"For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
154
|
-
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"),
|
182
|
+
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"), py::arg("inclusive")=false,
|
155
183
|
"This is a more efficient multiple-query version of get_quantile().\n"
|
156
184
|
"This returns an array that could have been generated by using get_quantile() for each "
|
157
185
|
"fractional rank separately, but would be very inefficient. "
|
@@ -159,12 +187,14 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
159
187
|
"a single query. It is strongly recommend that this method be used instead of multiple calls "
|
160
188
|
"to get_quantile().\n"
|
161
189
|
"If the sketch is empty this returns an empty vector.")
|
162
|
-
.def("get_rank", &
|
190
|
+
.def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
|
163
191
|
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
164
192
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
165
193
|
"get_normalized_rank_error(False) function.\n"
|
194
|
+
"With the parameter inclusive=true the weight of the given value is included into the rank."
|
195
|
+
"Otherwise the rank equals the sum of the weights of values less than the given value.\n"
|
166
196
|
"If the sketch is empty this returns nan.")
|
167
|
-
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"),
|
197
|
+
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
168
198
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
169
199
|
"given a set of split points (values).\n"
|
170
200
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
@@ -172,11 +202,13 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
172
202
|
"If the sketch is empty this returns an empty vector.\n"
|
173
203
|
"split_points is an array of m unique, monotonically increasing float values "
|
174
204
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
175
|
-
"
|
205
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
176
206
|
"exclusive of the right split point, with the exception that the last interval will include "
|
177
207
|
"the maximum value.\n"
|
208
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
209
|
+
"inclusive of the right split point.\n"
|
178
210
|
"It is not necessary to include either the min or max values in these split points.")
|
179
|
-
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"),
|
211
|
+
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
180
212
|
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
181
213
|
"cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
|
182
214
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
@@ -184,9 +216,11 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
184
216
|
"If the sketch is empty this returns an empty vector.\n"
|
185
217
|
"split_points is an array of m unique, monotonically increasing float values "
|
186
218
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
187
|
-
"
|
219
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
188
220
|
"exclusive of the right split point, with the exception that the last interval will include "
|
189
221
|
"the maximum value.\n"
|
222
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
223
|
+
"inclusive of the right split point.\n"
|
190
224
|
"It is not necessary to include either the min or max values in these split points.")
|
191
225
|
.def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error,
|
192
226
|
py::arg("as_pmf"),
|
@@ -208,4 +242,5 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
208
242
|
void init_kll(py::module &m) {
|
209
243
|
bind_kll_sketch<int>(m, "kll_ints_sketch");
|
210
244
|
bind_kll_sketch<float>(m, "kll_floats_sketch");
|
245
|
+
bind_kll_sketch<double>(m, "kll_doubles_sketch");
|
211
246
|
}
|
@@ -0,0 +1,68 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include "kolmogorov_smirnov.hpp"
|
21
|
+
#include "kll_sketch.hpp"
|
22
|
+
#include "quantiles_sketch.hpp"
|
23
|
+
|
24
|
+
#include <pybind11/pybind11.h>
|
25
|
+
|
26
|
+
namespace py = pybind11;
|
27
|
+
|
28
|
+
void init_kolmogorov_smirnov(py::module &m) {
|
29
|
+
using namespace datasketches;
|
30
|
+
|
31
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
32
|
+
"Performs the Kolmogorov-Smirnov Test between kll_ints_sketches.\n"
|
33
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
34
|
+
"this will return false.\n"
|
35
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
36
|
+
"distribution) using the provided p-value, otherwise False.");
|
37
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
38
|
+
"Performs the Kolmogorov-Smirnov Test between kll_floats_sketches.\n"
|
39
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
40
|
+
"this will return false.\n"
|
41
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
42
|
+
"distribution) using the provided p-value, otherwise False.");
|
43
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
44
|
+
"Performs the Kolmogorov-Smirnov Test between kll_doubles_sketches.\n"
|
45
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
46
|
+
"this will return false.\n"
|
47
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
48
|
+
"distribution) using the provided p-value, otherwise False.");
|
49
|
+
|
50
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
51
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_ints_sketches.\n"
|
52
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
53
|
+
"this will return false.\n"
|
54
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
55
|
+
"distribution) using the provided p-value, otherwise False.");
|
56
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
57
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_floats_sketches.\n"
|
58
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
59
|
+
"this will return false.\n"
|
60
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
61
|
+
"distribution) using the provided p-value, otherwise False.");
|
62
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
63
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_doubles_sketches.\n"
|
64
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
65
|
+
"this will return false.\n"
|
66
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
67
|
+
"distribution) using the provided p-value, otherwise False.");
|
68
|
+
}
|
@@ -0,0 +1,240 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include "quantiles_sketch.hpp"
|
21
|
+
|
22
|
+
#include <pybind11/pybind11.h>
|
23
|
+
#include <pybind11/stl.h>
|
24
|
+
#include <pybind11/numpy.h>
|
25
|
+
#include <vector>
|
26
|
+
|
27
|
+
namespace py = pybind11;
|
28
|
+
|
29
|
+
namespace datasketches {
|
30
|
+
|
31
|
+
namespace python {
|
32
|
+
|
33
|
+
template<typename T>
|
34
|
+
quantiles_sketch<T> quantiles_sketch_deserialize(py::bytes sk_bytes) {
|
35
|
+
std::string sk_str = sk_bytes; // implicit cast
|
36
|
+
return quantiles_sketch<T>::deserialize(sk_str.c_str(), sk_str.length());
|
37
|
+
}
|
38
|
+
|
39
|
+
template<typename T>
|
40
|
+
py::object quantiles_sketch_serialize(const quantiles_sketch<T>& sk) {
|
41
|
+
auto ser_result = sk.serialize();
|
42
|
+
return py::bytes((char*)ser_result.data(), ser_result.size());
|
43
|
+
}
|
44
|
+
|
45
|
+
// maybe possible to disambiguate the static vs method rank error calls, but
|
46
|
+
// this is easier for now
|
47
|
+
template<typename T>
|
48
|
+
double quantiles_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
49
|
+
return quantiles_sketch<T>::get_normalized_rank_error(k, pmf);
|
50
|
+
}
|
51
|
+
|
52
|
+
template<typename T>
|
53
|
+
double quantiles_sketch_get_rank(const quantiles_sketch<T>& sk,
|
54
|
+
const T& item,
|
55
|
+
bool inclusive) {
|
56
|
+
if (inclusive)
|
57
|
+
return sk.template get_rank<true>(item);
|
58
|
+
else
|
59
|
+
return sk.template get_rank<false>(item);
|
60
|
+
}
|
61
|
+
|
62
|
+
template<typename T>
|
63
|
+
T quantiles_sketch_get_quantile(const quantiles_sketch<T>& sk,
|
64
|
+
double rank,
|
65
|
+
bool inclusive) {
|
66
|
+
if (inclusive)
|
67
|
+
return T(sk.template get_quantile<true>(rank));
|
68
|
+
else
|
69
|
+
return T(sk.template get_quantile<false>(rank));
|
70
|
+
}
|
71
|
+
|
72
|
+
template<typename T>
|
73
|
+
py::list quantiles_sketch_get_quantiles(const quantiles_sketch<T>& sk,
|
74
|
+
std::vector<double>& fractions,
|
75
|
+
bool inclusive) {
|
76
|
+
size_t n_quantiles = fractions.size();
|
77
|
+
auto result = inclusive
|
78
|
+
? sk.template get_quantiles<true>(&fractions[0], static_cast<uint32_t>(n_quantiles))
|
79
|
+
: sk.template get_quantiles<false>(&fractions[0], static_cast<uint32_t>(n_quantiles));
|
80
|
+
|
81
|
+
// returning as std::vector<> would copy values to a list anyway
|
82
|
+
py::list list(n_quantiles);
|
83
|
+
for (size_t i = 0; i < n_quantiles; ++i) {
|
84
|
+
list[i] = result[i];
|
85
|
+
}
|
86
|
+
|
87
|
+
return list;
|
88
|
+
}
|
89
|
+
|
90
|
+
template<typename T>
|
91
|
+
py::list quantiles_sketch_get_pmf(const quantiles_sketch<T>& sk,
|
92
|
+
std::vector<T>& split_points,
|
93
|
+
bool inclusive) {
|
94
|
+
size_t n_points = split_points.size();
|
95
|
+
auto result = inclusive
|
96
|
+
? sk.template get_PMF<true>(&split_points[0], n_points)
|
97
|
+
: sk.template get_PMF<false>(&split_points[0], n_points);
|
98
|
+
|
99
|
+
py::list list(n_points + 1);
|
100
|
+
for (size_t i = 0; i <= n_points; ++i) {
|
101
|
+
list[i] = result[i];
|
102
|
+
}
|
103
|
+
|
104
|
+
return list;
|
105
|
+
}
|
106
|
+
|
107
|
+
template<typename T>
|
108
|
+
py::list quantiles_sketch_get_cdf(const quantiles_sketch<T>& sk,
|
109
|
+
std::vector<T>& split_points,
|
110
|
+
bool inclusive) {
|
111
|
+
size_t n_points = split_points.size();
|
112
|
+
auto result = inclusive
|
113
|
+
? sk.template get_CDF<true>(&split_points[0], n_points)
|
114
|
+
: sk.template get_CDF<false>(&split_points[0], n_points);
|
115
|
+
|
116
|
+
py::list list(n_points + 1);
|
117
|
+
for (size_t i = 0; i <= n_points; ++i) {
|
118
|
+
list[i] = result[i];
|
119
|
+
}
|
120
|
+
|
121
|
+
return list;
|
122
|
+
}
|
123
|
+
|
124
|
+
template<typename T>
|
125
|
+
void quantiles_sketch_update(quantiles_sketch<T>& sk, py::array_t<T, py::array::c_style | py::array::forcecast> items) {
|
126
|
+
if (items.ndim() != 1) {
|
127
|
+
throw std::invalid_argument("input data must have only one dimension. Found: "
|
128
|
+
+ std::to_string(items.ndim()));
|
129
|
+
}
|
130
|
+
|
131
|
+
auto data = items.template unchecked<1>();
|
132
|
+
for (uint32_t i = 0; i < data.size(); ++i) {
|
133
|
+
sk.update(data(i));
|
134
|
+
}
|
135
|
+
}
|
136
|
+
|
137
|
+
}
|
138
|
+
}
|
139
|
+
|
140
|
+
namespace dspy = datasketches::python;
|
141
|
+
|
142
|
+
template<typename T>
|
143
|
+
void bind_quantiles_sketch(py::module &m, const char* name) {
|
144
|
+
using namespace datasketches;
|
145
|
+
|
146
|
+
py::class_<quantiles_sketch<T>>(m, name)
|
147
|
+
.def(py::init<uint16_t>(), py::arg("k")=quantiles_constants::DEFAULT_K)
|
148
|
+
.def(py::init<const quantiles_sketch<T>&>())
|
149
|
+
.def("update", (void (quantiles_sketch<T>::*)(const T&)) &quantiles_sketch<T>::update, py::arg("item"),
|
150
|
+
"Updates the sketch with the given value")
|
151
|
+
.def("update", &dspy::quantiles_sketch_update<T>, py::arg("array"),
|
152
|
+
"Updates the sketch with the values in the given array")
|
153
|
+
.def("merge", (void (quantiles_sketch<T>::*)(const quantiles_sketch<T>&)) &quantiles_sketch<T>::merge, py::arg("sketch"),
|
154
|
+
"Merges the provided sketch into this one")
|
155
|
+
.def("__str__", &quantiles_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
156
|
+
"Produces a string summary of the sketch")
|
157
|
+
.def("to_string", &quantiles_sketch<T>::to_string, py::arg("print_levels")=false, py::arg("print_items")=false,
|
158
|
+
"Produces a string summary of the sketch")
|
159
|
+
.def("is_empty", &quantiles_sketch<T>::is_empty,
|
160
|
+
"Returns True if the sketch is empty, otherwise False")
|
161
|
+
.def("get_k", &quantiles_sketch<T>::get_k,
|
162
|
+
"Returns the configured parameter k")
|
163
|
+
.def("get_n", &quantiles_sketch<T>::get_n,
|
164
|
+
"Returns the length of the input stream")
|
165
|
+
.def("get_num_retained", &quantiles_sketch<T>::get_num_retained,
|
166
|
+
"Returns the number of retained items (samples) in the sketch")
|
167
|
+
.def("is_estimation_mode", &quantiles_sketch<T>::is_estimation_mode,
|
168
|
+
"Returns True if the sketch is in estimation mode, otherwise False")
|
169
|
+
.def("get_min_value", &quantiles_sketch<T>::get_min_value,
|
170
|
+
"Returns the minimum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
|
171
|
+
.def("get_max_value", &quantiles_sketch<T>::get_max_value,
|
172
|
+
"Returns the maximum value from the stream. If empty, quantiles_floats_sketch returns nan; quantiles_ints_sketch throws a RuntimeError")
|
173
|
+
.def("get_quantile", &dspy::quantiles_sketch_get_quantile<T>, py::arg("rank"), py::arg("inclusive")=false,
|
174
|
+
"Returns an approximation to the value of the data item "
|
175
|
+
"that would be preceded by the given fraction of a hypothetical sorted "
|
176
|
+
"version of the input stream so far.\n"
|
177
|
+
"Note that this method has a fairly large overhead (microseconds instead of nanoseconds) "
|
178
|
+
"so it should not be called multiple times to get different quantiles from the same "
|
179
|
+
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
180
|
+
"For quantiles_floats_sketch: if the sketch is empty this returns nan. "
|
181
|
+
"For quantiles_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
182
|
+
.def("get_quantiles", &dspy::quantiles_sketch_get_quantiles<T>, py::arg("ranks"), py::arg("inclusive")=false,
|
183
|
+
"This is a more efficient multiple-query version of get_quantile().\n"
|
184
|
+
"This returns an array that could have been generated by using get_quantile() for each "
|
185
|
+
"fractional rank separately, but would be very inefficient. "
|
186
|
+
"This method incurs the internal set-up overhead once and obtains multiple quantile values in "
|
187
|
+
"a single query. It is strongly recommended that this method be used instead of multiple calls "
|
188
|
+
"to get_quantile().\n"
|
189
|
+
"If the sketch is empty this returns an empty vector.")
|
190
|
+
.def("get_rank", &dspy::quantiles_sketch_get_rank<T>, py::arg("item"), py::arg("inclusive")=false,
|
191
|
+
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
192
|
+
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
193
|
+
"get_normalized_rank_error(False) function.\n"
|
194
|
+
"If the sketch is empty this returns nan.")
|
195
|
+
.def("get_pmf", &dspy::quantiles_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
196
|
+
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
197
|
+
"given a set of split points (values).\n"
|
198
|
+
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
199
|
+
"get_normalized_rank_error(True) function.\n"
|
200
|
+
"If the sketch is empty this returns an empty vector.\n"
|
201
|
+
"split_points is an array of m unique, monotonically increasing float values "
|
202
|
+
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
203
|
+
"The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
204
|
+
"exclusive of the right split point, with the exception that the last interval will include "
|
205
|
+
"the maximum value.\n"
|
206
|
+
"It is not necessary to include either the min or max values in these split points.")
|
207
|
+
.def("get_cdf", &dspy::quantiles_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
208
|
+
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
209
|
+
"cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
|
210
|
+
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
211
|
+
"get_normalized_rank_error(True) function.\n"
|
212
|
+
"If the sketch is empty this returns an empty vector.\n"
|
213
|
+
"split_points is an array of m unique, monotonically increasing float values "
|
214
|
+
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
215
|
+
"The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
216
|
+
"exclusive of the right split point, with the exception that the last interval will include "
|
217
|
+
"the maximum value.\n"
|
218
|
+
"It is not necessary to include either the min or max values in these split points.")
|
219
|
+
.def("normalized_rank_error", (double (quantiles_sketch<T>::*)(bool) const) &quantiles_sketch<T>::get_normalized_rank_error,
|
220
|
+
py::arg("as_pmf"),
|
221
|
+
"Gets the normalized rank error for this sketch.\n"
|
222
|
+
"If as_pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
|
223
|
+
"Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
|
224
|
+
"Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
|
225
|
+
.def_static("get_normalized_rank_error", &dspy::quantiles_sketch_generic_normalized_rank_error<T>,
|
226
|
+
py::arg("k"), py::arg("as_pmf"),
|
227
|
+
"Gets the normalized rank error given parameters k and the pmf flag.\n"
|
228
|
+
"If as_pmf is True, returns the 'double-sided' normalized rank error for the get_PMF() function.\n"
|
229
|
+
"Otherwise, it is the 'single-sided' normalized rank error for all the other queries.\n"
|
230
|
+
"Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials")
|
231
|
+
.def("serialize", &dspy::quantiles_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
232
|
+
.def_static("deserialize", &dspy::quantiles_sketch_deserialize<T>, "Deserializes the sketch from a bytes object")
|
233
|
+
;
|
234
|
+
}
|
235
|
+
|
236
|
+
void init_quantiles(py::module &m) {
|
237
|
+
bind_quantiles_sketch<int>(m, "quantiles_ints_sketch");
|
238
|
+
bind_quantiles_sketch<float>(m, "quantiles_floats_sketch");
|
239
|
+
bind_quantiles_sketch<double>(m, "quantiles_doubles_sketch");
|
240
|
+
}
|
@@ -24,6 +24,7 @@
|
|
24
24
|
#include <pybind11/numpy.h>
|
25
25
|
#include <sstream>
|
26
26
|
#include <vector>
|
27
|
+
#include <stdexcept>
|
27
28
|
|
28
29
|
namespace py = pybind11;
|
29
30
|
|
@@ -194,6 +195,8 @@ void bind_req_sketch(py::module &m, const char* name) {
|
|
194
195
|
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
195
196
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
196
197
|
"get_normalized_rank_error(False) function.\n"
|
198
|
+
"With the parameter inclusive=true the weight of the given item is included into the rank. "
|
199
|
+
"Otherwise the rank equals the sum of the weights of items less than the given item.\n"
|
197
200
|
"If the sketch is empty this returns nan.")
|
198
201
|
.def("get_pmf", &dspy::req_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
199
202
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
@@ -203,9 +206,11 @@ void bind_req_sketch(py::module &m, const char* name) {
|
|
203
206
|
"If the sketch is empty this returns an empty vector.\n"
|
204
207
|
"split_points is an array of m unique, monotonically increasing float values "
|
205
208
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
206
|
-
"
|
209
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
207
210
|
"exclusive of the right split point, with the exception that the last interval will include "
|
208
211
|
"the maximum value.\n"
|
212
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
213
|
+
"inclusive of the right split point.\n"
|
209
214
|
"It is not necessary to include either the min or max values in these split points.")
|
210
215
|
.def("get_cdf", &dspy::req_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
211
216
|
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
@@ -215,9 +220,11 @@ void bind_req_sketch(py::module &m, const char* name) {
|
|
215
220
|
"If the sketch is empty this returns an empty vector.\n"
|
216
221
|
"split_points is an array of m unique, monotonically increasing float values "
|
217
222
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
218
|
-
"
|
223
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
219
224
|
"exclusive of the right split point, with the exception that the last interval will include "
|
220
225
|
"the maximum value.\n"
|
226
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
227
|
+
"inclusive of the right split point.\n"
|
221
228
|
"It is not necessary to include either the min or max values in these split points.")
|
222
229
|
.def("get_rank_lower_bound", &req_sketch<T>::get_rank_lower_bound, py::arg("rank"), py::arg("num_std_dev"),
|
223
230
|
"Returns an approximate lower bound on the given normalized rank.\n"
|
@@ -103,7 +103,7 @@ void init_theta(py::module &m) {
|
|
103
103
|
|
104
104
|
py::class_<update_theta_sketch, theta_sketch>(m, "update_theta_sketch")
|
105
105
|
.def(py::init(&dspy::update_theta_sketch_factory),
|
106
|
-
py::arg("lg_k")=
|
106
|
+
py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
|
107
107
|
.def(py::init<const update_theta_sketch&>())
|
108
108
|
.def("update", (void (update_theta_sketch::*)(int64_t)) &update_theta_sketch::update, py::arg("datum"),
|
109
109
|
"Updates the sketch with the given integral value")
|
@@ -127,7 +127,7 @@ void init_theta(py::module &m) {
|
|
127
127
|
|
128
128
|
py::class_<theta_union>(m, "theta_union")
|
129
129
|
.def(py::init(&dspy::theta_union_factory),
|
130
|
-
py::arg("lg_k")=
|
130
|
+
py::arg("lg_k")=theta_constants::DEFAULT_LG_K, py::arg("p")=1.0, py::arg("seed")=DEFAULT_SEED)
|
131
131
|
.def("update", &theta_union::update<const theta_sketch&>, py::arg("sketch"),
|
132
132
|
"Updates the union with the given sketch")
|
133
133
|
.def("get_result", &theta_union::get_result, py::arg("ordered")=true,
|
@@ -24,19 +24,26 @@
|
|
24
24
|
#include <pybind11/numpy.h>
|
25
25
|
#include <sstream>
|
26
26
|
#include <vector>
|
27
|
+
#include <stdexcept>
|
27
28
|
|
28
29
|
namespace py = pybind11;
|
29
30
|
|
30
31
|
namespace datasketches {
|
31
32
|
|
33
|
+
namespace vector_of_kll_constants {
|
34
|
+
static const uint32_t DEFAULT_K = kll_constants::DEFAULT_K;
|
35
|
+
static const uint32_t DEFAULT_D = 1;
|
36
|
+
}
|
37
|
+
|
32
38
|
// Wrapper class for Numpy compatibility
|
33
39
|
template <typename T, typename C = std::less<T>, typename S = serde<T>>
|
34
40
|
class vector_of_kll_sketches {
|
35
41
|
public:
|
36
|
-
|
37
|
-
static const uint32_t
|
42
|
+
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
43
|
+
static const uint32_t DEFAULT_K = vector_of_kll_constants::DEFAULT_K;
|
44
|
+
static const uint32_t DEFAULT_D = vector_of_kll_constants::DEFAULT_D;
|
38
45
|
|
39
|
-
explicit vector_of_kll_sketches(uint32_t k = DEFAULT_K, uint32_t d = DEFAULT_D);
|
46
|
+
explicit vector_of_kll_sketches(uint32_t k = vector_of_kll_constants::DEFAULT_K, uint32_t d = vector_of_kll_constants::DEFAULT_D);
|
40
47
|
vector_of_kll_sketches(const vector_of_kll_sketches& other);
|
41
48
|
vector_of_kll_sketches(vector_of_kll_sketches&& other) noexcept;
|
42
49
|
vector_of_kll_sketches<T,C,S>& operator=(const vector_of_kll_sketches& other);
|
@@ -432,8 +439,8 @@ void bind_vector_of_kll_sketches(py::module &m, const char* name) {
|
|
432
439
|
using namespace datasketches;
|
433
440
|
|
434
441
|
py::class_<vector_of_kll_sketches<T>>(m, name)
|
435
|
-
.def(py::init<uint32_t, uint32_t>(), py::arg("k")=
|
436
|
-
py::arg("d")=
|
442
|
+
.def(py::init<uint32_t, uint32_t>(), py::arg("k")=vector_of_kll_constants::DEFAULT_K,
|
443
|
+
py::arg("d")=vector_of_kll_constants::DEFAULT_D)
|
437
444
|
.def(py::init<const vector_of_kll_sketches<T>&>())
|
438
445
|
// allow user to retrieve k or d, in case it's instantiated w/ defaults
|
439
446
|
.def("get_k", &vector_of_kll_sketches<T>::get_k,
|
@@ -16,7 +16,7 @@
|
|
16
16
|
# under the License.
|
17
17
|
|
18
18
|
import unittest
|
19
|
-
from datasketches import kll_ints_sketch, kll_floats_sketch
|
19
|
+
from datasketches import kll_ints_sketch, kll_floats_sketch, kll_doubles_sketch, ks_test
|
20
20
|
import numpy as np
|
21
21
|
|
22
22
|
class KllTest(unittest.TestCase):
|
@@ -30,10 +30,10 @@ class KllTest(unittest.TestCase):
|
|
30
30
|
kll.update(0.0)
|
31
31
|
|
32
32
|
# 0 should be near the median
|
33
|
-
self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.
|
33
|
+
self.assertAlmostEqual(0.5, kll.get_rank(0.0), delta=0.035)
|
34
34
|
|
35
35
|
# the median should be near 0
|
36
|
-
self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.
|
36
|
+
self.assertAlmostEqual(0.0, kll.get_quantile(0.5), delta=0.035)
|
37
37
|
|
38
38
|
# we also track the min/max independently from the rest of the data
|
39
39
|
# which lets us know the full observed data range
|
@@ -73,6 +73,12 @@ class KllTest(unittest.TestCase):
|
|
73
73
|
self.assertEqual(kll.get_quantile(0.7), new_kll.get_quantile(0.7))
|
74
74
|
self.assertEqual(kll.get_rank(0.0), new_kll.get_rank(0.0))
|
75
75
|
|
76
|
+
# A Kolmogorov-Smirnov Test of kll and new_kll should match, even for
|
77
|
+
# a fairly small p-value -- cannot reject the null hypothesis that
|
78
|
+
# they come from the same distribution (since they do)
|
79
|
+
self.assertFalse(ks_test(kll, new_kll, 0.001))
|
80
|
+
|
81
|
+
|
76
82
|
def test_kll_ints_sketch(self):
|
77
83
|
k = 100
|
78
84
|
n = 10
|
@@ -109,10 +115,10 @@ class KllTest(unittest.TestCase):
|
|
109
115
|
sk_bytes = kll.serialize()
|
110
116
|
self.assertTrue(isinstance(kll_ints_sketch.deserialize(sk_bytes), kll_ints_sketch))
|
111
117
|
|
112
|
-
def
|
113
|
-
# already tested ints and it's templatized, so just make sure it instantiates properly
|
118
|
+
def test_kll_doubles_sketch(self):
|
119
|
+
# already tested floats and ints and it's templatized, so just make sure it instantiates properly
|
114
120
|
k = 75
|
115
|
-
kll =
|
121
|
+
kll = kll_doubles_sketch(k)
|
116
122
|
self.assertTrue(kll.is_empty())
|
117
123
|
|
118
124
|
if __name__ == '__main__':
|