datasketches 0.2.3 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/README.md +8 -8
- data/ext/datasketches/kll_wrapper.cpp +7 -3
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +25 -5
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +6 -5
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +3 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +29 -11
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +5 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +108 -41
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +150 -132
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +165 -31
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +1 -1
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +13 -9
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +656 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1373 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +975 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor.hpp +6 -0
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +30 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +73 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +95 -63
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +74 -3
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +44 -33
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +34 -9
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
- metadata +33 -12
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -22,6 +22,7 @@
|
|
22
22
|
#include <cstring>
|
23
23
|
#include <sstream>
|
24
24
|
#include <fstream>
|
25
|
+
#include <stdexcept>
|
25
26
|
|
26
27
|
#include <kll_sketch.hpp>
|
27
28
|
#include <test_allocator.hpp>
|
@@ -38,9 +39,9 @@ static std::string testBinaryInputPath = "test/";
|
|
38
39
|
#endif
|
39
40
|
|
40
41
|
// typical usage would be just kll_sketch<float> or kll_sketch<std::string>, but here we use test_allocator
|
41
|
-
|
42
|
+
using kll_float_sketch = kll_sketch<float, std::less<float>, serde<float>, test_allocator<float>>;
|
42
43
|
// let std::string use the default allocator for simplicity, otherwise we need to define "less" and "serde"
|
43
|
-
|
44
|
+
using kll_string_sketch = kll_sketch<std::string, std::less<std::string>, serde<std::string>, test_allocator<std::string>>;
|
44
45
|
|
45
46
|
TEST_CASE("kll sketch", "[kll_sketch]") {
|
46
47
|
|
@@ -74,7 +75,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
74
75
|
(void) it; // to suppress "unused" warning
|
75
76
|
FAIL("should be no iterations over an empty sketch");
|
76
77
|
}
|
77
|
-
}
|
78
|
+
}
|
78
79
|
|
79
80
|
SECTION("get bad quantile") {
|
80
81
|
kll_float_sketch sketch(200, 0);
|
@@ -90,7 +91,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
90
91
|
REQUIRE(sketch.get_n() == 1);
|
91
92
|
REQUIRE(sketch.get_num_retained() == 1);
|
92
93
|
REQUIRE(sketch.get_rank(1.0f) == 0.0);
|
94
|
+
REQUIRE(sketch.get_rank<true>(1.0f) == 1.0);
|
93
95
|
REQUIRE(sketch.get_rank(2.0f) == 1.0);
|
96
|
+
REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1.0);
|
94
97
|
REQUIRE(sketch.get_min_value() == 1.0);
|
95
98
|
REQUIRE(sketch.get_max_value() == 1.0);
|
96
99
|
REQUIRE(sketch.get_quantile(0.5) == 1.0);
|
@@ -142,8 +145,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
142
145
|
REQUIRE(quantiles[2] == n - 1 );
|
143
146
|
|
144
147
|
for (uint32_t i = 0; i < n; i++) {
|
145
|
-
const double
|
146
|
-
REQUIRE(sketch.get_rank(static_cast<float>(i)) ==
|
148
|
+
const double true_rank = (double) i / n;
|
149
|
+
REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank);
|
150
|
+
const double true_rank_inclusive = (double) (i + 1) / n;
|
151
|
+
REQUIRE(sketch.get_rank<true>(static_cast<float>(i)) == true_rank_inclusive);
|
147
152
|
}
|
148
153
|
|
149
154
|
// the alternative method must produce the same result
|
@@ -241,20 +246,38 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
241
246
|
sketch.update(static_cast<float>(i));
|
242
247
|
values[i] = static_cast<float>(i);
|
243
248
|
}
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
249
|
+
{ // inclusive=false (default)
|
250
|
+
const auto ranks(sketch.get_CDF(values, n));
|
251
|
+
const auto pmf(sketch.get_PMF(values, n));
|
252
|
+
|
253
|
+
double subtotal_pmf = 0;
|
254
|
+
for (int i = 0; i < n; i++) {
|
255
|
+
if (sketch.get_rank(values[i]) != ranks[i]) {
|
256
|
+
std::cerr << "checking rank vs CDF for value " << i << std::endl;
|
257
|
+
REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
|
258
|
+
}
|
259
|
+
subtotal_pmf += pmf[i];
|
260
|
+
if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
261
|
+
std::cerr << "CDF vs PMF for value " << i << std::endl;
|
262
|
+
REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
|
263
|
+
}
|
253
264
|
}
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
265
|
+
}
|
266
|
+
{ // inclusive=true
|
267
|
+
const auto ranks(sketch.get_CDF<true>(values, n));
|
268
|
+
const auto pmf(sketch.get_PMF<true>(values, n));
|
269
|
+
|
270
|
+
double subtotal_pmf = 0;
|
271
|
+
for (int i = 0; i < n; i++) {
|
272
|
+
if (sketch.get_rank<true>(values[i]) != ranks[i]) {
|
273
|
+
std::cerr << "checking rank vs CDF for value " << i << std::endl;
|
274
|
+
REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
|
275
|
+
}
|
276
|
+
subtotal_pmf += pmf[i];
|
277
|
+
if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
278
|
+
std::cerr << "CDF vs PMF for value " << i << std::endl;
|
279
|
+
REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
|
280
|
+
}
|
258
281
|
}
|
259
282
|
}
|
260
283
|
}
|
@@ -293,7 +316,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
293
316
|
SECTION("bytes serialize deserialize empty") {
|
294
317
|
kll_float_sketch sketch(200, 0);
|
295
318
|
auto bytes = sketch.serialize();
|
296
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
319
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
297
320
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
298
321
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
299
322
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
@@ -311,7 +334,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
311
334
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
312
335
|
sketch.serialize(s);
|
313
336
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
314
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
337
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
315
338
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
316
339
|
REQUIRE(s.tellg() == s.tellp());
|
317
340
|
REQUIRE_FALSE(sketch2.is_empty());
|
@@ -330,7 +353,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
330
353
|
sketch.update(1.0f);
|
331
354
|
auto bytes = sketch.serialize();
|
332
355
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
333
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
356
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
334
357
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
335
358
|
REQUIRE_FALSE(sketch2.is_empty());
|
336
359
|
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
@@ -347,7 +370,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
347
370
|
std::ifstream is;
|
348
371
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
349
372
|
is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
|
350
|
-
auto sketch = kll_float_sketch::deserialize(is,
|
373
|
+
auto sketch = kll_float_sketch::deserialize(is, serde<float>(), 0);
|
351
374
|
REQUIRE_FALSE(sketch.is_empty());
|
352
375
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
353
376
|
REQUIRE(sketch.get_n() == 1);
|
@@ -364,7 +387,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
364
387
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
365
388
|
sketch.serialize(s);
|
366
389
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
367
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
390
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
368
391
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
369
392
|
REQUIRE(s.tellg() == s.tellp());
|
370
393
|
REQUIRE_FALSE(sketch2.is_empty());
|
@@ -382,7 +405,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
382
405
|
sketch.update(3.0f);
|
383
406
|
auto bytes = sketch.serialize();
|
384
407
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
385
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
408
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
386
409
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
387
410
|
REQUIRE_FALSE(sketch2.is_empty());
|
388
411
|
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
@@ -399,7 +422,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
399
422
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
400
423
|
sketch.serialize(s);
|
401
424
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
402
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
425
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
403
426
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
404
427
|
REQUIRE(s.tellg() == s.tellp());
|
405
428
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
@@ -421,7 +444,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
421
444
|
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
|
422
445
|
auto bytes = sketch.serialize();
|
423
446
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
424
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
447
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
425
448
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
426
449
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
427
450
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
@@ -678,7 +701,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
678
701
|
|
679
702
|
auto bytes = sketch1.serialize();
|
680
703
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
681
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
704
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
|
682
705
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
683
706
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
684
707
|
REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
|
@@ -699,7 +722,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
699
722
|
sketch1.update("a");
|
700
723
|
auto bytes = sketch1.serialize();
|
701
724
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
702
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
725
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
|
703
726
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
704
727
|
}
|
705
728
|
|
@@ -766,10 +789,121 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
766
789
|
auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
|
767
790
|
}
|
768
791
|
|
769
|
-
|
770
|
-
|
771
|
-
|
792
|
+
SECTION("sorted view") {
|
793
|
+
kll_sketch<int> kll;
|
794
|
+
kll.update(2);
|
795
|
+
kll.update(3);
|
796
|
+
kll.update(1);
|
797
|
+
|
798
|
+
{ // non-cumulative, using operator->
|
799
|
+
auto view = kll.get_sorted_view(false);
|
800
|
+
REQUIRE(view.size() == 3);
|
801
|
+
auto it = view.begin();
|
802
|
+
REQUIRE(it->first == 1);
|
803
|
+
REQUIRE(it->second == 1);
|
804
|
+
++it;
|
805
|
+
REQUIRE(it->first == 2);
|
806
|
+
REQUIRE(it->second == 1);
|
807
|
+
++it;
|
808
|
+
REQUIRE(it->first == 3);
|
809
|
+
REQUIRE(it->second == 1);
|
810
|
+
}
|
811
|
+
{ // cumulative, non-inclusive, using operator->
|
812
|
+
auto view = kll.get_sorted_view(true);
|
813
|
+
REQUIRE(view.size() == 3);
|
814
|
+
auto it = view.begin();
|
815
|
+
REQUIRE(it->first == 1);
|
816
|
+
REQUIRE(it->second == 0);
|
817
|
+
++it;
|
818
|
+
REQUIRE(it->first == 2);
|
819
|
+
REQUIRE(it->second == 1);
|
820
|
+
++it;
|
821
|
+
REQUIRE(it->first == 3);
|
822
|
+
REQUIRE(it->second == 2);
|
823
|
+
}
|
824
|
+
{ // cumulative, inclusive, using operator*
|
825
|
+
auto view = kll.get_sorted_view<true>(true);
|
826
|
+
REQUIRE(view.size() == 3);
|
827
|
+
auto it = view.begin();
|
828
|
+
REQUIRE((*it).first == 1);
|
829
|
+
REQUIRE((*it).second == 1);
|
830
|
+
++it;
|
831
|
+
REQUIRE((*it).first == 2);
|
832
|
+
REQUIRE((*it).second == 2);
|
833
|
+
++it;
|
834
|
+
REQUIRE((*it).first == 3);
|
835
|
+
REQUIRE((*it).second == 3);
|
836
|
+
}
|
837
|
+
}
|
838
|
+
|
839
|
+
SECTION("type conversion: empty") {
|
840
|
+
kll_sketch<double> kll_double;
|
841
|
+
kll_sketch<float> kll_float(kll_double);
|
842
|
+
REQUIRE(kll_float.is_empty());
|
843
|
+
REQUIRE(kll_float.get_k() == kll_double.get_k());
|
844
|
+
REQUIRE(kll_float.get_n() == 0);
|
845
|
+
REQUIRE(kll_float.get_num_retained() == 0);
|
846
|
+
}
|
847
|
+
|
848
|
+
SECTION("type conversion: over k") {
|
849
|
+
kll_sketch<double> kll_double;
|
850
|
+
for (int i = 0; i < 1000; ++i) kll_double.update(static_cast<double>(i));
|
851
|
+
kll_sketch<float> kll_float(kll_double);
|
852
|
+
REQUIRE(!kll_float.is_empty());
|
853
|
+
REQUIRE(kll_float.get_k() == kll_double.get_k());
|
854
|
+
REQUIRE(kll_float.get_n() == kll_double.get_n());
|
855
|
+
REQUIRE(kll_float.get_num_retained() == kll_double.get_num_retained());
|
856
|
+
|
857
|
+
auto sv_float = kll_float.get_sorted_view(false);
|
858
|
+
auto sv_double = kll_double.get_sorted_view(false);
|
859
|
+
auto sv_float_it = sv_float.begin();
|
860
|
+
auto sv_double_it = sv_double.begin();
|
861
|
+
while (sv_float_it != sv_float.end()) {
|
862
|
+
REQUIRE(sv_double_it != sv_double.end());
|
863
|
+
auto float_pair = *sv_float_it;
|
864
|
+
auto double_pair = *sv_double_it;
|
865
|
+
REQUIRE(float_pair.first == Approx(double_pair.first).margin(0.01));
|
866
|
+
REQUIRE(float_pair.second == double_pair.second);
|
867
|
+
++sv_float_it;
|
868
|
+
++sv_double_it;
|
869
|
+
}
|
870
|
+
REQUIRE(sv_double_it == sv_double.end());
|
772
871
|
}
|
872
|
+
|
873
|
+
class A {
|
874
|
+
int val;
|
875
|
+
public:
|
876
|
+
A(int val): val(val) {}
|
877
|
+
int get_val() const { return val; }
|
878
|
+
};
|
879
|
+
|
880
|
+
struct less_A {
|
881
|
+
bool operator()(const A& a1, const A& a2) const { return a1.get_val() < a2.get_val(); }
|
882
|
+
};
|
883
|
+
|
884
|
+
class B {
|
885
|
+
int val;
|
886
|
+
public:
|
887
|
+
explicit B(const A& a): val(a.get_val()) {}
|
888
|
+
int get_val() const { return val; }
|
889
|
+
};
|
890
|
+
|
891
|
+
struct less_B {
|
892
|
+
bool operator()(const B& b1, const B& b2) const { return b1.get_val() < b2.get_val(); }
|
893
|
+
};
|
894
|
+
|
895
|
+
SECTION("type conversion: custom types") {
|
896
|
+
kll_sketch<A, less_A> sa;
|
897
|
+
sa.update(1);
|
898
|
+
sa.update(2);
|
899
|
+
sa.update(3);
|
900
|
+
|
901
|
+
kll_sketch<B, less_B> sb(sa);
|
902
|
+
REQUIRE(sb.get_n() == 3);
|
903
|
+
}
|
904
|
+
|
905
|
+
// cleanup
|
906
|
+
REQUIRE(test_allocator_total_bytes == 0);
|
773
907
|
}
|
774
908
|
|
775
909
|
} /* namespace datasketches */
|
@@ -46,7 +46,7 @@ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
|
|
46
46
|
sketch1.update(x);
|
47
47
|
sketch2.update(x);
|
48
48
|
}
|
49
|
-
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.
|
49
|
+
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.02));
|
50
50
|
REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
|
51
51
|
}
|
52
52
|
|
@@ -15,7 +15,11 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
-
|
18
|
+
if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0")
|
19
|
+
find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
|
20
|
+
else()
|
21
|
+
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
|
22
|
+
endif()
|
19
23
|
|
20
24
|
# only Windows+MSVC seems to have trouble locating pybind11
|
21
25
|
if (MSVC)
|
@@ -40,6 +44,7 @@ target_link_libraries(python
|
|
40
44
|
theta
|
41
45
|
sampling
|
42
46
|
req
|
47
|
+
quantiles
|
43
48
|
pybind11::module
|
44
49
|
)
|
45
50
|
|
@@ -63,5 +68,7 @@ target_sources(python
|
|
63
68
|
src/theta_wrapper.cpp
|
64
69
|
src/vo_wrapper.cpp
|
65
70
|
src/req_wrapper.cpp
|
71
|
+
src/quantiles_wrapper.cpp
|
72
|
+
src/ks_wrapper.cpp
|
66
73
|
src/vector_of_kll.cpp
|
67
74
|
)
|
@@ -12,21 +12,28 @@ This package provides a variety of sketches as described below. Wherever a speci
|
|
12
12
|
|
13
13
|
## Building and Installation
|
14
14
|
|
15
|
-
Once cloned, the library can be installed by running `
|
15
|
+
Once cloned, the library can be installed by running `python3 -m pip install .` in the project root directory -- not the python subdirectory -- which will also install the necessary dependencies, namely numpy and [pybind11[global]](https://github.com/pybind/pybind11).
|
16
16
|
|
17
|
-
If you prefer to call the `setup.py` build script directly, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`.
|
17
|
+
If you prefer to call the `setup.py` build script directly, which is discouraged, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`.
|
18
18
|
|
19
|
-
The library is also available from PyPI via `
|
19
|
+
The library is also available from PyPI via `python3 -m pip install datasketches`.
|
20
20
|
|
21
21
|
## Usage
|
22
22
|
|
23
23
|
Having installed the library, loading the Apache Datasketches Library in Python is simple: `import datasketches`.
|
24
24
|
|
25
|
+
The unit tests are mostly structured in a tutorial style and can be used as a reference example for how to feed data into and query the different types of sketches.
|
26
|
+
|
25
27
|
## Available Sketch Classes
|
26
28
|
|
27
29
|
- KLL (Absolute Error Quantiles)
|
28
30
|
- `kll_ints_sketch`
|
29
31
|
- `kll_floats_sketch`
|
32
|
+
- `kll_doubles_sketch`
|
33
|
+
- Quantiles (Absolute Error Quantiles, inferior algorithm)
|
34
|
+
- `quantiles_ints_sketch`
|
35
|
+
- `quantiles_floats_sketch`
|
36
|
+
- `quantiles_doubles_sketch`
|
30
37
|
- REQ (Relative Error Quantiles)
|
31
38
|
- `req_ints_sketch`
|
32
39
|
- `req_floats_sketch`
|
@@ -52,6 +59,8 @@ Having installed the library, loading the Apache Datasketches Library in Python
|
|
52
59
|
- Vector of KLL
|
53
60
|
- `vector_of_kll_ints_sketches`
|
54
61
|
- `vector_of_kll_floats_sketches`
|
62
|
+
- Kolmogorov-Smirnov Test
|
63
|
+
- `ks_test` applied to a pair of matched-type Absolute Error quantiles sketches
|
55
64
|
|
56
65
|
## Known Differences from C++
|
57
66
|
|
@@ -67,12 +76,7 @@ The only developer-specific instructions relate to running unit tests.
|
|
67
76
|
|
68
77
|
### Unit tests
|
69
78
|
|
70
|
-
The Python unit tests are run
|
71
|
-
|
72
|
-
```bash
|
73
|
-
python -m pip install --upgrade tox
|
74
|
-
tox
|
75
|
-
```
|
79
|
+
The Python unit tests are run via `tox`, with no arguments, from the project root directory -- not the python subdirectory. Tox creates a temporary virtual environment in which to build and run the unit tests. In the event you are missing the necessary package, tox may be installed with `python3 -m pip install --upgrade tox`.
|
76
80
|
|
77
81
|
## License
|
78
82
|
|
@@ -28,6 +28,8 @@ void init_cpc(py::module& m);
|
|
28
28
|
void init_theta(py::module& m);
|
29
29
|
void init_vo(py::module& m);
|
30
30
|
void init_req(py::module& m);
|
31
|
+
void init_quantiles(py::module& m);
|
32
|
+
void init_kolmogorov_smirnov(py::module& m);
|
31
33
|
void init_vector_of_kll(py::module& m);
|
32
34
|
|
33
35
|
PYBIND11_MODULE(datasketches, m) {
|
@@ -38,5 +40,7 @@ PYBIND11_MODULE(datasketches, m) {
|
|
38
40
|
init_theta(m);
|
39
41
|
init_vo(m);
|
40
42
|
init_req(m);
|
43
|
+
init_quantiles(m);
|
44
|
+
init_kolmogorov_smirnov(m);
|
41
45
|
init_vector_of_kll(m);
|
42
46
|
}
|
@@ -64,6 +64,11 @@ py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
|
|
64
64
|
return list;
|
65
65
|
}
|
66
66
|
|
67
|
+
template<typename T>
|
68
|
+
size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
|
69
|
+
return sk.get_serialized_size_bytes();
|
70
|
+
}
|
71
|
+
|
67
72
|
}
|
68
73
|
}
|
69
74
|
|
@@ -104,7 +109,7 @@ void bind_fi_sketch(py::module &m, const char* name) {
|
|
104
109
|
"Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
|
105
110
|
.def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
|
106
111
|
"Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
|
107
|
-
.def("get_serialized_size_bytes", &
|
112
|
+
.def("get_serialized_size_bytes", &dspy::fi_sketch_get_serialized_size_bytes<T>,
|
108
113
|
"Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
|
109
114
|
.def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
110
115
|
.def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
|
@@ -24,6 +24,7 @@
|
|
24
24
|
#include <pybind11/numpy.h>
|
25
25
|
#include <sstream>
|
26
26
|
#include <vector>
|
27
|
+
#include <stdexcept>
|
27
28
|
|
28
29
|
namespace py = pybind11;
|
29
30
|
|
@@ -50,11 +51,32 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
50
51
|
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
51
52
|
}
|
52
53
|
|
54
|
+
template<typename T>
|
55
|
+
double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
|
56
|
+
if (inclusive)
|
57
|
+
return sk.template get_rank<true>(item);
|
58
|
+
else
|
59
|
+
return sk.template get_rank<false>(item);
|
60
|
+
}
|
61
|
+
|
62
|
+
template<typename T>
|
63
|
+
T kll_sketch_get_quantile(const kll_sketch<T>& sk,
|
64
|
+
double rank,
|
65
|
+
bool inclusive) {
|
66
|
+
if (inclusive)
|
67
|
+
return T(sk.template get_quantile<true>(rank));
|
68
|
+
else
|
69
|
+
return T(sk.template get_quantile<false>(rank));
|
70
|
+
}
|
71
|
+
|
53
72
|
template<typename T>
|
54
73
|
py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
55
|
-
std::vector<double>& fractions
|
74
|
+
std::vector<double>& fractions,
|
75
|
+
bool inclusive) {
|
56
76
|
size_t nQuantiles = fractions.size();
|
57
|
-
auto result =
|
77
|
+
auto result = inclusive ?
|
78
|
+
sk.template get_quantiles<true>(fractions.data(), nQuantiles)
|
79
|
+
: sk.template get_quantiles<false>(fractions.data(), nQuantiles);
|
58
80
|
|
59
81
|
// returning as std::vector<> would copy values to a list anyway
|
60
82
|
py::list list(nQuantiles);
|
@@ -67,9 +89,12 @@ py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
|
67
89
|
|
68
90
|
template<typename T>
|
69
91
|
py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
70
|
-
std::vector<T>& split_points
|
92
|
+
std::vector<T>& split_points,
|
93
|
+
bool inclusive) {
|
71
94
|
size_t nPoints = split_points.size();
|
72
|
-
auto result =
|
95
|
+
auto result = inclusive ?
|
96
|
+
sk.template get_PMF<true>(split_points.data(), nPoints)
|
97
|
+
: sk.template get_PMF<false>(split_points.data(), nPoints);
|
73
98
|
|
74
99
|
py::list list(nPoints + 1);
|
75
100
|
for (size_t i = 0; i <= nPoints; ++i) {
|
@@ -81,9 +106,12 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
81
106
|
|
82
107
|
template<typename T>
|
83
108
|
py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
|
84
|
-
std::vector<T>& split_points
|
109
|
+
std::vector<T>& split_points,
|
110
|
+
bool inclusive) {
|
85
111
|
size_t nPoints = split_points.size();
|
86
|
-
auto result =
|
112
|
+
auto result = inclusive ?
|
113
|
+
sk.template get_CDF<true>(split_points.data(), nPoints)
|
114
|
+
: sk.template get_CDF<false>(split_points.data(), nPoints);
|
87
115
|
|
88
116
|
py::list list(nPoints + 1);
|
89
117
|
for (size_t i = 0; i <= nPoints; ++i) {
|
@@ -142,7 +170,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
142
170
|
"Returns the minimum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
|
143
171
|
.def("get_max_value", &kll_sketch<T>::get_max_value,
|
144
172
|
"Returns the maximum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
|
145
|
-
.def("get_quantile", &
|
173
|
+
.def("get_quantile", &dspy::kll_sketch_get_quantile<T>, py::arg("fraction"), py::arg("inclusive")=false,
|
146
174
|
"Returns an approximation to the value of the data item "
|
147
175
|
"that would be preceded by the given fraction of a hypothetical sorted "
|
148
176
|
"version of the input stream so far.\n"
|
@@ -151,7 +179,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
151
179
|
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
152
180
|
"For kll_floats_sketch: if the sketch is empty this returns nan. "
|
153
181
|
"For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
154
|
-
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"),
|
182
|
+
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"), py::arg("inclusive")=false,
|
155
183
|
"This is a more efficient multiple-query version of get_quantile().\n"
|
156
184
|
"This returns an array that could have been generated by using get_quantile() for each "
|
157
185
|
"fractional rank separately, but would be very inefficient. "
|
@@ -159,12 +187,14 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
159
187
|
"a single query. It is strongly recommend that this method be used instead of multiple calls "
|
160
188
|
"to get_quantile().\n"
|
161
189
|
"If the sketch is empty this returns an empty vector.")
|
162
|
-
.def("get_rank", &
|
190
|
+
.def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
|
163
191
|
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
164
192
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
165
193
|
"get_normalized_rank_error(False) function.\n"
|
194
|
+
"With the parameter inclusive=true the weight of the given value is included into the rank."
|
195
|
+
"Otherwise the rank equals the sum of the weights of values less than the given value.\n"
|
166
196
|
"If the sketch is empty this returns nan.")
|
167
|
-
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"),
|
197
|
+
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
168
198
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
169
199
|
"given a set of split points (values).\n"
|
170
200
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
@@ -172,11 +202,13 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
172
202
|
"If the sketch is empty this returns an empty vector.\n"
|
173
203
|
"split_points is an array of m unique, monotonically increasing float values "
|
174
204
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
175
|
-
"
|
205
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
176
206
|
"exclusive of the right split point, with the exception that the last interval will include "
|
177
207
|
"the maximum value.\n"
|
208
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
209
|
+
"inclusive of the right split point.\n"
|
178
210
|
"It is not necessary to include either the min or max values in these split points.")
|
179
|
-
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"),
|
211
|
+
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
180
212
|
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
181
213
|
"cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
|
182
214
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
@@ -184,9 +216,11 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
184
216
|
"If the sketch is empty this returns an empty vector.\n"
|
185
217
|
"split_points is an array of m unique, monotonically increasing float values "
|
186
218
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
187
|
-
"
|
219
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
188
220
|
"exclusive of the right split point, with the exception that the last interval will include "
|
189
221
|
"the maximum value.\n"
|
222
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
223
|
+
"inclusive of the right split point.\n"
|
190
224
|
"It is not necessary to include either the min or max values in these split points.")
|
191
225
|
.def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error,
|
192
226
|
py::arg("as_pmf"),
|
@@ -208,4 +242,5 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
208
242
|
void init_kll(py::module &m) {
|
209
243
|
bind_kll_sketch<int>(m, "kll_ints_sketch");
|
210
244
|
bind_kll_sketch<float>(m, "kll_floats_sketch");
|
245
|
+
bind_kll_sketch<double>(m, "kll_doubles_sketch");
|
211
246
|
}
|