datasketches 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +1 -1
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +7 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
- metadata +25 -9
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
|
@@ -22,6 +22,7 @@
|
|
|
22
22
|
#include <cstring>
|
|
23
23
|
#include <sstream>
|
|
24
24
|
#include <fstream>
|
|
25
|
+
#include <stdexcept>
|
|
25
26
|
|
|
26
27
|
#include <kll_sketch.hpp>
|
|
27
28
|
#include <test_allocator.hpp>
|
|
@@ -90,7 +91,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
90
91
|
REQUIRE(sketch.get_n() == 1);
|
|
91
92
|
REQUIRE(sketch.get_num_retained() == 1);
|
|
92
93
|
REQUIRE(sketch.get_rank(1.0f) == 0.0);
|
|
94
|
+
REQUIRE(sketch.get_rank<true>(1.0f) == 1.0);
|
|
93
95
|
REQUIRE(sketch.get_rank(2.0f) == 1.0);
|
|
96
|
+
REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1.0);
|
|
94
97
|
REQUIRE(sketch.get_min_value() == 1.0);
|
|
95
98
|
REQUIRE(sketch.get_max_value() == 1.0);
|
|
96
99
|
REQUIRE(sketch.get_quantile(0.5) == 1.0);
|
|
@@ -142,8 +145,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
142
145
|
REQUIRE(quantiles[2] == n - 1 );
|
|
143
146
|
|
|
144
147
|
for (uint32_t i = 0; i < n; i++) {
|
|
145
|
-
const double
|
|
146
|
-
REQUIRE(sketch.get_rank(static_cast<float>(i)) ==
|
|
148
|
+
const double true_rank = (double) i / n;
|
|
149
|
+
REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank);
|
|
150
|
+
const double true_rank_inclusive = (double) (i + 1) / n;
|
|
151
|
+
REQUIRE(sketch.get_rank<true>(static_cast<float>(i)) == true_rank_inclusive);
|
|
147
152
|
}
|
|
148
153
|
|
|
149
154
|
// the alternative method must produce the same result
|
|
@@ -241,20 +246,38 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
241
246
|
sketch.update(static_cast<float>(i));
|
|
242
247
|
values[i] = static_cast<float>(i);
|
|
243
248
|
}
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
249
|
+
{ // inclusive=false (default)
|
|
250
|
+
const auto ranks(sketch.get_CDF(values, n));
|
|
251
|
+
const auto pmf(sketch.get_PMF(values, n));
|
|
252
|
+
|
|
253
|
+
double subtotal_pmf = 0;
|
|
254
|
+
for (int i = 0; i < n; i++) {
|
|
255
|
+
if (sketch.get_rank(values[i]) != ranks[i]) {
|
|
256
|
+
std::cerr << "checking rank vs CDF for value " << i << std::endl;
|
|
257
|
+
REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
|
|
258
|
+
}
|
|
259
|
+
subtotal_pmf += pmf[i];
|
|
260
|
+
if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
|
261
|
+
std::cerr << "CDF vs PMF for value " << i << std::endl;
|
|
262
|
+
REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
|
|
263
|
+
}
|
|
253
264
|
}
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
265
|
+
}
|
|
266
|
+
{ // inclusive=true
|
|
267
|
+
const auto ranks(sketch.get_CDF<true>(values, n));
|
|
268
|
+
const auto pmf(sketch.get_PMF<true>(values, n));
|
|
269
|
+
|
|
270
|
+
double subtotal_pmf = 0;
|
|
271
|
+
for (int i = 0; i < n; i++) {
|
|
272
|
+
if (sketch.get_rank<true>(values[i]) != ranks[i]) {
|
|
273
|
+
std::cerr << "checking rank vs CDF for value " << i << std::endl;
|
|
274
|
+
REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
|
|
275
|
+
}
|
|
276
|
+
subtotal_pmf += pmf[i];
|
|
277
|
+
if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
|
278
|
+
std::cerr << "CDF vs PMF for value " << i << std::endl;
|
|
279
|
+
REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
|
|
280
|
+
}
|
|
258
281
|
}
|
|
259
282
|
}
|
|
260
283
|
}
|
|
@@ -293,7 +316,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
293
316
|
SECTION("bytes serialize deserialize empty") {
|
|
294
317
|
kll_float_sketch sketch(200, 0);
|
|
295
318
|
auto bytes = sketch.serialize();
|
|
296
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
319
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
|
297
320
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
|
298
321
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
|
299
322
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
|
@@ -311,7 +334,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
311
334
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
312
335
|
sketch.serialize(s);
|
|
313
336
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
|
314
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
|
337
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
|
315
338
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
|
316
339
|
REQUIRE(s.tellg() == s.tellp());
|
|
317
340
|
REQUIRE_FALSE(sketch2.is_empty());
|
|
@@ -330,7 +353,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
330
353
|
sketch.update(1.0f);
|
|
331
354
|
auto bytes = sketch.serialize();
|
|
332
355
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
|
333
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
356
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
|
334
357
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
|
335
358
|
REQUIRE_FALSE(sketch2.is_empty());
|
|
336
359
|
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
|
@@ -347,7 +370,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
347
370
|
std::ifstream is;
|
|
348
371
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
349
372
|
is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
|
|
350
|
-
auto sketch = kll_float_sketch::deserialize(is,
|
|
373
|
+
auto sketch = kll_float_sketch::deserialize(is, serde<float>(), 0);
|
|
351
374
|
REQUIRE_FALSE(sketch.is_empty());
|
|
352
375
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
|
353
376
|
REQUIRE(sketch.get_n() == 1);
|
|
@@ -364,7 +387,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
364
387
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
365
388
|
sketch.serialize(s);
|
|
366
389
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
|
367
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
|
390
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
|
368
391
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
|
369
392
|
REQUIRE(s.tellg() == s.tellp());
|
|
370
393
|
REQUIRE_FALSE(sketch2.is_empty());
|
|
@@ -382,7 +405,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
382
405
|
sketch.update(3.0f);
|
|
383
406
|
auto bytes = sketch.serialize();
|
|
384
407
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
|
385
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
408
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
|
386
409
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
|
387
410
|
REQUIRE_FALSE(sketch2.is_empty());
|
|
388
411
|
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
|
@@ -399,7 +422,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
399
422
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
400
423
|
sketch.serialize(s);
|
|
401
424
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
|
402
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
|
425
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
|
403
426
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
|
404
427
|
REQUIRE(s.tellg() == s.tellp());
|
|
405
428
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
|
@@ -421,7 +444,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
421
444
|
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
|
|
422
445
|
auto bytes = sketch.serialize();
|
|
423
446
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
|
424
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
447
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
|
425
448
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
|
426
449
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
|
427
450
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
|
@@ -678,7 +701,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
678
701
|
|
|
679
702
|
auto bytes = sketch1.serialize();
|
|
680
703
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
|
681
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
704
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
|
|
682
705
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
|
683
706
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
|
684
707
|
REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
|
|
@@ -699,7 +722,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
699
722
|
sketch1.update("a");
|
|
700
723
|
auto bytes = sketch1.serialize();
|
|
701
724
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
|
702
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
|
725
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
|
|
703
726
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
|
704
727
|
}
|
|
705
728
|
|
|
@@ -766,6 +789,52 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
|
766
789
|
auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
|
|
767
790
|
}
|
|
768
791
|
|
|
792
|
+
SECTION("sorted view") {
|
|
793
|
+
kll_sketch<int> kll;
|
|
794
|
+
kll.update(2);
|
|
795
|
+
kll.update(3);
|
|
796
|
+
kll.update(1);
|
|
797
|
+
|
|
798
|
+
{ // non-cumulative, using operator->
|
|
799
|
+
auto view = kll.get_sorted_view(false);
|
|
800
|
+
REQUIRE(view.size() == 3);
|
|
801
|
+
auto it = view.begin();
|
|
802
|
+
REQUIRE(it->first == 1);
|
|
803
|
+
REQUIRE(it->second == 1);
|
|
804
|
+
++it;
|
|
805
|
+
REQUIRE(it->first == 2);
|
|
806
|
+
REQUIRE(it->second == 1);
|
|
807
|
+
++it;
|
|
808
|
+
REQUIRE(it->first == 3);
|
|
809
|
+
REQUIRE(it->second == 1);
|
|
810
|
+
}
|
|
811
|
+
{ // cumulative, non-inclusive, using operator->
|
|
812
|
+
auto view = kll.get_sorted_view(true);
|
|
813
|
+
REQUIRE(view.size() == 3);
|
|
814
|
+
auto it = view.begin();
|
|
815
|
+
REQUIRE(it->first == 1);
|
|
816
|
+
REQUIRE(it->second == 0);
|
|
817
|
+
++it;
|
|
818
|
+
REQUIRE(it->first == 2);
|
|
819
|
+
REQUIRE(it->second == 1);
|
|
820
|
+
++it;
|
|
821
|
+
REQUIRE(it->first == 3);
|
|
822
|
+
REQUIRE(it->second == 2);
|
|
823
|
+
}
|
|
824
|
+
{ // cumulative, inclusive, using operator*
|
|
825
|
+
auto view = kll.get_sorted_view<true>(true);
|
|
826
|
+
REQUIRE(view.size() == 3);
|
|
827
|
+
auto it = view.begin();
|
|
828
|
+
REQUIRE((*it).first == 1);
|
|
829
|
+
REQUIRE((*it).second == 1);
|
|
830
|
+
++it;
|
|
831
|
+
REQUIRE((*it).first == 2);
|
|
832
|
+
REQUIRE((*it).second == 2);
|
|
833
|
+
++it;
|
|
834
|
+
REQUIRE((*it).first == 3);
|
|
835
|
+
REQUIRE((*it).second == 3);
|
|
836
|
+
}
|
|
837
|
+
}
|
|
769
838
|
// cleanup
|
|
770
839
|
if (test_allocator_total_bytes != 0) {
|
|
771
840
|
REQUIRE(test_allocator_total_bytes == 0);
|
|
@@ -46,7 +46,7 @@ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
|
|
|
46
46
|
sketch1.update(x);
|
|
47
47
|
sketch2.update(x);
|
|
48
48
|
}
|
|
49
|
-
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.
|
|
49
|
+
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.02));
|
|
50
50
|
REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
|
|
51
51
|
}
|
|
52
52
|
|
|
@@ -15,7 +15,11 @@
|
|
|
15
15
|
# specific language governing permissions and limitations
|
|
16
16
|
# under the License.
|
|
17
17
|
|
|
18
|
-
|
|
18
|
+
if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0")
|
|
19
|
+
find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
|
|
20
|
+
else()
|
|
21
|
+
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
|
|
22
|
+
endif()
|
|
19
23
|
|
|
20
24
|
# only Windows+MSVC seems to have trouble locating pybind11
|
|
21
25
|
if (MSVC)
|
|
@@ -40,6 +44,7 @@ target_link_libraries(python
|
|
|
40
44
|
theta
|
|
41
45
|
sampling
|
|
42
46
|
req
|
|
47
|
+
quantiles
|
|
43
48
|
pybind11::module
|
|
44
49
|
)
|
|
45
50
|
|
|
@@ -63,5 +68,7 @@ target_sources(python
|
|
|
63
68
|
src/theta_wrapper.cpp
|
|
64
69
|
src/vo_wrapper.cpp
|
|
65
70
|
src/req_wrapper.cpp
|
|
71
|
+
src/quantiles_wrapper.cpp
|
|
72
|
+
src/ks_wrapper.cpp
|
|
66
73
|
src/vector_of_kll.cpp
|
|
67
74
|
)
|
|
@@ -27,6 +27,11 @@ Having installed the library, loading the Apache Datasketches Library in Python
|
|
|
27
27
|
- KLL (Absolute Error Quantiles)
|
|
28
28
|
- `kll_ints_sketch`
|
|
29
29
|
- `kll_floats_sketch`
|
|
30
|
+
- `kll_doubles_sketch`
|
|
31
|
+
- Quantiles (Absolute Error Quantiles, inferior algorithm)
|
|
32
|
+
- `quantiles_ints_sketch`
|
|
33
|
+
- `quantiles_floats_sketch`
|
|
34
|
+
- `quantiles_doubles_sketch`
|
|
30
35
|
- REQ (Relative Error Quantiles)
|
|
31
36
|
- `req_ints_sketch`
|
|
32
37
|
- `req_floats_sketch`
|
|
@@ -52,6 +57,8 @@ Having installed the library, loading the Apache Datasketches Library in Python
|
|
|
52
57
|
- Vector of KLL
|
|
53
58
|
- `vector_of_kll_ints_sketches`
|
|
54
59
|
- `vector_of_kll_floats_sketches`
|
|
60
|
+
- Kolmogorov-Smirnov Test
|
|
61
|
+
- `ks_test` applied to a pair of matched-type Absolute Error quantiles sketches
|
|
55
62
|
|
|
56
63
|
## Known Differences from C++
|
|
57
64
|
|
|
@@ -28,6 +28,8 @@ void init_cpc(py::module& m);
|
|
|
28
28
|
void init_theta(py::module& m);
|
|
29
29
|
void init_vo(py::module& m);
|
|
30
30
|
void init_req(py::module& m);
|
|
31
|
+
void init_quantiles(py::module& m);
|
|
32
|
+
void init_kolmogorov_smirnov(py::module& m);
|
|
31
33
|
void init_vector_of_kll(py::module& m);
|
|
32
34
|
|
|
33
35
|
PYBIND11_MODULE(datasketches, m) {
|
|
@@ -38,5 +40,7 @@ PYBIND11_MODULE(datasketches, m) {
|
|
|
38
40
|
init_theta(m);
|
|
39
41
|
init_vo(m);
|
|
40
42
|
init_req(m);
|
|
43
|
+
init_quantiles(m);
|
|
44
|
+
init_kolmogorov_smirnov(m);
|
|
41
45
|
init_vector_of_kll(m);
|
|
42
46
|
}
|
|
@@ -64,6 +64,11 @@ py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
|
|
|
64
64
|
return list;
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
+
template<typename T>
|
|
68
|
+
size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
|
|
69
|
+
return sk.get_serialized_size_bytes();
|
|
70
|
+
}
|
|
71
|
+
|
|
67
72
|
}
|
|
68
73
|
}
|
|
69
74
|
|
|
@@ -104,7 +109,7 @@ void bind_fi_sketch(py::module &m, const char* name) {
|
|
|
104
109
|
"Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
|
|
105
110
|
.def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
|
|
106
111
|
"Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
|
|
107
|
-
.def("get_serialized_size_bytes", &
|
|
112
|
+
.def("get_serialized_size_bytes", &dspy::fi_sketch_get_serialized_size_bytes<T>,
|
|
108
113
|
"Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
|
|
109
114
|
.def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
|
110
115
|
.def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
#include <pybind11/numpy.h>
|
|
25
25
|
#include <sstream>
|
|
26
26
|
#include <vector>
|
|
27
|
+
#include <stdexcept>
|
|
27
28
|
|
|
28
29
|
namespace py = pybind11;
|
|
29
30
|
|
|
@@ -50,11 +51,32 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
|
50
51
|
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
|
51
52
|
}
|
|
52
53
|
|
|
54
|
+
template<typename T>
|
|
55
|
+
double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
|
|
56
|
+
if (inclusive)
|
|
57
|
+
return sk.template get_rank<true>(item);
|
|
58
|
+
else
|
|
59
|
+
return sk.template get_rank<false>(item);
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
template<typename T>
|
|
63
|
+
T kll_sketch_get_quantile(const kll_sketch<T>& sk,
|
|
64
|
+
double rank,
|
|
65
|
+
bool inclusive) {
|
|
66
|
+
if (inclusive)
|
|
67
|
+
return T(sk.template get_quantile<true>(rank));
|
|
68
|
+
else
|
|
69
|
+
return T(sk.template get_quantile<false>(rank));
|
|
70
|
+
}
|
|
71
|
+
|
|
53
72
|
template<typename T>
|
|
54
73
|
py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
|
55
|
-
std::vector<double>& fractions
|
|
74
|
+
std::vector<double>& fractions,
|
|
75
|
+
bool inclusive) {
|
|
56
76
|
size_t nQuantiles = fractions.size();
|
|
57
|
-
auto result =
|
|
77
|
+
auto result = inclusive ?
|
|
78
|
+
sk.template get_quantiles<true>(fractions.data(), nQuantiles)
|
|
79
|
+
: sk.template get_quantiles<false>(fractions.data(), nQuantiles);
|
|
58
80
|
|
|
59
81
|
// returning as std::vector<> would copy values to a list anyway
|
|
60
82
|
py::list list(nQuantiles);
|
|
@@ -67,9 +89,12 @@ py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
|
|
67
89
|
|
|
68
90
|
template<typename T>
|
|
69
91
|
py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
70
|
-
std::vector<T>& split_points
|
|
92
|
+
std::vector<T>& split_points,
|
|
93
|
+
bool inclusive) {
|
|
71
94
|
size_t nPoints = split_points.size();
|
|
72
|
-
auto result =
|
|
95
|
+
auto result = inclusive ?
|
|
96
|
+
sk.template get_PMF<true>(split_points.data(), nPoints)
|
|
97
|
+
: sk.template get_PMF<false>(split_points.data(), nPoints);
|
|
73
98
|
|
|
74
99
|
py::list list(nPoints + 1);
|
|
75
100
|
for (size_t i = 0; i <= nPoints; ++i) {
|
|
@@ -81,9 +106,12 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
|
81
106
|
|
|
82
107
|
template<typename T>
|
|
83
108
|
py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
|
|
84
|
-
std::vector<T>& split_points
|
|
109
|
+
std::vector<T>& split_points,
|
|
110
|
+
bool inclusive) {
|
|
85
111
|
size_t nPoints = split_points.size();
|
|
86
|
-
auto result =
|
|
112
|
+
auto result = inclusive ?
|
|
113
|
+
sk.template get_CDF<true>(split_points.data(), nPoints)
|
|
114
|
+
: sk.template get_CDF<false>(split_points.data(), nPoints);
|
|
87
115
|
|
|
88
116
|
py::list list(nPoints + 1);
|
|
89
117
|
for (size_t i = 0; i <= nPoints; ++i) {
|
|
@@ -142,7 +170,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
142
170
|
"Returns the minimum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
|
|
143
171
|
.def("get_max_value", &kll_sketch<T>::get_max_value,
|
|
144
172
|
"Returns the maximum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
|
|
145
|
-
.def("get_quantile", &
|
|
173
|
+
.def("get_quantile", &dspy::kll_sketch_get_quantile<T>, py::arg("fraction"), py::arg("inclusive")=false,
|
|
146
174
|
"Returns an approximation to the value of the data item "
|
|
147
175
|
"that would be preceded by the given fraction of a hypothetical sorted "
|
|
148
176
|
"version of the input stream so far.\n"
|
|
@@ -151,7 +179,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
151
179
|
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
|
152
180
|
"For kll_floats_sketch: if the sketch is empty this returns nan. "
|
|
153
181
|
"For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
|
154
|
-
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"),
|
|
182
|
+
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"), py::arg("inclusive")=false,
|
|
155
183
|
"This is a more efficient multiple-query version of get_quantile().\n"
|
|
156
184
|
"This returns an array that could have been generated by using get_quantile() for each "
|
|
157
185
|
"fractional rank separately, but would be very inefficient. "
|
|
@@ -159,12 +187,14 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
159
187
|
"a single query. It is strongly recommend that this method be used instead of multiple calls "
|
|
160
188
|
"to get_quantile().\n"
|
|
161
189
|
"If the sketch is empty this returns an empty vector.")
|
|
162
|
-
.def("get_rank", &
|
|
190
|
+
.def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
|
|
163
191
|
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
|
164
192
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
|
165
193
|
"get_normalized_rank_error(False) function.\n"
|
|
194
|
+
"With the parameter inclusive=true the weight of the given value is included into the rank."
|
|
195
|
+
"Otherwise the rank equals the sum of the weights of values less than the given value.\n"
|
|
166
196
|
"If the sketch is empty this returns nan.")
|
|
167
|
-
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"),
|
|
197
|
+
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
168
198
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
|
169
199
|
"given a set of split points (values).\n"
|
|
170
200
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
|
@@ -172,11 +202,13 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
172
202
|
"If the sketch is empty this returns an empty vector.\n"
|
|
173
203
|
"split_points is an array of m unique, monotonically increasing float values "
|
|
174
204
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
175
|
-
"
|
|
205
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
176
206
|
"exclusive of the right split point, with the exception that the last interval will include "
|
|
177
207
|
"the maximum value.\n"
|
|
208
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
|
209
|
+
"inclusive of the right split point.\n"
|
|
178
210
|
"It is not necessary to include either the min or max values in these split points.")
|
|
179
|
-
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"),
|
|
211
|
+
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
|
180
212
|
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
|
181
213
|
"cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
|
|
182
214
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
|
@@ -184,9 +216,11 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
184
216
|
"If the sketch is empty this returns an empty vector.\n"
|
|
185
217
|
"split_points is an array of m unique, monotonically increasing float values "
|
|
186
218
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
|
187
|
-
"
|
|
219
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
|
188
220
|
"exclusive of the right split point, with the exception that the last interval will include "
|
|
189
221
|
"the maximum value.\n"
|
|
222
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
|
223
|
+
"inclusive of the right split point.\n"
|
|
190
224
|
"It is not necessary to include either the min or max values in these split points.")
|
|
191
225
|
.def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error,
|
|
192
226
|
py::arg("as_pmf"),
|
|
@@ -208,4 +242,5 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
|
208
242
|
void init_kll(py::module &m) {
|
|
209
243
|
bind_kll_sketch<int>(m, "kll_ints_sketch");
|
|
210
244
|
bind_kll_sketch<float>(m, "kll_floats_sketch");
|
|
245
|
+
bind_kll_sketch<double>(m, "kll_doubles_sketch");
|
|
211
246
|
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "kolmogorov_smirnov.hpp"
|
|
21
|
+
#include "kll_sketch.hpp"
|
|
22
|
+
#include "quantiles_sketch.hpp"
|
|
23
|
+
|
|
24
|
+
#include <pybind11/pybind11.h>
|
|
25
|
+
|
|
26
|
+
namespace py = pybind11;
|
|
27
|
+
|
|
28
|
+
void init_kolmogorov_smirnov(py::module &m) {
|
|
29
|
+
using namespace datasketches;
|
|
30
|
+
|
|
31
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
32
|
+
"Performs the Kolmogorov-Smirnov Test between kll_ints_sketches.\n"
|
|
33
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
34
|
+
"this will return false.\n"
|
|
35
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
36
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
37
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
38
|
+
"Performs the Kolmogorov-Smirnov Test between kll_floats_sketches.\n"
|
|
39
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
40
|
+
"this will return false.\n"
|
|
41
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
42
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
43
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
44
|
+
"Performs the Kolmogorov-Smirnov Test between kll_doubles_sketches.\n"
|
|
45
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
46
|
+
"this will return false.\n"
|
|
47
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
48
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
49
|
+
|
|
50
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
51
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_ints_sketches.\n"
|
|
52
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
53
|
+
"this will return false.\n"
|
|
54
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
55
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
56
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
57
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_floats_sketches.\n"
|
|
58
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
59
|
+
"this will return false.\n"
|
|
60
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
61
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
62
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
|
63
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_doubles_sketches.\n"
|
|
64
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
|
65
|
+
"this will return false.\n"
|
|
66
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
|
67
|
+
"distribution) using the provided p-value, otherwise False.");
|
|
68
|
+
}
|