datasketches 0.2.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +1 -1
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +1 -1
- data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
- data/vendor/datasketches-cpp/python/README.md +7 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
- metadata +25 -9
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -22,6 +22,7 @@
|
|
22
22
|
#include <cstring>
|
23
23
|
#include <sstream>
|
24
24
|
#include <fstream>
|
25
|
+
#include <stdexcept>
|
25
26
|
|
26
27
|
#include <kll_sketch.hpp>
|
27
28
|
#include <test_allocator.hpp>
|
@@ -90,7 +91,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
90
91
|
REQUIRE(sketch.get_n() == 1);
|
91
92
|
REQUIRE(sketch.get_num_retained() == 1);
|
92
93
|
REQUIRE(sketch.get_rank(1.0f) == 0.0);
|
94
|
+
REQUIRE(sketch.get_rank<true>(1.0f) == 1.0);
|
93
95
|
REQUIRE(sketch.get_rank(2.0f) == 1.0);
|
96
|
+
REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1.0);
|
94
97
|
REQUIRE(sketch.get_min_value() == 1.0);
|
95
98
|
REQUIRE(sketch.get_max_value() == 1.0);
|
96
99
|
REQUIRE(sketch.get_quantile(0.5) == 1.0);
|
@@ -142,8 +145,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
142
145
|
REQUIRE(quantiles[2] == n - 1 );
|
143
146
|
|
144
147
|
for (uint32_t i = 0; i < n; i++) {
|
145
|
-
const double
|
146
|
-
REQUIRE(sketch.get_rank(static_cast<float>(i)) ==
|
148
|
+
const double true_rank = (double) i / n;
|
149
|
+
REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank);
|
150
|
+
const double true_rank_inclusive = (double) (i + 1) / n;
|
151
|
+
REQUIRE(sketch.get_rank<true>(static_cast<float>(i)) == true_rank_inclusive);
|
147
152
|
}
|
148
153
|
|
149
154
|
// the alternative method must produce the same result
|
@@ -241,20 +246,38 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
241
246
|
sketch.update(static_cast<float>(i));
|
242
247
|
values[i] = static_cast<float>(i);
|
243
248
|
}
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
249
|
+
{ // inclusive=false (default)
|
250
|
+
const auto ranks(sketch.get_CDF(values, n));
|
251
|
+
const auto pmf(sketch.get_PMF(values, n));
|
252
|
+
|
253
|
+
double subtotal_pmf = 0;
|
254
|
+
for (int i = 0; i < n; i++) {
|
255
|
+
if (sketch.get_rank(values[i]) != ranks[i]) {
|
256
|
+
std::cerr << "checking rank vs CDF for value " << i << std::endl;
|
257
|
+
REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
|
258
|
+
}
|
259
|
+
subtotal_pmf += pmf[i];
|
260
|
+
if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
261
|
+
std::cerr << "CDF vs PMF for value " << i << std::endl;
|
262
|
+
REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
|
263
|
+
}
|
253
264
|
}
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
265
|
+
}
|
266
|
+
{ // inclusive=true
|
267
|
+
const auto ranks(sketch.get_CDF<true>(values, n));
|
268
|
+
const auto pmf(sketch.get_PMF<true>(values, n));
|
269
|
+
|
270
|
+
double subtotal_pmf = 0;
|
271
|
+
for (int i = 0; i < n; i++) {
|
272
|
+
if (sketch.get_rank<true>(values[i]) != ranks[i]) {
|
273
|
+
std::cerr << "checking rank vs CDF for value " << i << std::endl;
|
274
|
+
REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
|
275
|
+
}
|
276
|
+
subtotal_pmf += pmf[i];
|
277
|
+
if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
|
278
|
+
std::cerr << "CDF vs PMF for value " << i << std::endl;
|
279
|
+
REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
|
280
|
+
}
|
258
281
|
}
|
259
282
|
}
|
260
283
|
}
|
@@ -293,7 +316,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
293
316
|
SECTION("bytes serialize deserialize empty") {
|
294
317
|
kll_float_sketch sketch(200, 0);
|
295
318
|
auto bytes = sketch.serialize();
|
296
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
319
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
297
320
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
298
321
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
299
322
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
@@ -311,7 +334,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
311
334
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
312
335
|
sketch.serialize(s);
|
313
336
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
314
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
337
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
315
338
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
316
339
|
REQUIRE(s.tellg() == s.tellp());
|
317
340
|
REQUIRE_FALSE(sketch2.is_empty());
|
@@ -330,7 +353,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
330
353
|
sketch.update(1.0f);
|
331
354
|
auto bytes = sketch.serialize();
|
332
355
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
333
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
356
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
334
357
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
335
358
|
REQUIRE_FALSE(sketch2.is_empty());
|
336
359
|
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
@@ -347,7 +370,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
347
370
|
std::ifstream is;
|
348
371
|
is.exceptions(std::ios::failbit | std::ios::badbit);
|
349
372
|
is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
|
350
|
-
auto sketch = kll_float_sketch::deserialize(is,
|
373
|
+
auto sketch = kll_float_sketch::deserialize(is, serde<float>(), 0);
|
351
374
|
REQUIRE_FALSE(sketch.is_empty());
|
352
375
|
REQUIRE_FALSE(sketch.is_estimation_mode());
|
353
376
|
REQUIRE(sketch.get_n() == 1);
|
@@ -364,7 +387,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
364
387
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
365
388
|
sketch.serialize(s);
|
366
389
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
367
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
390
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
368
391
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
369
392
|
REQUIRE(s.tellg() == s.tellp());
|
370
393
|
REQUIRE_FALSE(sketch2.is_empty());
|
@@ -382,7 +405,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
382
405
|
sketch.update(3.0f);
|
383
406
|
auto bytes = sketch.serialize();
|
384
407
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
385
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
408
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
386
409
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
387
410
|
REQUIRE_FALSE(sketch2.is_empty());
|
388
411
|
REQUIRE_FALSE(sketch2.is_estimation_mode());
|
@@ -399,7 +422,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
399
422
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
400
423
|
sketch.serialize(s);
|
401
424
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
|
402
|
-
auto sketch2 = kll_float_sketch::deserialize(s,
|
425
|
+
auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
|
403
426
|
REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
|
404
427
|
REQUIRE(s.tellg() == s.tellp());
|
405
428
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
@@ -421,7 +444,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
421
444
|
for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
|
422
445
|
auto bytes = sketch.serialize();
|
423
446
|
REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
|
424
|
-
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
447
|
+
auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
|
425
448
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
426
449
|
REQUIRE(sketch2.is_empty() == sketch.is_empty());
|
427
450
|
REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
|
@@ -678,7 +701,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
678
701
|
|
679
702
|
auto bytes = sketch1.serialize();
|
680
703
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
681
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
704
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
|
682
705
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
683
706
|
REQUIRE(sketch2.is_empty() == sketch1.is_empty());
|
684
707
|
REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
|
@@ -699,7 +722,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
699
722
|
sketch1.update("a");
|
700
723
|
auto bytes = sketch1.serialize();
|
701
724
|
REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
|
702
|
-
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
|
725
|
+
auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
|
703
726
|
REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
|
704
727
|
}
|
705
728
|
|
@@ -766,6 +789,52 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
|
|
766
789
|
auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
|
767
790
|
}
|
768
791
|
|
792
|
+
SECTION("sorted view") {
|
793
|
+
kll_sketch<int> kll;
|
794
|
+
kll.update(2);
|
795
|
+
kll.update(3);
|
796
|
+
kll.update(1);
|
797
|
+
|
798
|
+
{ // non-cumulative, using operator->
|
799
|
+
auto view = kll.get_sorted_view(false);
|
800
|
+
REQUIRE(view.size() == 3);
|
801
|
+
auto it = view.begin();
|
802
|
+
REQUIRE(it->first == 1);
|
803
|
+
REQUIRE(it->second == 1);
|
804
|
+
++it;
|
805
|
+
REQUIRE(it->first == 2);
|
806
|
+
REQUIRE(it->second == 1);
|
807
|
+
++it;
|
808
|
+
REQUIRE(it->first == 3);
|
809
|
+
REQUIRE(it->second == 1);
|
810
|
+
}
|
811
|
+
{ // cumulative, non-inclusive, using operator->
|
812
|
+
auto view = kll.get_sorted_view(true);
|
813
|
+
REQUIRE(view.size() == 3);
|
814
|
+
auto it = view.begin();
|
815
|
+
REQUIRE(it->first == 1);
|
816
|
+
REQUIRE(it->second == 0);
|
817
|
+
++it;
|
818
|
+
REQUIRE(it->first == 2);
|
819
|
+
REQUIRE(it->second == 1);
|
820
|
+
++it;
|
821
|
+
REQUIRE(it->first == 3);
|
822
|
+
REQUIRE(it->second == 2);
|
823
|
+
}
|
824
|
+
{ // cumulative, inclusive, using operator*
|
825
|
+
auto view = kll.get_sorted_view<true>(true);
|
826
|
+
REQUIRE(view.size() == 3);
|
827
|
+
auto it = view.begin();
|
828
|
+
REQUIRE((*it).first == 1);
|
829
|
+
REQUIRE((*it).second == 1);
|
830
|
+
++it;
|
831
|
+
REQUIRE((*it).first == 2);
|
832
|
+
REQUIRE((*it).second == 2);
|
833
|
+
++it;
|
834
|
+
REQUIRE((*it).first == 3);
|
835
|
+
REQUIRE((*it).second == 3);
|
836
|
+
}
|
837
|
+
}
|
769
838
|
// cleanup
|
770
839
|
if (test_allocator_total_bytes != 0) {
|
771
840
|
REQUIRE(test_allocator_total_bytes == 0);
|
@@ -46,7 +46,7 @@ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
|
|
46
46
|
sketch1.update(x);
|
47
47
|
sketch2.update(x);
|
48
48
|
}
|
49
|
-
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.
|
49
|
+
REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.02));
|
50
50
|
REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
|
51
51
|
}
|
52
52
|
|
@@ -15,7 +15,11 @@
|
|
15
15
|
# specific language governing permissions and limitations
|
16
16
|
# under the License.
|
17
17
|
|
18
|
-
|
18
|
+
if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0")
|
19
|
+
find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
|
20
|
+
else()
|
21
|
+
find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
|
22
|
+
endif()
|
19
23
|
|
20
24
|
# only Windows+MSVC seems to have trouble locating pybind11
|
21
25
|
if (MSVC)
|
@@ -40,6 +44,7 @@ target_link_libraries(python
|
|
40
44
|
theta
|
41
45
|
sampling
|
42
46
|
req
|
47
|
+
quantiles
|
43
48
|
pybind11::module
|
44
49
|
)
|
45
50
|
|
@@ -63,5 +68,7 @@ target_sources(python
|
|
63
68
|
src/theta_wrapper.cpp
|
64
69
|
src/vo_wrapper.cpp
|
65
70
|
src/req_wrapper.cpp
|
71
|
+
src/quantiles_wrapper.cpp
|
72
|
+
src/ks_wrapper.cpp
|
66
73
|
src/vector_of_kll.cpp
|
67
74
|
)
|
@@ -27,6 +27,11 @@ Having installed the library, loading the Apache Datasketches Library in Python
|
|
27
27
|
- KLL (Absolute Error Quantiles)
|
28
28
|
- `kll_ints_sketch`
|
29
29
|
- `kll_floats_sketch`
|
30
|
+
- `kll_doubles_sketch`
|
31
|
+
- Quantiles (Absolute Error Quantiles, inferior algorithm)
|
32
|
+
- `quantiles_ints_sketch`
|
33
|
+
- `quantiles_floats_sketch`
|
34
|
+
- `quantiles_doubles_sketch`
|
30
35
|
- REQ (Relative Error Quantiles)
|
31
36
|
- `req_ints_sketch`
|
32
37
|
- `req_floats_sketch`
|
@@ -52,6 +57,8 @@ Having installed the library, loading the Apache Datasketches Library in Python
|
|
52
57
|
- Vector of KLL
|
53
58
|
- `vector_of_kll_ints_sketches`
|
54
59
|
- `vector_of_kll_floats_sketches`
|
60
|
+
- Kolmogorov-Smirnov Test
|
61
|
+
- `ks_test` applied to a pair of matched-type Absolute Error quantiles sketches
|
55
62
|
|
56
63
|
## Known Differences from C++
|
57
64
|
|
@@ -28,6 +28,8 @@ void init_cpc(py::module& m);
|
|
28
28
|
void init_theta(py::module& m);
|
29
29
|
void init_vo(py::module& m);
|
30
30
|
void init_req(py::module& m);
|
31
|
+
void init_quantiles(py::module& m);
|
32
|
+
void init_kolmogorov_smirnov(py::module& m);
|
31
33
|
void init_vector_of_kll(py::module& m);
|
32
34
|
|
33
35
|
PYBIND11_MODULE(datasketches, m) {
|
@@ -38,5 +40,7 @@ PYBIND11_MODULE(datasketches, m) {
|
|
38
40
|
init_theta(m);
|
39
41
|
init_vo(m);
|
40
42
|
init_req(m);
|
43
|
+
init_quantiles(m);
|
44
|
+
init_kolmogorov_smirnov(m);
|
41
45
|
init_vector_of_kll(m);
|
42
46
|
}
|
@@ -64,6 +64,11 @@ py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
|
|
64
64
|
return list;
|
65
65
|
}
|
66
66
|
|
67
|
+
template<typename T>
|
68
|
+
size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
|
69
|
+
return sk.get_serialized_size_bytes();
|
70
|
+
}
|
71
|
+
|
67
72
|
}
|
68
73
|
}
|
69
74
|
|
@@ -104,7 +109,7 @@ void bind_fi_sketch(py::module &m, const char* name) {
|
|
104
109
|
"Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
|
105
110
|
.def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
|
106
111
|
"Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
|
107
|
-
.def("get_serialized_size_bytes", &
|
112
|
+
.def("get_serialized_size_bytes", &dspy::fi_sketch_get_serialized_size_bytes<T>,
|
108
113
|
"Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
|
109
114
|
.def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
|
110
115
|
.def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
|
@@ -24,6 +24,7 @@
|
|
24
24
|
#include <pybind11/numpy.h>
|
25
25
|
#include <sstream>
|
26
26
|
#include <vector>
|
27
|
+
#include <stdexcept>
|
27
28
|
|
28
29
|
namespace py = pybind11;
|
29
30
|
|
@@ -50,11 +51,32 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
|
|
50
51
|
return kll_sketch<T>::get_normalized_rank_error(k, pmf);
|
51
52
|
}
|
52
53
|
|
54
|
+
template<typename T>
|
55
|
+
double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
|
56
|
+
if (inclusive)
|
57
|
+
return sk.template get_rank<true>(item);
|
58
|
+
else
|
59
|
+
return sk.template get_rank<false>(item);
|
60
|
+
}
|
61
|
+
|
62
|
+
template<typename T>
|
63
|
+
T kll_sketch_get_quantile(const kll_sketch<T>& sk,
|
64
|
+
double rank,
|
65
|
+
bool inclusive) {
|
66
|
+
if (inclusive)
|
67
|
+
return T(sk.template get_quantile<true>(rank));
|
68
|
+
else
|
69
|
+
return T(sk.template get_quantile<false>(rank));
|
70
|
+
}
|
71
|
+
|
53
72
|
template<typename T>
|
54
73
|
py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
55
|
-
std::vector<double>& fractions
|
74
|
+
std::vector<double>& fractions,
|
75
|
+
bool inclusive) {
|
56
76
|
size_t nQuantiles = fractions.size();
|
57
|
-
auto result =
|
77
|
+
auto result = inclusive ?
|
78
|
+
sk.template get_quantiles<true>(fractions.data(), nQuantiles)
|
79
|
+
: sk.template get_quantiles<false>(fractions.data(), nQuantiles);
|
58
80
|
|
59
81
|
// returning as std::vector<> would copy values to a list anyway
|
60
82
|
py::list list(nQuantiles);
|
@@ -67,9 +89,12 @@ py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
|
|
67
89
|
|
68
90
|
template<typename T>
|
69
91
|
py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
70
|
-
std::vector<T>& split_points
|
92
|
+
std::vector<T>& split_points,
|
93
|
+
bool inclusive) {
|
71
94
|
size_t nPoints = split_points.size();
|
72
|
-
auto result =
|
95
|
+
auto result = inclusive ?
|
96
|
+
sk.template get_PMF<true>(split_points.data(), nPoints)
|
97
|
+
: sk.template get_PMF<false>(split_points.data(), nPoints);
|
73
98
|
|
74
99
|
py::list list(nPoints + 1);
|
75
100
|
for (size_t i = 0; i <= nPoints; ++i) {
|
@@ -81,9 +106,12 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
|
|
81
106
|
|
82
107
|
template<typename T>
|
83
108
|
py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
|
84
|
-
std::vector<T>& split_points
|
109
|
+
std::vector<T>& split_points,
|
110
|
+
bool inclusive) {
|
85
111
|
size_t nPoints = split_points.size();
|
86
|
-
auto result =
|
112
|
+
auto result = inclusive ?
|
113
|
+
sk.template get_CDF<true>(split_points.data(), nPoints)
|
114
|
+
: sk.template get_CDF<false>(split_points.data(), nPoints);
|
87
115
|
|
88
116
|
py::list list(nPoints + 1);
|
89
117
|
for (size_t i = 0; i <= nPoints; ++i) {
|
@@ -142,7 +170,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
142
170
|
"Returns the minimum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
|
143
171
|
.def("get_max_value", &kll_sketch<T>::get_max_value,
|
144
172
|
"Returns the maximum value from the stream. If empty, kll_floats_sketch returns nan; kll_ints_sketch throws a RuntimeError")
|
145
|
-
.def("get_quantile", &
|
173
|
+
.def("get_quantile", &dspy::kll_sketch_get_quantile<T>, py::arg("fraction"), py::arg("inclusive")=false,
|
146
174
|
"Returns an approximation to the value of the data item "
|
147
175
|
"that would be preceded by the given fraction of a hypothetical sorted "
|
148
176
|
"version of the input stream so far.\n"
|
@@ -151,7 +179,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
151
179
|
"sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
|
152
180
|
"For kll_floats_sketch: if the sketch is empty this returns nan. "
|
153
181
|
"For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
|
154
|
-
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"),
|
182
|
+
.def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"), py::arg("inclusive")=false,
|
155
183
|
"This is a more efficient multiple-query version of get_quantile().\n"
|
156
184
|
"This returns an array that could have been generated by using get_quantile() for each "
|
157
185
|
"fractional rank separately, but would be very inefficient. "
|
@@ -159,12 +187,14 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
159
187
|
"a single query. It is strongly recommended that this method be used instead of multiple calls "
|
160
188
|
"to get_quantile().\n"
|
161
189
|
"If the sketch is empty this returns an empty vector.")
|
162
|
-
.def("get_rank", &
|
190
|
+
.def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
|
163
191
|
"Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
|
164
192
|
"The resulting approximation has a probabilistic guarantee that can be obtained from the "
|
165
193
|
"get_normalized_rank_error(False) function.\n"
|
194
|
+
"With the parameter inclusive=true the weight of the given value is included into the rank. "
|
195
|
+
"Otherwise the rank equals the sum of the weights of values less than the given value.\n"
|
166
196
|
"If the sketch is empty this returns nan.")
|
167
|
-
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"),
|
197
|
+
.def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
168
198
|
"Returns an approximation to the Probability Mass Function (PMF) of the input stream "
|
169
199
|
"given a set of split points (values).\n"
|
170
200
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
@@ -172,11 +202,13 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
172
202
|
"If the sketch is empty this returns an empty vector.\n"
|
173
203
|
"split_points is an array of m unique, monotonically increasing float values "
|
174
204
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
175
|
-
"
|
205
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
176
206
|
"exclusive of the right split point, with the exception that the last interval will include "
|
177
207
|
"the maximum value.\n"
|
208
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
209
|
+
"inclusive of the right split point.\n"
|
178
210
|
"It is not necessary to include either the min or max values in these split points.")
|
179
|
-
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"),
|
211
|
+
.def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
|
180
212
|
"Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
|
181
213
|
"cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
|
182
214
|
"The resulting approximations have a probabilistic guarantee that can be obtained from the "
|
@@ -184,9 +216,11 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
184
216
|
"If the sketch is empty this returns an empty vector.\n"
|
185
217
|
"split_points is an array of m unique, monotonically increasing float values "
|
186
218
|
"that divide the real number line into m+1 consecutive disjoint intervals.\n"
|
187
|
-
"
|
219
|
+
"If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
|
188
220
|
"exclusive of the right split point, with the exception that the last interval will include "
|
189
221
|
"the maximum value.\n"
|
222
|
+
"If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
|
223
|
+
"inclusive of the right split point.\n"
|
190
224
|
"It is not necessary to include either the min or max values in these split points.")
|
191
225
|
.def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error,
|
192
226
|
py::arg("as_pmf"),
|
@@ -208,4 +242,5 @@ void bind_kll_sketch(py::module &m, const char* name) {
|
|
208
242
|
void init_kll(py::module &m) {
|
209
243
|
bind_kll_sketch<int>(m, "kll_ints_sketch");
|
210
244
|
bind_kll_sketch<float>(m, "kll_floats_sketch");
|
245
|
+
bind_kll_sketch<double>(m, "kll_doubles_sketch");
|
211
246
|
}
|
@@ -0,0 +1,68 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#include "kolmogorov_smirnov.hpp"
|
21
|
+
#include "kll_sketch.hpp"
|
22
|
+
#include "quantiles_sketch.hpp"
|
23
|
+
|
24
|
+
#include <pybind11/pybind11.h>
|
25
|
+
|
26
|
+
namespace py = pybind11;
|
27
|
+
|
28
|
+
void init_kolmogorov_smirnov(py::module &m) {
|
29
|
+
using namespace datasketches;
|
30
|
+
|
31
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
32
|
+
"Performs the Kolmogorov-Smirnov Test between kll_ints_sketches.\n"
|
33
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
34
|
+
"this will return false.\n"
|
35
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
36
|
+
"distribution) using the provided p-value, otherwise False.");
|
37
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
38
|
+
"Performs the Kolmogorov-Smirnov Test between kll_floats_sketches.\n"
|
39
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
40
|
+
"this will return false.\n"
|
41
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
42
|
+
"distribution) using the provided p-value, otherwise False.");
|
43
|
+
m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
44
|
+
"Performs the Kolmogorov-Smirnov Test between kll_doubles_sketches.\n"
|
45
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
46
|
+
"this will return false.\n"
|
47
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
48
|
+
"distribution) using the provided p-value, otherwise False.");
|
49
|
+
|
50
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
51
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_ints_sketches.\n"
|
52
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
53
|
+
"this will return false.\n"
|
54
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
55
|
+
"distribution) using the provided p-value, otherwise False.");
|
56
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
57
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_floats_sketches.\n"
|
58
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
59
|
+
"this will return false.\n"
|
60
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
61
|
+
"distribution) using the provided p-value, otherwise False.");
|
62
|
+
m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
|
63
|
+
"Performs the Kolmogorov-Smirnov Test between quantiles_doubles_sketches.\n"
|
64
|
+
"Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
|
65
|
+
"this will return false.\n"
|
66
|
+
"Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
|
67
|
+
"distribution) using the provided p-value, otherwise False.");
|
68
|
+
}
|