datasketches 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +1 -1
  4. data/ext/datasketches/kll_wrapper.cpp +5 -1
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +4 -3
  7. data/vendor/datasketches-cpp/common/CMakeLists.txt +4 -0
  8. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  9. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  10. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  11. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  12. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  13. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  14. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  15. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  16. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  17. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +2 -0
  18. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  19. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  20. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  21. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +25 -9
  22. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  23. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  24. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  25. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  26. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  27. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  28. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  29. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  30. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  31. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  32. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  33. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  34. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  35. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  36. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  37. data/vendor/datasketches-cpp/kll/CMakeLists.txt +0 -4
  38. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  39. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  40. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +96 -42
  41. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +105 -127
  42. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +94 -25
  43. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  44. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  45. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  46. data/vendor/datasketches-cpp/python/README.md +7 -0
  47. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  48. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  49. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  50. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  51. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  52. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  53. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  54. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  55. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  56. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  57. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  58. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  59. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  60. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  61. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  62. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  63. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  64. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  65. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  66. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  67. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  68. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  69. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  70. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  71. data/vendor/datasketches-cpp/req/CMakeLists.txt +0 -2
  72. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  73. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  74. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  75. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +62 -59
  76. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  77. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  78. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +31 -26
  79. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  80. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +25 -9
  81. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  82. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  83. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  84. data/vendor/datasketches-cpp/setup.py +1 -1
  85. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  86. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +8 -6
  87. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +1 -0
  88. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -0
  89. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +7 -45
  90. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -0
  91. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +1 -0
  92. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  93. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  94. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +1 -0
  95. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +29 -1
  96. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +2 -0
  97. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +16 -0
  98. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +1 -0
  99. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  100. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  101. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +1 -0
  102. metadata +25 -9
  103. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  104. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  105. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  106. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
@@ -22,6 +22,7 @@
22
22
  #include <cstring>
23
23
  #include <sstream>
24
24
  #include <fstream>
25
+ #include <stdexcept>
25
26
 
26
27
  #include <kll_sketch.hpp>
27
28
  #include <test_allocator.hpp>
@@ -90,7 +91,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
90
91
  REQUIRE(sketch.get_n() == 1);
91
92
  REQUIRE(sketch.get_num_retained() == 1);
92
93
  REQUIRE(sketch.get_rank(1.0f) == 0.0);
94
+ REQUIRE(sketch.get_rank<true>(1.0f) == 1.0);
93
95
  REQUIRE(sketch.get_rank(2.0f) == 1.0);
96
+ REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1.0);
94
97
  REQUIRE(sketch.get_min_value() == 1.0);
95
98
  REQUIRE(sketch.get_max_value() == 1.0);
96
99
  REQUIRE(sketch.get_quantile(0.5) == 1.0);
@@ -142,8 +145,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
142
145
  REQUIRE(quantiles[2] == n - 1 );
143
146
 
144
147
  for (uint32_t i = 0; i < n; i++) {
145
- const double trueRank = (double) i / n;
146
- REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
148
+ const double true_rank = (double) i / n;
149
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank);
150
+ const double true_rank_inclusive = (double) (i + 1) / n;
151
+ REQUIRE(sketch.get_rank<true>(static_cast<float>(i)) == true_rank_inclusive);
147
152
  }
148
153
 
149
154
  // the alternative method must produce the same result
@@ -241,20 +246,38 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
241
246
  sketch.update(static_cast<float>(i));
242
247
  values[i] = static_cast<float>(i);
243
248
  }
244
-
245
- const auto ranks(sketch.get_CDF(values, n));
246
- const auto pmf(sketch.get_PMF(values, n));
247
-
248
- double subtotal_pmf(0);
249
- for (int i = 0; i < n; i++) {
250
- if (sketch.get_rank(values[i]) != ranks[i]) {
251
- std::cerr << "checking rank vs CDF for value " << i << std::endl;
252
- REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
249
+ { // inclusive=false (default)
250
+ const auto ranks(sketch.get_CDF(values, n));
251
+ const auto pmf(sketch.get_PMF(values, n));
252
+
253
+ double subtotal_pmf = 0;
254
+ for (int i = 0; i < n; i++) {
255
+ if (sketch.get_rank(values[i]) != ranks[i]) {
256
+ std::cerr << "checking rank vs CDF for value " << i << std::endl;
257
+ REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
258
+ }
259
+ subtotal_pmf += pmf[i];
260
+ if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
261
+ std::cerr << "CDF vs PMF for value " << i << std::endl;
262
+ REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
263
+ }
253
264
  }
254
- subtotal_pmf += pmf[i];
255
- if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
256
- std::cerr << "CDF vs PMF for value " << i << std::endl;
257
- REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
265
+ }
266
+ { // inclusive=true
267
+ const auto ranks(sketch.get_CDF<true>(values, n));
268
+ const auto pmf(sketch.get_PMF<true>(values, n));
269
+
270
+ double subtotal_pmf = 0;
271
+ for (int i = 0; i < n; i++) {
272
+ if (sketch.get_rank<true>(values[i]) != ranks[i]) {
273
+ std::cerr << "checking rank vs CDF for value " << i << std::endl;
274
+ REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
275
+ }
276
+ subtotal_pmf += pmf[i];
277
+ if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
278
+ std::cerr << "CDF vs PMF for value " << i << std::endl;
279
+ REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
280
+ }
258
281
  }
259
282
  }
260
283
  }
@@ -293,7 +316,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
293
316
  SECTION("bytes serialize deserialize empty") {
294
317
  kll_float_sketch sketch(200, 0);
295
318
  auto bytes = sketch.serialize();
296
- auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
319
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
297
320
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
298
321
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
299
322
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
@@ -311,7 +334,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
311
334
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
312
335
  sketch.serialize(s);
313
336
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
314
- auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
337
+ auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
315
338
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
316
339
  REQUIRE(s.tellg() == s.tellp());
317
340
  REQUIRE_FALSE(sketch2.is_empty());
@@ -330,7 +353,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
330
353
  sketch.update(1.0f);
331
354
  auto bytes = sketch.serialize();
332
355
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
333
- auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
356
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
334
357
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
335
358
  REQUIRE_FALSE(sketch2.is_empty());
336
359
  REQUIRE_FALSE(sketch2.is_estimation_mode());
@@ -347,7 +370,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
347
370
  std::ifstream is;
348
371
  is.exceptions(std::ios::failbit | std::ios::badbit);
349
372
  is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
350
- auto sketch = kll_float_sketch::deserialize(is, test_allocator<float>(0));
373
+ auto sketch = kll_float_sketch::deserialize(is, serde<float>(), 0);
351
374
  REQUIRE_FALSE(sketch.is_empty());
352
375
  REQUIRE_FALSE(sketch.is_estimation_mode());
353
376
  REQUIRE(sketch.get_n() == 1);
@@ -364,7 +387,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
364
387
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
365
388
  sketch.serialize(s);
366
389
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
367
- auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
390
+ auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
368
391
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
369
392
  REQUIRE(s.tellg() == s.tellp());
370
393
  REQUIRE_FALSE(sketch2.is_empty());
@@ -382,7 +405,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
382
405
  sketch.update(3.0f);
383
406
  auto bytes = sketch.serialize();
384
407
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
385
- auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
408
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
386
409
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
387
410
  REQUIRE_FALSE(sketch2.is_empty());
388
411
  REQUIRE_FALSE(sketch2.is_estimation_mode());
@@ -399,7 +422,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
399
422
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
400
423
  sketch.serialize(s);
401
424
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
402
- auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
425
+ auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
403
426
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
404
427
  REQUIRE(s.tellg() == s.tellp());
405
428
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
@@ -421,7 +444,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
421
444
  for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
422
445
  auto bytes = sketch.serialize();
423
446
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
424
- auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
447
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
425
448
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
426
449
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
427
450
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
@@ -678,7 +701,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
678
701
 
679
702
  auto bytes = sketch1.serialize();
680
703
  REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
681
- auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
704
+ auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
682
705
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
683
706
  REQUIRE(sketch2.is_empty() == sketch1.is_empty());
684
707
  REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
@@ -699,7 +722,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
699
722
  sketch1.update("a");
700
723
  auto bytes = sketch1.serialize();
701
724
  REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
702
- auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
725
+ auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
703
726
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
704
727
  }
705
728
 
@@ -766,6 +789,52 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
766
789
  auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
767
790
  }
768
791
 
792
+ SECTION("sorted view") {
793
+ kll_sketch<int> kll;
794
+ kll.update(2);
795
+ kll.update(3);
796
+ kll.update(1);
797
+
798
+ { // non-cumulative, using operator->
799
+ auto view = kll.get_sorted_view(false);
800
+ REQUIRE(view.size() == 3);
801
+ auto it = view.begin();
802
+ REQUIRE(it->first == 1);
803
+ REQUIRE(it->second == 1);
804
+ ++it;
805
+ REQUIRE(it->first == 2);
806
+ REQUIRE(it->second == 1);
807
+ ++it;
808
+ REQUIRE(it->first == 3);
809
+ REQUIRE(it->second == 1);
810
+ }
811
+ { // cumulative, non-inclusive, using operator->
812
+ auto view = kll.get_sorted_view(true);
813
+ REQUIRE(view.size() == 3);
814
+ auto it = view.begin();
815
+ REQUIRE(it->first == 1);
816
+ REQUIRE(it->second == 0);
817
+ ++it;
818
+ REQUIRE(it->first == 2);
819
+ REQUIRE(it->second == 1);
820
+ ++it;
821
+ REQUIRE(it->first == 3);
822
+ REQUIRE(it->second == 2);
823
+ }
824
+ { // cumulative, inclusive, using operator*
825
+ auto view = kll.get_sorted_view<true>(true);
826
+ REQUIRE(view.size() == 3);
827
+ auto it = view.begin();
828
+ REQUIRE((*it).first == 1);
829
+ REQUIRE((*it).second == 1);
830
+ ++it;
831
+ REQUIRE((*it).first == 2);
832
+ REQUIRE((*it).second == 2);
833
+ ++it;
834
+ REQUIRE((*it).first == 3);
835
+ REQUIRE((*it).second == 3);
836
+ }
837
+ }
769
838
  // cleanup
770
839
  if (test_allocator_total_bytes != 0) {
771
840
  REQUIRE(test_allocator_total_bytes == 0);
@@ -46,7 +46,7 @@ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
46
46
  sketch1.update(x);
47
47
  sketch2.update(x);
48
48
  }
49
- REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.01));
49
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.02));
50
50
  REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
51
51
  }
52
52
 
@@ -1,7 +1,7 @@
1
1
  [build-system]
2
2
  requires = ["wheel",
3
3
  "setuptools >= 30.3.0",
4
- "cmake >= 3.12",
4
+ "cmake >= 3.16",
5
5
  "pip >= 10.0",
6
6
  "pybind11[global] >= 2.6.0"]
7
7
  build-backend = "setuptools.build_meta"
@@ -15,7 +15,11 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- find_package(Python3 COMPONENTS Interpreter Development)
18
+ if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0")
19
+ find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
20
+ else()
21
+ find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
22
+ endif()
19
23
 
20
24
  # only Windows+MSVC seems to have trouble locating pybind11
21
25
  if (MSVC)
@@ -40,6 +44,7 @@ target_link_libraries(python
40
44
  theta
41
45
  sampling
42
46
  req
47
+ quantiles
43
48
  pybind11::module
44
49
  )
45
50
 
@@ -63,5 +68,7 @@ target_sources(python
63
68
  src/theta_wrapper.cpp
64
69
  src/vo_wrapper.cpp
65
70
  src/req_wrapper.cpp
71
+ src/quantiles_wrapper.cpp
72
+ src/ks_wrapper.cpp
66
73
  src/vector_of_kll.cpp
67
74
  )
@@ -27,6 +27,11 @@ Having installed the library, loading the Apache Datasketches Library in Python
27
27
  - KLL (Absolute Error Quantiles)
28
28
  - `kll_ints_sketch`
29
29
  - `kll_floats_sketch`
30
+ - `kll_doubles_sketch`
31
+ - Quantiles (Absolute Error Quantiles, inferior algorithm)
32
+ - `quantiles_ints_sketch`
33
+ - `quantiles_floats_sketch`
34
+ - `quantiles_doubles_sketch`
30
35
  - REQ (Relative Error Quantiles)
31
36
  - `req_ints_sketch`
32
37
  - `req_floats_sketch`
@@ -52,6 +57,8 @@ Having installed the library, loading the Apache Datasketches Library in Python
52
57
  - Vector of KLL
53
58
  - `vector_of_kll_ints_sketches`
54
59
  - `vector_of_kll_floats_sketches`
60
+ - Kolmogorov-Smirnov Test
61
+ - `ks_test` applied to a pair of matched-type Absolute Error quantiles sketches
55
62
 
56
63
  ## Known Differences from C++
57
64
 
@@ -28,6 +28,8 @@ void init_cpc(py::module& m);
28
28
  void init_theta(py::module& m);
29
29
  void init_vo(py::module& m);
30
30
  void init_req(py::module& m);
31
+ void init_quantiles(py::module& m);
32
+ void init_kolmogorov_smirnov(py::module& m);
31
33
  void init_vector_of_kll(py::module& m);
32
34
 
33
35
  PYBIND11_MODULE(datasketches, m) {
@@ -38,5 +40,7 @@ PYBIND11_MODULE(datasketches, m) {
38
40
  init_theta(m);
39
41
  init_vo(m);
40
42
  init_req(m);
43
+ init_quantiles(m);
44
+ init_kolmogorov_smirnov(m);
41
45
  init_vector_of_kll(m);
42
46
  }
@@ -64,6 +64,11 @@ py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
64
64
  return list;
65
65
  }
66
66
 
67
+ template<typename T>
68
+ size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
69
+ return sk.get_serialized_size_bytes();
70
+ }
71
+
67
72
  }
68
73
  }
69
74
 
@@ -104,7 +109,7 @@ void bind_fi_sketch(py::module &m, const char* name) {
104
109
  "Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
105
110
  .def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
106
111
  "Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
107
- .def("get_serialized_size_bytes", &frequent_items_sketch<T>::get_serialized_size_bytes,
112
+ .def("get_serialized_size_bytes", &dspy::fi_sketch_get_serialized_size_bytes<T>,
108
113
  "Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
109
114
  .def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
110
115
  .def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")
@@ -24,6 +24,7 @@
24
24
  #include <pybind11/numpy.h>
25
25
  #include <sstream>
26
26
  #include <vector>
27
+ #include <stdexcept>
27
28
 
28
29
  namespace py = pybind11;
29
30
 
@@ -50,11 +51,32 @@ double kll_sketch_generic_normalized_rank_error(uint16_t k, bool pmf) {
50
51
  return kll_sketch<T>::get_normalized_rank_error(k, pmf);
51
52
  }
52
53
 
54
+ template<typename T>
55
+ double kll_sketch_get_rank(const kll_sketch<T>& sk, const T& item, bool inclusive) {
56
+ if (inclusive)
57
+ return sk.template get_rank<true>(item);
58
+ else
59
+ return sk.template get_rank<false>(item);
60
+ }
61
+
62
+ template<typename T>
63
+ T kll_sketch_get_quantile(const kll_sketch<T>& sk,
64
+ double rank,
65
+ bool inclusive) {
66
+ if (inclusive)
67
+ return T(sk.template get_quantile<true>(rank));
68
+ else
69
+ return T(sk.template get_quantile<false>(rank));
70
+ }
71
+
53
72
  template<typename T>
54
73
  py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
55
- std::vector<double>& fractions) {
74
+ std::vector<double>& fractions,
75
+ bool inclusive) {
56
76
  size_t nQuantiles = fractions.size();
57
- auto result = sk.get_quantiles(&fractions[0], nQuantiles);
77
+ auto result = inclusive ?
78
+ sk.template get_quantiles<true>(fractions.data(), nQuantiles)
79
+ : sk.template get_quantiles<false>(fractions.data(), nQuantiles);
58
80
 
59
81
  // returning as std::vector<> would copy values to a list anyway
60
82
  py::list list(nQuantiles);
@@ -67,9 +89,12 @@ py::list kll_sketch_get_quantiles(const kll_sketch<T>& sk,
67
89
 
68
90
  template<typename T>
69
91
  py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
70
- std::vector<T>& split_points) {
92
+ std::vector<T>& split_points,
93
+ bool inclusive) {
71
94
  size_t nPoints = split_points.size();
72
- auto result = sk.get_PMF(&split_points[0], nPoints);
95
+ auto result = inclusive ?
96
+ sk.template get_PMF<true>(split_points.data(), nPoints)
97
+ : sk.template get_PMF<false>(split_points.data(), nPoints);
73
98
 
74
99
  py::list list(nPoints + 1);
75
100
  for (size_t i = 0; i <= nPoints; ++i) {
@@ -81,9 +106,12 @@ py::list kll_sketch_get_pmf(const kll_sketch<T>& sk,
81
106
 
82
107
  template<typename T>
83
108
  py::list kll_sketch_get_cdf(const kll_sketch<T>& sk,
84
- std::vector<T>& split_points) {
109
+ std::vector<T>& split_points,
110
+ bool inclusive) {
85
111
  size_t nPoints = split_points.size();
86
- auto result = sk.get_CDF(&split_points[0], nPoints);
112
+ auto result = inclusive ?
113
+ sk.template get_CDF<true>(split_points.data(), nPoints)
114
+ : sk.template get_CDF<false>(split_points.data(), nPoints);
87
115
 
88
116
  py::list list(nPoints + 1);
89
117
  for (size_t i = 0; i <= nPoints; ++i) {
@@ -142,7 +170,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
142
170
  "Returns the minimum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
143
171
  .def("get_max_value", &kll_sketch<T>::get_max_value,
144
172
  "Returns the maximum value from the stream. If empty, kll_floats_sketch retursn nan; kll_ints_sketch throws a RuntimeError")
145
- .def("get_quantile", &kll_sketch<T>::get_quantile, py::arg("fraction"),
173
+ .def("get_quantile", &dspy::kll_sketch_get_quantile<T>, py::arg("fraction"), py::arg("inclusive")=false,
146
174
  "Returns an approximation to the value of the data item "
147
175
  "that would be preceded by the given fraction of a hypothetical sorted "
148
176
  "version of the input stream so far.\n"
@@ -151,7 +179,7 @@ void bind_kll_sketch(py::module &m, const char* name) {
151
179
  "sketch. Instead use get_quantiles(), which pays the overhead only once.\n"
152
180
  "For kll_floats_sketch: if the sketch is empty this returns nan. "
153
181
  "For kll_ints_sketch: if the sketch is empty this throws a RuntimeError.")
154
- .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"),
182
+ .def("get_quantiles", &dspy::kll_sketch_get_quantiles<T>, py::arg("fractions"), py::arg("inclusive")=false,
155
183
  "This is a more efficient multiple-query version of get_quantile().\n"
156
184
  "This returns an array that could have been generated by using get_quantile() for each "
157
185
  "fractional rank separately, but would be very inefficient. "
@@ -159,12 +187,14 @@ void bind_kll_sketch(py::module &m, const char* name) {
159
187
  "a single query. It is strongly recommend that this method be used instead of multiple calls "
160
188
  "to get_quantile().\n"
161
189
  "If the sketch is empty this returns an empty vector.")
162
- .def("get_rank", &kll_sketch<T>::get_rank, py::arg("value"),
190
+ .def("get_rank", &dspy::kll_sketch_get_rank<T>, py::arg("value"), py::arg("inclusive")=false,
163
191
  "Returns an approximation to the normalized (fractional) rank of the given value from 0 to 1, inclusive.\n"
164
192
  "The resulting approximation has a probabilistic guarantee that can be obtained from the "
165
193
  "get_normalized_rank_error(False) function.\n"
194
+ "With the parameter inclusive=true the weight of the given value is included into the rank."
195
+ "Otherwise the rank equals the sum of the weights of values less than the given value.\n"
166
196
  "If the sketch is empty this returns nan.")
167
- .def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"),
197
+ .def("get_pmf", &dspy::kll_sketch_get_pmf<T>, py::arg("split_points"), py::arg("inclusive")=false,
168
198
  "Returns an approximation to the Probability Mass Function (PMF) of the input stream "
169
199
  "given a set of split points (values).\n"
170
200
  "The resulting approximations have a probabilistic guarantee that can be obtained from the "
@@ -172,11 +202,13 @@ void bind_kll_sketch(py::module &m, const char* name) {
172
202
  "If the sketch is empty this returns an empty vector.\n"
173
203
  "split_points is an array of m unique, monotonically increasing float values "
174
204
  "that divide the real number line into m+1 consecutive disjoint intervals.\n"
175
- "The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
205
+ "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
176
206
  "exclusive of the right split point, with the exception that the last interval will include "
177
207
  "the maximum value.\n"
208
+ "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
209
+ "inclusive of the right split point.\n"
178
210
  "It is not necessary to include either the min or max values in these split points.")
179
- .def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"),
211
+ .def("get_cdf", &dspy::kll_sketch_get_cdf<T>, py::arg("split_points"), py::arg("inclusive")=false,
180
212
  "Returns an approximation to the Cumulative Distribution Function (CDF), which is the "
181
213
  "cumulative analog of the PMF, of the input stream given a set of split points (values).\n"
182
214
  "The resulting approximations have a probabilistic guarantee that can be obtained from the "
@@ -184,9 +216,11 @@ void bind_kll_sketch(py::module &m, const char* name) {
184
216
  "If the sketch is empty this returns an empty vector.\n"
185
217
  "split_points is an array of m unique, monotonically increasing float values "
186
218
  "that divide the real number line into m+1 consecutive disjoint intervals.\n"
187
- "The definition of an 'interval' is inclusive of the left split point (or minimum value) and "
219
+ "If the parameter inclusive=false, the definition of an 'interval' is inclusive of the left split point (or minimum value) and "
188
220
  "exclusive of the right split point, with the exception that the last interval will include "
189
221
  "the maximum value.\n"
222
+ "If the parameter inclusive=true, the definition of an 'interval' is exclusive of the left split point (or minimum value) and "
223
+ "inclusive of the right split point.\n"
190
224
  "It is not necessary to include either the min or max values in these split points.")
191
225
  .def("normalized_rank_error", (double (kll_sketch<T>::*)(bool) const) &kll_sketch<T>::get_normalized_rank_error,
192
226
  py::arg("as_pmf"),
@@ -208,4 +242,5 @@ void bind_kll_sketch(py::module &m, const char* name) {
208
242
  void init_kll(py::module &m) {
209
243
  bind_kll_sketch<int>(m, "kll_ints_sketch");
210
244
  bind_kll_sketch<float>(m, "kll_floats_sketch");
245
+ bind_kll_sketch<double>(m, "kll_doubles_sketch");
211
246
  }
@@ -0,0 +1,68 @@
1
+ /*
2
+ * Licensed to the Apache Software Foundation (ASF) under one
3
+ * or more contributor license agreements. See the NOTICE file
4
+ * distributed with this work for additional information
5
+ * regarding copyright ownership. The ASF licenses this file
6
+ * to you under the Apache License, Version 2.0 (the
7
+ * "License"); you may not use this file except in compliance
8
+ * with the License. You may obtain a copy of the License at
9
+ *
10
+ * http://www.apache.org/licenses/LICENSE-2.0
11
+ *
12
+ * Unless required by applicable law or agreed to in writing,
13
+ * software distributed under the License is distributed on an
14
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15
+ * KIND, either express or implied. See the License for the
16
+ * specific language governing permissions and limitations
17
+ * under the License.
18
+ */
19
+
20
+ #include "kolmogorov_smirnov.hpp"
21
+ #include "kll_sketch.hpp"
22
+ #include "quantiles_sketch.hpp"
23
+
24
+ #include <pybind11/pybind11.h>
25
+
26
+ namespace py = pybind11;
27
+
28
+ void init_kolmogorov_smirnov(py::module &m) {
29
+ using namespace datasketches;
30
+
31
+ m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
32
+ "Performs the Kolmogorov-Smirnov Test between kll_ints_sketches.\n"
33
+ "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
34
+ "this will return false.\n"
35
+ "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
36
+ "distribution) using the provided p-value, otherwise False.");
37
+ m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
38
+ "Performs the Kolmogorov-Smirnov Test between kll_floats_sketches.\n"
39
+ "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
40
+ "this will return false.\n"
41
+ "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
42
+ "distribution) using the provided p-value, otherwise False.");
43
+ m.def("ks_test", &kolmogorov_smirnov::test<kll_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
44
+ "Performs the Kolmogorov-Smirnov Test between kll_doubles_sketches.\n"
45
+ "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
46
+ "this will return false.\n"
47
+ "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
48
+ "distribution) using the provided p-value, otherwise False.");
49
+
50
+ m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<int>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
51
+ "Performs the Kolmogorov-Smirnov Test between quantiles_ints_sketches.\n"
52
+ "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
53
+ "this will return false.\n"
54
+ "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
55
+ "distribution) using the provided p-value, otherwise False.");
56
+ m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<float>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
57
+ "Performs the Kolmogorov-Smirnov Test between quantiles_floats_sketches.\n"
58
+ "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
59
+ "this will return false.\n"
60
+ "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
61
+ "distribution) using the provided p-value, otherwise False.");
62
+ m.def("ks_test", &kolmogorov_smirnov::test<quantiles_sketch<double>>, py::arg("sk_1"), py::arg("sk_2"), py::arg("p"),
63
+ "Performs the Kolmogorov-Smirnov Test between quantiles_doubles_sketches.\n"
64
+ "Note: if the given sketches have insufficient data or if the sketch sizes are too small, "
65
+ "this will return false.\n"
66
+ "Returns True if we can reject the null hypothesis (that the sketches reflect the same underlying "
67
+ "distribution) using the provided p-value, otherwise False.");
68
+ }