datasketches 0.2.2 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (154) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +8 -8
  6. data/ext/datasketches/kll_wrapper.cpp +5 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +27 -5
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  16. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +16 -0
  18. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  19. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  20. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  21. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  22. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  26. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +1 -1
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  31. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  34. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  35. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  36. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +30 -12
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  38. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  42. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  44. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  45. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  49. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  50. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  51. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  52. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  53. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  54. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  55. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  56. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  57. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +103 -44
  58. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +110 -130
  59. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +156 -23
  60. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  61. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  62. data/vendor/datasketches-cpp/python/CMakeLists.txt +17 -6
  63. data/vendor/datasketches-cpp/python/README.md +57 -50
  64. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  65. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  66. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  67. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  68. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +49 -14
  69. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  70. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  71. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  72. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +2 -2
  73. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +12 -5
  74. data/vendor/datasketches-cpp/python/tests/kll_test.py +12 -6
  75. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  76. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  77. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  78. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  79. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  80. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  81. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  82. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  83. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  84. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  85. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  86. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  87. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  88. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  89. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  90. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  91. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  92. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  93. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  94. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  95. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  96. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  97. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +66 -61
  98. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  99. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  100. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +54 -12
  101. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +45 -34
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  107. data/vendor/datasketches-cpp/setup.py +10 -7
  108. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  110. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  114. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  115. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  116. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  117. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  118. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  120. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  121. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +9 -5
  122. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +39 -10
  123. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  124. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  125. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  126. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  127. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  130. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  131. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  132. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  133. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  134. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  135. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  136. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  137. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  138. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  141. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  142. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  143. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  144. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  145. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  146. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  147. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  148. metadata +34 -12
  149. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  150. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  151. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  152. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  153. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  154. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -22,6 +22,7 @@
22
22
  #include <cstring>
23
23
  #include <sstream>
24
24
  #include <fstream>
25
+ #include <stdexcept>
25
26
 
26
27
  #include <kll_sketch.hpp>
27
28
  #include <test_allocator.hpp>
@@ -90,7 +91,9 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
90
91
  REQUIRE(sketch.get_n() == 1);
91
92
  REQUIRE(sketch.get_num_retained() == 1);
92
93
  REQUIRE(sketch.get_rank(1.0f) == 0.0);
94
+ REQUIRE(sketch.get_rank<true>(1.0f) == 1.0);
93
95
  REQUIRE(sketch.get_rank(2.0f) == 1.0);
96
+ REQUIRE(sketch.get_rank(std::numeric_limits<float>::infinity()) == 1.0);
94
97
  REQUIRE(sketch.get_min_value() == 1.0);
95
98
  REQUIRE(sketch.get_max_value() == 1.0);
96
99
  REQUIRE(sketch.get_quantile(0.5) == 1.0);
@@ -142,8 +145,10 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
142
145
  REQUIRE(quantiles[2] == n - 1 );
143
146
 
144
147
  for (uint32_t i = 0; i < n; i++) {
145
- const double trueRank = (double) i / n;
146
- REQUIRE(sketch.get_rank(static_cast<float>(i)) == trueRank);
148
+ const double true_rank = (double) i / n;
149
+ REQUIRE(sketch.get_rank(static_cast<float>(i)) == true_rank);
150
+ const double true_rank_inclusive = (double) (i + 1) / n;
151
+ REQUIRE(sketch.get_rank<true>(static_cast<float>(i)) == true_rank_inclusive);
147
152
  }
148
153
 
149
154
  // the alternative method must produce the same result
@@ -241,20 +246,38 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
241
246
  sketch.update(static_cast<float>(i));
242
247
  values[i] = static_cast<float>(i);
243
248
  }
244
-
245
- const auto ranks(sketch.get_CDF(values, n));
246
- const auto pmf(sketch.get_PMF(values, n));
247
-
248
- double subtotal_pmf(0);
249
- for (int i = 0; i < n; i++) {
250
- if (sketch.get_rank(values[i]) != ranks[i]) {
251
- std::cerr << "checking rank vs CDF for value " << i << std::endl;
252
- REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
249
+ { // inclusive=false (default)
250
+ const auto ranks(sketch.get_CDF(values, n));
251
+ const auto pmf(sketch.get_PMF(values, n));
252
+
253
+ double subtotal_pmf = 0;
254
+ for (int i = 0; i < n; i++) {
255
+ if (sketch.get_rank(values[i]) != ranks[i]) {
256
+ std::cerr << "checking rank vs CDF for value " << i << std::endl;
257
+ REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
258
+ }
259
+ subtotal_pmf += pmf[i];
260
+ if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
261
+ std::cerr << "CDF vs PMF for value " << i << std::endl;
262
+ REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
263
+ }
253
264
  }
254
- subtotal_pmf += pmf[i];
255
- if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
256
- std::cerr << "CDF vs PMF for value " << i << std::endl;
257
- REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
265
+ }
266
+ { // inclusive=true
267
+ const auto ranks(sketch.get_CDF<true>(values, n));
268
+ const auto pmf(sketch.get_PMF<true>(values, n));
269
+
270
+ double subtotal_pmf = 0;
271
+ for (int i = 0; i < n; i++) {
272
+ if (sketch.get_rank<true>(values[i]) != ranks[i]) {
273
+ std::cerr << "checking rank vs CDF for value " << i << std::endl;
274
+ REQUIRE(sketch.get_rank(values[i]) == ranks[i]);
275
+ }
276
+ subtotal_pmf += pmf[i];
277
+ if (abs(ranks[i] - subtotal_pmf) > NUMERIC_NOISE_TOLERANCE) {
278
+ std::cerr << "CDF vs PMF for value " << i << std::endl;
279
+ REQUIRE(ranks[i] == Approx(subtotal_pmf).margin(NUMERIC_NOISE_TOLERANCE));
280
+ }
258
281
  }
259
282
  }
260
283
  }
@@ -279,6 +302,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
279
302
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
280
303
  auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
281
304
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
305
+ REQUIRE(s.tellg() == s.tellp());
282
306
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
283
307
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
284
308
  REQUIRE(sketch2.get_n() == sketch.get_n());
@@ -292,7 +316,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
292
316
  SECTION("bytes serialize deserialize empty") {
293
317
  kll_float_sketch sketch(200, 0);
294
318
  auto bytes = sketch.serialize();
295
- auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
319
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
296
320
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
297
321
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
298
322
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
@@ -304,13 +328,13 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
304
328
  REQUIRE(sketch2.get_normalized_rank_error(true) == sketch.get_normalized_rank_error(true));
305
329
  }
306
330
 
307
- SECTION("serialize deserialize one item") {
331
+ SECTION("stream serialize deserialize one item") {
308
332
  kll_float_sketch sketch(200, 0);
309
333
  sketch.update(1.0f);
310
334
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
311
335
  sketch.serialize(s);
312
336
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
313
- auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
337
+ auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
314
338
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
315
339
  REQUIRE(s.tellg() == s.tellp());
316
340
  REQUIRE_FALSE(sketch2.is_empty());
@@ -324,11 +348,29 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
324
348
  REQUIRE(sketch2.get_rank(2) == 1.0);
325
349
  }
326
350
 
351
+ SECTION("bytes serialize deserialize one item") {
352
+ kll_float_sketch sketch(200, 0);
353
+ sketch.update(1.0f);
354
+ auto bytes = sketch.serialize();
355
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
356
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
357
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
358
+ REQUIRE_FALSE(sketch2.is_empty());
359
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
360
+ REQUIRE(sketch2.get_n() == 1);
361
+ REQUIRE(sketch2.get_num_retained() == 1);
362
+ REQUIRE(sketch2.get_min_value() == 1.0);
363
+ REQUIRE(sketch2.get_max_value() == 1.0);
364
+ REQUIRE(sketch2.get_quantile(0.5) == 1.0);
365
+ REQUIRE(sketch2.get_rank(1) == 0.0);
366
+ REQUIRE(sketch2.get_rank(2) == 1.0);
367
+ }
368
+
327
369
  SECTION("deserialize one item v1") {
328
370
  std::ifstream is;
329
371
  is.exceptions(std::ios::failbit | std::ios::badbit);
330
372
  is.open(testBinaryInputPath + "kll_sketch_float_one_item_v1.sk", std::ios::binary);
331
- auto sketch = kll_float_sketch::deserialize(is, test_allocator<float>(0));
373
+ auto sketch = kll_float_sketch::deserialize(is, serde<float>(), 0);
332
374
  REQUIRE_FALSE(sketch.is_empty());
333
375
  REQUIRE_FALSE(sketch.is_estimation_mode());
334
376
  REQUIRE(sketch.get_n() == 1);
@@ -337,6 +379,42 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
337
379
  REQUIRE(sketch.get_max_value() == 1.0);
338
380
  }
339
381
 
382
+ SECTION("stream serialize deserialize three items") {
383
+ kll_float_sketch sketch(200, 0);
384
+ sketch.update(1.0f);
385
+ sketch.update(2.0f);
386
+ sketch.update(3.0f);
387
+ std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
388
+ sketch.serialize(s);
389
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
390
+ auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
391
+ REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
392
+ REQUIRE(s.tellg() == s.tellp());
393
+ REQUIRE_FALSE(sketch2.is_empty());
394
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
395
+ REQUIRE(sketch2.get_n() == 3);
396
+ REQUIRE(sketch2.get_num_retained() == 3);
397
+ REQUIRE(sketch2.get_min_value() == 1.0);
398
+ REQUIRE(sketch2.get_max_value() == 3.0);
399
+ }
400
+
401
+ SECTION("bytes serialize deserialize three items") {
402
+ kll_float_sketch sketch(200, 0);
403
+ sketch.update(1.0f);
404
+ sketch.update(2.0f);
405
+ sketch.update(3.0f);
406
+ auto bytes = sketch.serialize();
407
+ REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
408
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
409
+ REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
410
+ REQUIRE_FALSE(sketch2.is_empty());
411
+ REQUIRE_FALSE(sketch2.is_estimation_mode());
412
+ REQUIRE(sketch2.get_n() == 3);
413
+ REQUIRE(sketch2.get_num_retained() == 3);
414
+ REQUIRE(sketch2.get_min_value() == 1.0);
415
+ REQUIRE(sketch2.get_max_value() == 3.0);
416
+ }
417
+
340
418
  SECTION("stream serialize deserialize many floats") {
341
419
  kll_float_sketch sketch(200, 0);
342
420
  const int n = 1000;
@@ -344,7 +422,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
344
422
  std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
345
423
  sketch.serialize(s);
346
424
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch.get_serialized_size_bytes());
347
- auto sketch2 = kll_float_sketch::deserialize(s, test_allocator<float>(0));
425
+ auto sketch2 = kll_float_sketch::deserialize(s, serde<float>(), 0);
348
426
  REQUIRE(static_cast<size_t>(s.tellp()) == sketch2.get_serialized_size_bytes());
349
427
  REQUIRE(s.tellg() == s.tellp());
350
428
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
@@ -366,7 +444,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
366
444
  for (int i = 0; i < n; i++) sketch.update(static_cast<float>(i));
367
445
  auto bytes = sketch.serialize();
368
446
  REQUIRE(bytes.size() == sketch.get_serialized_size_bytes());
369
- auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), 0);
447
+ auto sketch2 = kll_float_sketch::deserialize(bytes.data(), bytes.size(), serde<float>(), 0);
370
448
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
371
449
  REQUIRE(sketch2.is_empty() == sketch.is_empty());
372
450
  REQUIRE(sketch2.is_estimation_mode() == sketch.is_estimation_mode());
@@ -623,7 +701,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
623
701
 
624
702
  auto bytes = sketch1.serialize();
625
703
  REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
626
- auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
704
+ auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
627
705
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
628
706
  REQUIRE(sketch2.is_empty() == sketch1.is_empty());
629
707
  REQUIRE(sketch2.is_estimation_mode() == sketch1.is_estimation_mode());
@@ -644,7 +722,7 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
644
722
  sketch1.update("a");
645
723
  auto bytes = sketch1.serialize();
646
724
  REQUIRE(bytes.size() == sketch1.get_serialized_size_bytes());
647
- auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), 0);
725
+ auto sketch2 = kll_string_sketch::deserialize(bytes.data(), bytes.size(), serde<std::string>(), 0);
648
726
  REQUIRE(bytes.size() == sketch2.get_serialized_size_bytes());
649
727
  }
650
728
 
@@ -702,6 +780,61 @@ TEST_CASE("kll sketch", "[kll_sketch]") {
702
780
  REQUIRE(kll_sketch<std::string>::get_max_serialized_size_bytes(200, 1000000000, 4) == 3160);
703
781
  }
704
782
 
783
+ SECTION("issue #236") {
784
+ kll_sketch<int8_t> kll;
785
+ kll.update(1);
786
+ kll.update(2);
787
+ kll.update(3);
788
+ auto blob = kll.serialize();
789
+ auto kll2 = kll_sketch<int8_t>::deserialize(blob.data(), blob.size());
790
+ }
791
+
792
+ SECTION("sorted view") {
793
+ kll_sketch<int> kll;
794
+ kll.update(2);
795
+ kll.update(3);
796
+ kll.update(1);
797
+
798
+ { // non-cumulative, using operator->
799
+ auto view = kll.get_sorted_view(false);
800
+ REQUIRE(view.size() == 3);
801
+ auto it = view.begin();
802
+ REQUIRE(it->first == 1);
803
+ REQUIRE(it->second == 1);
804
+ ++it;
805
+ REQUIRE(it->first == 2);
806
+ REQUIRE(it->second == 1);
807
+ ++it;
808
+ REQUIRE(it->first == 3);
809
+ REQUIRE(it->second == 1);
810
+ }
811
+ { // cumulative, non-inclusive, using operator->
812
+ auto view = kll.get_sorted_view(true);
813
+ REQUIRE(view.size() == 3);
814
+ auto it = view.begin();
815
+ REQUIRE(it->first == 1);
816
+ REQUIRE(it->second == 0);
817
+ ++it;
818
+ REQUIRE(it->first == 2);
819
+ REQUIRE(it->second == 1);
820
+ ++it;
821
+ REQUIRE(it->first == 3);
822
+ REQUIRE(it->second == 2);
823
+ }
824
+ { // cumulative, inclusive, using operator*
825
+ auto view = kll.get_sorted_view<true>(true);
826
+ REQUIRE(view.size() == 3);
827
+ auto it = view.begin();
828
+ REQUIRE((*it).first == 1);
829
+ REQUIRE((*it).second == 1);
830
+ ++it;
831
+ REQUIRE((*it).first == 2);
832
+ REQUIRE((*it).second == 2);
833
+ ++it;
834
+ REQUIRE((*it).first == 3);
835
+ REQUIRE((*it).second == 3);
836
+ }
837
+ }
705
838
  // cleanup
706
839
  if (test_allocator_total_bytes != 0) {
707
840
  REQUIRE(test_allocator_total_bytes == 0);
@@ -46,7 +46,7 @@ TEST_CASE("kolmogorov-smirnov same distribution", "[kll_sketch]") {
46
46
  sketch1.update(x);
47
47
  sketch2.update(x);
48
48
  }
49
- REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.01));
49
+ REQUIRE(kolmogorov_smirnov::delta(sketch1, sketch2) == Approx(0).margin(0.02));
50
50
  REQUIRE_FALSE(kolmogorov_smirnov::test(sketch1, sketch2, 0.01));
51
51
  }
52
52
 
@@ -1,8 +1,10 @@
1
1
  [build-system]
2
2
  requires = ["wheel",
3
3
  "setuptools >= 30.3.0",
4
- "setuptools_scm",
5
- "cmake >= 3.12"]
4
+ "cmake >= 3.16",
5
+ "pip >= 10.0",
6
+ "pybind11[global] >= 2.6.0"]
7
+ build-backend = "setuptools.build_meta"
6
8
 
7
9
  [tool.tox]
8
10
  legacy_tox_ini = """
@@ -15,16 +15,24 @@
15
15
  # specific language governing permissions and limitations
16
16
  # under the License.
17
17
 
18
- # TODO: Can we force python version >= 3.0?
19
- if (MSVC)
20
- set(PYBIND11_CPP_STANDARD /std:c++11)
18
+ if(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18.0")
19
+ find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED)
21
20
  else()
22
- set(PYBIND11_CPP_STANDARD -std=c++11)
21
+ find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
22
+ endif()
23
+
24
+ # only Windows+MSVC seems to have trouble locating pybind11
25
+ if (MSVC)
26
+ execute_process(COMMAND cmd.exe /c ${CMAKE_CURRENT_SOURCE_DIR}/pybind11Path.cmd "${Python3_EXECUTABLE}"
27
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
28
+ OUTPUT_STRIP_TRAILING_WHITESPACE
29
+ OUTPUT_VARIABLE EXTRA_PACKAGE_PATH)
30
+ set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${EXTRA_PACKAGE_PATH})
23
31
  endif()
24
32
 
25
- add_subdirectory(pybind11)
33
+ find_package(pybind11 CONFIG REQUIRED)
26
34
 
27
- pybind11_add_module(python MODULE EXCLUDE_FROM_ALL SYSTEM THIN_LTO)
35
+ pybind11_add_module(python MODULE EXCLUDE_FROM_ALL THIN_LTO)
28
36
 
29
37
  target_link_libraries(python
30
38
  PRIVATE
@@ -36,6 +44,7 @@ target_link_libraries(python
36
44
  theta
37
45
  sampling
38
46
  req
47
+ quantiles
39
48
  pybind11::module
40
49
  )
41
50
 
@@ -59,5 +68,7 @@ target_sources(python
59
68
  src/theta_wrapper.cpp
60
69
  src/vo_wrapper.cpp
61
70
  src/req_wrapper.cpp
71
+ src/quantiles_wrapper.cpp
72
+ src/ks_wrapper.cpp
62
73
  src/vector_of_kll.cpp
63
74
  )
@@ -1,76 +1,64 @@
1
- # Python Wrapper for Apache DataSketches
1
+ <img src="https://raw.githubusercontent.com/apache/datasketches-website/master/logos/svg/datasketches-HorizontalColor-TM.svg" width="75%" alt="Apache DataSketches Logo">
2
2
 
3
- ## Installation
3
+ # The Apache DataSketches Library for Python
4
4
 
5
- The release files do not include the needed python binding library ([pybind11](https://github.com/pybind/pybind11)). If building
6
- from a relase package, you must ensure that the pybind11 directory points to a local copy of pybind11.
5
+ This is the official version of the [Apache DataSketches](https://datasketches.apache.org) Python library.
7
6
 
8
- An official pypi build is eventually planned but not yet available.
7
+ In the analysis of big data there are often problem queries that don’t scale because they require huge compute resources and time to generate exact results. Examples include count distinct, quantiles, most-frequent items, joins, matrix computations, and graph analysis.
9
8
 
10
- If you instead want to take a (possibly ill-advised) gamble on the current state of the master branch being useable, you can run:
11
- ```pip install git+https://github.com/apache/datasketches-cpp.git```
9
+ If approximate results are acceptable, there is a class of specialized algorithms, called streaming algorithms, or sketches, that can produce results orders of magnitude faster and with mathematically proven error bounds. For interactive queries there may not be other viable alternatives, and in the case of real-time analysis, sketches are the only known solution.
12
10
 
13
- ## Developer Instructions
14
-
15
- ### Building
16
-
17
- When cloning the source repository, you should include the pybind11 submodule with the `--recursive` option to the clone command:
18
- ```
19
- git clone --recursive https://github.com/apache/datasketches-cpp.git
20
- cd datasketches-cpp
21
- python -m pip install --upgrade pip setuptools wheel numpy
22
- python setup.py build
23
- ```
11
+ This package provides a variety of sketches as described below. Wherever a specific type of sketch exists in Apache DataSketches packages for other languages, the sketches will be portable between languages (for platforms with the same endianness).
24
12
 
25
- If you cloned without `--recursive`, you can add the submodule post-checkout using `git submodule update --init --recursive`.
13
+ ## Building and Installation
26
14
 
27
- ### Installing
15
+ Once cloned, the library can be installed by running `python -m pip install .` in the project root directory, which will also install the necessary dependencies, namely numpy and [pybind11[global]](https://github.com/pybind/pybind11).
28
16
 
29
- Assuming you have already checked out the library and any dependent submodules, install by simply replacing the lsat
30
- line of the build command with `python setup.py install`.
17
+ If you prefer to call the `setup.py` build script directly, you must first install `pybind11[global]`, as well as any other dependencies listed under the build-system section in `pyproject.toml`.
31
18
 
32
- ### Unit tests
33
-
34
- The python tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
35
- ```
36
- python -m pip install --upgrade pip setuptools wheel numpy tox
37
- tox
38
- ```
19
+ The library is also available from PyPI via `python -m pip install datasketches`.
39
20
 
40
21
  ## Usage
41
22
 
42
- Having installed the library, loading the Apache Datasketches library in Python is simple: `import datasketches`.
23
+ Having installed the library, loading the Apache DataSketches Library in Python is simple: `import datasketches`.
43
24
 
44
25
  ## Available Sketch Classes
45
26
 
46
27
  - KLL (Absolute Error Quantiles)
47
- - `kll_ints_sketch`
48
- - `kll_floats_sketch`
28
+ - `kll_ints_sketch`
29
+ - `kll_floats_sketch`
30
+ - `kll_doubles_sketch`
31
+ - Quantiles (Absolute Error Quantiles, inferior algorithm)
32
+ - `quantiles_ints_sketch`
33
+ - `quantiles_floats_sketch`
34
+ - `quantiles_doubles_sketch`
49
35
  - REQ (Relative Error Quantiles)
50
- - `req_ints_sketch`
51
- - `req_floats_sketch`
36
+ - `req_ints_sketch`
37
+ - `req_floats_sketch`
52
38
  - Frequent Items
53
- - `frequent_strings_sketch`
54
- - Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
39
+ - `frequent_strings_sketch`
40
+ - Error types are `frequent_items_error_type.{NO_FALSE_NEGATIVES | NO_FALSE_POSITIVES}`
55
41
  - Theta
56
- - `update_theta_sketch`
57
- - `compact_theta_sketch` (cannot be instantiated directly)
58
- - `theta_union`
59
- - `theta_intersection`
60
- - `theta_a_not_b`
42
+ - `update_theta_sketch`
43
+ - `compact_theta_sketch` (cannot be instantiated directly)
44
+ - `theta_union`
45
+ - `theta_intersection`
46
+ - `theta_a_not_b`
61
47
  - HLL
62
- - `hll_sketch`
63
- - `hll_union`
64
- - Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
48
+ - `hll_sketch`
49
+ - `hll_union`
50
+ - Target HLL types are `tgt_hll_type.{HLL_4 | HLL_6 | HLL_8}`
65
51
  - CPC
66
- - `cpc_sketch`
67
- - `cpc_union`
52
+ - `cpc_sketch`
53
+ - `cpc_union`
68
54
  - VarOpt Sampling
69
- - `var_opt_sketch`
70
- - `var_opt_union`
55
+ - `var_opt_sketch`
56
+ - `var_opt_union`
71
57
  - Vector of KLL
72
- - `vector_of_kll_ints_sketches`
73
- - `vector_of_kll_floats_sketches`
58
+ - `vector_of_kll_ints_sketches`
59
+ - `vector_of_kll_floats_sketches`
60
+ - Kolmogorov-Smirnov Test
61
+ - `ks_test` applied to a pair of matched-type Absolute Error quantiles sketches
74
62
 
75
63
  ## Known Differences from C++
76
64
 
@@ -79,3 +67,22 @@ The Python API largely mirrors the C++ API, with a few minor exceptions: The pri
79
67
  The Vector of KLL object is currently exclusive to python, and holds an array of independent KLL sketches. This is useful for creating a set of KLL sketches over a vector and has been designed to allow input as either a vector or a matrix of multiple vectors.
80
68
 
81
69
  We have also removed reliance on a builder class for theta sketches as Python allows named arguments to the constructor, not strictly positional arguments.
70
+
71
+ ## Developer Instructions
72
+
73
+ The only developer-specific instructions relate to running unit tests.
74
+
75
+ ### Unit tests
76
+
77
+ The Python unit tests are run with `tox`. To ensure you have all the needed packages, from the package base directory run:
78
+
79
+ ```bash
80
+ python -m pip install --upgrade tox
81
+ tox
82
+ ```
83
+
84
+ ## License
85
+
86
+ The Apache DataSketches Library is distributed under an Apache 2.0 License.
87
+
88
+ Precompiled binaries may be provided as a convenience and distributed through PyPI via [https://pypi.org/project/datasketches/](https://pypi.org/project/datasketches/). These binaries contain compiled code from [pybind11](https://github.com/pybind/pybind11), which is distributed under a BSD license.
@@ -0,0 +1,3 @@
1
+ @echo off
2
+ :: Takes path to the Python interpreter and returns the path to pybind11
3
+ %1 -m pip show pybind11 | %1 -c "import sys,re;[sys.stdout.write(re.sub('^Location:\\s+','',line)) for line in sys.stdin if re.search('^Location:\\s+',line)]"
@@ -53,7 +53,7 @@ void init_cpc(py::module &m) {
53
53
  using namespace datasketches;
54
54
 
55
55
  py::class_<cpc_sketch>(m, "cpc_sketch")
56
- .def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=CPC_DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
56
+ .def(py::init<uint8_t, uint64_t>(), py::arg("lg_k")=cpc_constants::DEFAULT_LG_K, py::arg("seed")=DEFAULT_SEED)
57
57
  .def(py::init<const cpc_sketch&>())
58
58
  .def("__str__", &cpc_sketch::to_string,
59
59
  "Produces a string summary of the sketch")
@@ -28,6 +28,8 @@ void init_cpc(py::module& m);
28
28
  void init_theta(py::module& m);
29
29
  void init_vo(py::module& m);
30
30
  void init_req(py::module& m);
31
+ void init_quantiles(py::module& m);
32
+ void init_kolmogorov_smirnov(py::module& m);
31
33
  void init_vector_of_kll(py::module& m);
32
34
 
33
35
  PYBIND11_MODULE(datasketches, m) {
@@ -38,5 +40,7 @@ PYBIND11_MODULE(datasketches, m) {
38
40
  init_theta(m);
39
41
  init_vo(m);
40
42
  init_req(m);
43
+ init_quantiles(m);
44
+ init_kolmogorov_smirnov(m);
41
45
  init_vector_of_kll(m);
42
46
  }
@@ -64,6 +64,11 @@ py::list fi_sketch_get_frequent_items(const frequent_items_sketch<T>& sk,
64
64
  return list;
65
65
  }
66
66
 
67
+ template<typename T>
68
+ size_t fi_sketch_get_serialized_size_bytes(const frequent_items_sketch<T>& sk) {
69
+ return sk.get_serialized_size_bytes();
70
+ }
71
+
67
72
  }
68
73
  }
69
74
 
@@ -104,7 +109,7 @@ void bind_fi_sketch(py::module &m, const char* name) {
104
109
  "Returns the epsilon value used to compute a priori error for a given log2(max_map_size)")
105
110
  .def_static("get_apriori_error", &frequent_items_sketch<T>::get_apriori_error, py::arg("lg_max_map_size"), py::arg("estimated_total_weight"),
106
111
  "Returns the estimated a priori error given the max_map_size for the sketch and the estimated_total_stream_weight.")
107
- .def("get_serialized_size_bytes", &frequent_items_sketch<T>::get_serialized_size_bytes,
112
+ .def("get_serialized_size_bytes", &dspy::fi_sketch_get_serialized_size_bytes<T>,
108
113
  "Computes the size needed to serialize the current state of the sketch. This can be expensive since every item needs to be looked at.")
109
114
  .def("serialize", &dspy::fi_sketch_serialize<T>, "Serializes the sketch into a bytes object")
110
115
  .def_static("deserialize", &dspy::fi_sketch_deserialize<T>, "Reads a bytes object and returns the corresponding frequent_strings_sketch")