datasketches 0.2.3 → 0.2.6

Sign up to get free protection for your applications and to get access to all the features.
Files changed (143) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/README.md +8 -8
  4. data/ext/datasketches/kll_wrapper.cpp +7 -3
  5. data/ext/datasketches/theta_wrapper.cpp +20 -4
  6. data/lib/datasketches/version.rb +1 -1
  7. data/vendor/datasketches-cpp/CMakeLists.txt +25 -5
  8. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  9. data/vendor/datasketches-cpp/NOTICE +6 -5
  10. data/vendor/datasketches-cpp/README.md +76 -9
  11. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  12. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  13. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  14. data/vendor/datasketches-cpp/common/include/common_defs.hpp +14 -0
  15. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  16. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  17. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  18. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  19. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  20. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  21. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +3 -1
  22. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  23. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  24. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  25. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  26. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  28. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  29. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  30. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +29 -11
  31. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  32. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  34. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  35. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  36. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  37. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  38. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  39. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  40. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  42. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  43. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  44. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  45. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  46. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  49. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  50. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +5 -2
  51. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +108 -41
  52. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +150 -132
  53. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +165 -31
  54. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  55. data/vendor/datasketches-cpp/pyproject.toml +1 -1
  56. data/vendor/datasketches-cpp/python/CMakeLists.txt +8 -1
  57. data/vendor/datasketches-cpp/python/README.md +13 -9
  58. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  59. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  60. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +48 -13
  61. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  62. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  63. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  64. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +1 -0
  65. data/vendor/datasketches-cpp/python/tests/kll_test.py +10 -4
  66. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  67. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  68. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +656 -0
  69. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1373 -0
  70. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  71. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  72. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  73. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  74. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  75. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  76. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  77. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  78. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  79. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  80. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  81. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +975 -0
  82. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  83. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  84. data/vendor/datasketches-cpp/req/include/req_compactor.hpp +6 -0
  85. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +30 -2
  86. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +73 -23
  87. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +95 -63
  88. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +74 -3
  89. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  90. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +44 -7
  91. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +44 -33
  92. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  93. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  94. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  95. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  96. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  97. data/vendor/datasketches-cpp/setup.py +1 -1
  98. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  99. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  100. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  101. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  102. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  103. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  104. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  105. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  106. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  107. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  108. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  109. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  110. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  111. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +34 -9
  112. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  113. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  114. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  115. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  116. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  117. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  118. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  119. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  120. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  121. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  122. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  123. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  124. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  125. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  126. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  127. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  128. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  129. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  130. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  131. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  132. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  133. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  134. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  135. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  136. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  137. metadata +33 -12
  138. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  139. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  140. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  141. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  142. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  143. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -25,6 +25,7 @@
25
25
  #include <cmath>
26
26
  #include <random>
27
27
  #include <algorithm>
28
+ #include <stdexcept>
28
29
 
29
30
  #include "var_opt_sketch.hpp"
30
31
  #include "serde.hpp"
@@ -311,8 +312,8 @@ var_opt_sketch<T,S,A>& var_opt_sketch<T,S,A>::operator=(var_opt_sketch&& other)
311
312
 
312
313
  // implementation for fixed-size arithmetic types (integral and floating point)
313
314
  template<typename T, typename S, typename A>
314
- template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
315
- size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
315
+ template<typename TT, typename SerDe, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
316
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes(const SerDe&) const {
316
317
  if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
317
318
  size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
318
319
  num_bytes += h_ * sizeof(double); // weights
@@ -325,8 +326,8 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
325
326
 
326
327
  // implementation for all other types
327
328
  template<typename T, typename S, typename A>
328
- template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
329
- size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
329
+ template<typename TT, typename SerDe, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
330
+ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes(const SerDe& sd) const {
330
331
  if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; }
331
332
  size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3;
332
333
  num_bytes += h_ * sizeof(double); // weights
@@ -335,13 +336,14 @@ size_t var_opt_sketch<T,S,A>::get_serialized_size_bytes() const {
335
336
  }
336
337
  // must iterate over the items
337
338
  for (auto it: *this)
338
- num_bytes += S().size_of_item(it.first);
339
+ num_bytes += sd.size_of_item(it.first);
339
340
  return num_bytes;
340
341
  }
341
342
 
342
343
  template<typename T, typename S, typename A>
343
- std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes) const {
344
- const size_t size = header_size_bytes + get_serialized_size_bytes();
344
+ template<typename SerDe>
345
+ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
346
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
345
347
  std::vector<uint8_t, AllocU8<A>> bytes(size, 0, allocator_);
346
348
  uint8_t* ptr = bytes.data() + header_size_bytes;
347
349
  uint8_t* end_ptr = ptr + size;
@@ -400,8 +402,8 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
400
402
  }
401
403
 
402
404
  // write the sample items, skipping the gap. Either h_ or r_ may be 0
403
- ptr += S().serialize(ptr, end_ptr - ptr, data_, h_);
404
- ptr += S().serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
405
+ ptr += sd.serialize(ptr, end_ptr - ptr, data_, h_);
406
+ ptr += sd.serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_);
405
407
  }
406
408
 
407
409
  size_t bytes_written = ptr - bytes.data();
@@ -413,7 +415,8 @@ std::vector<uint8_t, AllocU8<A>> var_opt_sketch<T,S,A>::serialize(unsigned heade
413
415
  }
414
416
 
415
417
  template<typename T, typename S, typename A>
416
- void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
418
+ template<typename SerDe>
419
+ void var_opt_sketch<T,S,A>::serialize(std::ostream& os, const SerDe& sd) const {
417
420
  const bool empty = (h_ == 0) && (r_ == 0);
418
421
 
419
422
  const uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY
@@ -469,13 +472,19 @@ void var_opt_sketch<T,S,A>::serialize(std::ostream& os) const {
469
472
  }
470
473
 
471
474
  // write the sample items, skipping the gap. Either h_ or r_ may be 0
472
- S().serialize(os, data_, h_);
473
- S().serialize(os, &data_[h_ + 1], r_);
475
+ sd.serialize(os, data_, h_);
476
+ sd.serialize(os, &data_[h_ + 1], r_);
474
477
  }
475
478
  }
476
479
 
477
480
  template<typename T, typename S, typename A>
478
481
  var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
482
+ return deserialize(bytes, size, S(), allocator);
483
+ }
484
+
485
+ template<typename T, typename S, typename A>
486
+ template<typename SerDe>
487
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
479
488
  ensure_minimum_memory(size, 8);
480
489
  const char* ptr = static_cast<const char*>(bytes);
481
490
  const char* base = ptr;
@@ -559,10 +568,10 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
559
568
  items_deleter deleter(array_size, allocator);
560
569
  std::unique_ptr<T, items_deleter> items(A(allocator).allocate(array_size), deleter);
561
570
 
562
- ptr += S().deserialize(ptr, end_ptr - ptr, items.get(), h);
571
+ ptr += sd.deserialize(ptr, end_ptr - ptr, items.get(), h);
563
572
  items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
564
573
 
565
- ptr += S().deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
574
+ ptr += sd.deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r);
566
575
  items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
567
576
 
568
577
  return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false,
@@ -571,6 +580,12 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(const void* bytes, size
571
580
 
572
581
  template<typename T, typename S, typename A>
573
582
  var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const A& allocator) {
583
+ return deserialize(is, S(), allocator);
584
+ }
585
+
586
+ template<typename T, typename S, typename A>
587
+ template<typename SerDe>
588
+ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
574
589
  const auto first_byte = read<uint8_t>(is);
575
590
  uint8_t preamble_longs = first_byte & 0x3f;
576
591
  const resize_factor rf = static_cast<resize_factor>((first_byte >> 6) & 0x03);
@@ -640,10 +655,10 @@ var_opt_sketch<T,S,A> var_opt_sketch<T,S,A>::deserialize(std::istream& is, const
640
655
  items_deleter deleter(array_size, allocator);
641
656
  std::unique_ptr<T, items_deleter> items(A(allocator).allocate(array_size), deleter);
642
657
 
643
- S().deserialize(is, items.get(), h); // aka &data_[0]
658
+ sd.deserialize(is, items.get(), h); // aka &data_[0]
644
659
  items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid
645
660
 
646
- S().deserialize(is, &(items.get()[h + 1]), r);
661
+ sd.deserialize(is, &(items.get()[h + 1]), r);
647
662
  items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid
648
663
 
649
664
  if (!is.good())
@@ -731,8 +746,10 @@ void var_opt_sketch<T,S,A>::update(T&& item, double weight) {
731
746
 
732
747
  template<typename T, typename S, typename A>
733
748
  string<A> var_opt_sketch<T,S,A>::to_string() const {
734
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
735
- os << "### VarOpt SUMMARY: " << std::endl;
749
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
750
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
751
+ std::ostringstream os;
752
+ os << "### VarOpt SUMMARY:" << std::endl;
736
753
  os << " k : " << k_ << std::endl;
737
754
  os << " h : " << h_ << std::endl;
738
755
  os << " r : " << r_ << std::endl;
@@ -740,24 +757,28 @@ string<A> var_opt_sketch<T,S,A>::to_string() const {
740
757
  os << " Current size : " << curr_items_alloc_ << std::endl;
741
758
  os << " Resize factor: " << (1 << rf_) << std::endl;
742
759
  os << "### END SKETCH SUMMARY" << std::endl;
743
- return os.str();
760
+ return string<A>(os.str().c_str(), allocator_);
744
761
  }
745
762
 
746
763
  template<typename T, typename S, typename A>
747
764
  string<A> var_opt_sketch<T,S,A>::items_to_string() const {
748
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
765
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
766
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
767
+ std::ostringstream os;
749
768
  os << "### Sketch Items" << std::endl;
750
769
  int idx = 0;
751
770
  for (auto record : *this) {
752
771
  os << idx << ": " << record.first << "\twt = " << record.second << std::endl;
753
772
  ++idx;
754
773
  }
755
- return os.str();
774
+ return string<A>(os.str().c_str(), allocator_);
756
775
  }
757
776
 
758
777
  template<typename T, typename S, typename A>
759
778
  string<A> var_opt_sketch<T,S,A>::items_to_string(bool print_gap) const {
760
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
779
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
780
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
781
+ std::ostringstream os;
761
782
  os << "### Sketch Items" << std::endl;
762
783
  const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
763
784
  for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
@@ -774,7 +795,7 @@ string<A> var_opt_sketch<T,S,A>::items_to_string(bool print_gap) const {
774
795
  ++display_idx;
775
796
  }
776
797
  }
777
- return os.str();
798
+ return string<A>(os.str().c_str(), allocator_);
778
799
  }
779
800
 
780
801
  template<typename T, typename S, typename A>
@@ -1677,16 +1698,6 @@ bool var_opt_sketch<T, S, A>::iterator::get_mark() const {
1677
1698
  return sk_->marks_ == nullptr ? false : sk_->marks_[idx_];
1678
1699
  }
1679
1700
 
1680
-
1681
-
1682
- // ******************** MOVE TO COMMON UTILS AREA EVENTUALLY *********************
1683
-
1684
- namespace random_utils {
1685
- static std::random_device rd; // possibly unsafe in MinGW with GCC < 9.2
1686
- static std::mt19937_64 rand(rd());
1687
- static std::uniform_real_distribution<> next_double(0.0, 1.0);
1688
- }
1689
-
1690
1701
  /**
1691
1702
  * Checks if target sampling allocation is more than 50% of max sampling size.
1692
1703
  * If so, returns max sampling size, otherwise passes through target size.
@@ -45,7 +45,11 @@ template<typename A> using AllocU8 = typename std::allocator_traits<A>::template
45
45
  * author Kevin Lang
46
46
  * author Jon Malkin
47
47
  */
48
- template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
48
+ template<
49
+ typename T,
50
+ typename S = serde<T>, // deprecated, to be removed in the next major version
51
+ typename A = std::allocator<T>
52
+ >
49
53
  class var_opt_union {
50
54
 
51
55
  public:
@@ -88,14 +92,16 @@ public:
88
92
  /**
89
93
  * Computes size needed to serialize the current state of the union.
90
94
  * This version is for all other types and can be expensive since every item needs to be looked at.
95
+ * @param instance of a SerDe
91
96
  * @return size in bytes needed to serialize this sketch
92
97
  */
93
- size_t get_serialized_size_bytes() const;
94
-
98
+ template<typename SerDe = S>
99
+ size_t get_serialized_size_bytes(const SerDe& sd = SerDe()) const;
100
+
95
101
  // This is a convenience alias for users
96
102
  // The type returned by the following serialize method
97
103
  typedef vector_u8<A> vector_bytes;
98
-
104
+
99
105
  /**
100
106
  * NOTE: This method may be deprecated in a future version.
101
107
  * This method serializes the sketch as a vector of bytes.
@@ -103,33 +109,62 @@ public:
103
109
  * It is a blank space of a given size.
104
110
  * This header is used in Datasketches PostgreSQL extension.
105
111
  * @param header_size_bytes space to reserve in front of the sketch
112
+ * @param instance of a SerDe
106
113
  */
107
- vector_bytes serialize(unsigned header_size_bytes = 0) const;
114
+ template<typename SerDe = S>
115
+ vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const;
108
116
 
109
117
  /**
110
118
  * NOTE: This method may be deprecated in a future version.
111
119
  * This method serializes the sketch into a given stream in a binary form
112
120
  * @param os output stream
121
+ * @param instance of a SerDe
113
122
  */
114
- void serialize(std::ostream& os) const;
123
+ template<typename SerDe = S>
124
+ void serialize(std::ostream& os, const SerDe& sd = SerDe()) const;
115
125
 
116
126
  /**
117
127
  * NOTE: This method may be deprecated in a future version.
118
128
  * This method deserializes a union from a given stream.
119
129
  * @param is input stream
130
+ * @param instance of an Allocator
120
131
  * @return an instance of a union
121
132
  */
122
133
  static var_opt_union deserialize(std::istream& is, const A& allocator = A());
123
134
 
135
+ /**
136
+ * NOTE: This method may be deprecated in a future version.
137
+ * This method deserializes a union from a given stream.
138
+ * @param is input stream
139
+ * @param instance of a SerDe
140
+ * @param instance of an Allocator
141
+ * @return an instance of a union
142
+ */
143
+ template<typename SerDe = S>
144
+ static var_opt_union deserialize(std::istream& is, const SerDe& sd = SerDe(), const A& allocator = A());
145
+
124
146
  /**
125
147
  * NOTE: This method may be deprecated in a future version.
126
148
  * This method deserializes a union from a given array of bytes.
127
149
  * @param bytes pointer to the array of bytes
128
150
  * @param size the size of the array
151
+ * @param instance of an Allocator
129
152
  * @return an instance of a union
130
153
  */
131
154
  static var_opt_union deserialize(const void* bytes, size_t size, const A& allocator = A());
132
155
 
156
+ /**
157
+ * NOTE: This method may be deprecated in a future version.
158
+ * This method deserializes a union from a given array of bytes.
159
+ * @param bytes pointer to the array of bytes
160
+ * @param size the size of the array
161
+ * @param instance of a SerDe
162
+ * @param instance of an Allocator
163
+ * @return an instance of a union
164
+ */
165
+ template<typename SerDe = S>
166
+ static var_opt_union deserialize(const void* bytes, size_t size, const SerDe& sd = SerDe(), const A& allocator = A());
167
+
133
168
  /**
134
169
  * Prints a summary of the union as a string.
135
170
  * @return the summary as a string
@@ -24,6 +24,7 @@
24
24
 
25
25
  #include <cmath>
26
26
  #include <sstream>
27
+ #include <stdexcept>
27
28
 
28
29
  namespace datasketches {
29
30
 
@@ -129,6 +130,12 @@ var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
129
130
 
130
131
  template<typename T, typename S, typename A>
131
132
  var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A& allocator) {
133
+ return deserialize(is, S(), allocator);
134
+ }
135
+
136
+ template<typename T, typename S, typename A>
137
+ template<typename SerDe>
138
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const SerDe& sd, const A& allocator) {
132
139
  const auto preamble_longs = read<uint8_t>(is);
133
140
  const auto serial_version = read<uint8_t>(is);
134
141
  const auto family_id = read<uint8_t>(is);
@@ -155,7 +162,7 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
155
162
  const auto outer_tau_numer = read<double>(is);
156
163
  const auto outer_tau_denom = read<uint64_t>(is);
157
164
 
158
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, allocator);
165
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is, sd, allocator);
159
166
 
160
167
  if (!is.good())
161
168
  throw std::runtime_error("error reading from std::istream");
@@ -165,6 +172,12 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is, const A
165
172
 
166
173
  template<typename T, typename S, typename A>
167
174
  var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const A& allocator) {
175
+ return deserialize(bytes, size, S(), allocator);
176
+ }
177
+
178
+ template<typename T, typename S, typename A>
179
+ template<typename SerDe>
180
+ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size, const SerDe& sd, const A& allocator) {
168
181
  ensure_minimum_memory(size, 8);
169
182
  const char* ptr = static_cast<const char*>(bytes);
170
183
  uint8_t preamble_longs;
@@ -199,22 +212,24 @@ var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t
199
212
  ptr += copy_from_mem(ptr, outer_tau_denom);
200
213
 
201
214
  const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
202
- var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, allocator);
215
+ var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size, sd, allocator);
203
216
 
204
217
  return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
205
218
  }
206
219
 
207
220
  template<typename T, typename S, typename A>
208
- size_t var_opt_union<T,S,A>::get_serialized_size_bytes() const {
221
+ template<typename SerDe>
222
+ size_t var_opt_union<T,S,A>::get_serialized_size_bytes(const SerDe& sd) const {
209
223
  if (n_ == 0) {
210
224
  return PREAMBLE_LONGS_EMPTY << 3;
211
225
  } else {
212
- return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes();
226
+ return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes(sd);
213
227
  }
214
228
  }
215
229
 
216
230
  template<typename T, typename S, typename A>
217
- void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
231
+ template<typename SerDe>
232
+ void var_opt_union<T,S,A>::serialize(std::ostream& os, const SerDe& sd) const {
218
233
  bool empty = (n_ == 0);
219
234
 
220
235
  const uint8_t serialization_version(SER_VER);
@@ -240,13 +255,14 @@ void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
240
255
  write(os, n_);
241
256
  write(os, outer_tau_numer_);
242
257
  write(os, outer_tau_denom_);
243
- gadget_.serialize(os);
258
+ gadget_.serialize(os, sd);
244
259
  }
245
260
  }
246
261
 
247
262
  template<typename T, typename S, typename A>
248
- std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
249
- const size_t size = header_size_bytes + get_serialized_size_bytes();
263
+ template<typename SerDe>
264
+ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes, const SerDe& sd) const {
265
+ const size_t size = header_size_bytes + get_serialized_size_bytes(sd);
250
266
  std::vector<uint8_t, AllocU8<A>> bytes(size, 0, gadget_.allocator_);
251
267
  uint8_t* ptr = bytes.data() + header_size_bytes;
252
268
 
@@ -278,7 +294,7 @@ std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header
278
294
  ptr += copy_to_mem(outer_tau_numer_, ptr);
279
295
  ptr += copy_to_mem(outer_tau_denom_, ptr);
280
296
 
281
- auto gadget_bytes = gadget_.serialize();
297
+ auto gadget_bytes = gadget_.serialize(0, sd);
282
298
  ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
283
299
  }
284
300
 
@@ -295,14 +311,16 @@ void var_opt_union<T,S,A>::reset() {
295
311
 
296
312
  template<typename T, typename S, typename A>
297
313
  string<A> var_opt_union<T,S,A>::to_string() const {
298
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
299
- os << "### VarOpt Union SUMMARY: " << std::endl;
300
- os << " . n : " << n_ << std::endl;
314
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
315
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
316
+ std::ostringstream os;
317
+ os << "### VarOpt Union SUMMARY:" << std::endl;
318
+ os << " n : " << n_ << std::endl;
301
319
  os << " Max k : " << max_k_ << std::endl;
302
- os << " Gadget Summary: " << std::endl;
320
+ os << " Gadget Summary:" << std::endl;
303
321
  os << gadget_.to_string();
304
- os << "### END VarOpt Union SUMMARY: " << std::endl;
305
- return os.str();
322
+ os << "### END VarOpt Union SUMMARY" << std::endl;
323
+ return string<A>(os.str().c_str(), gadget_.allocator_);
306
324
  }
307
325
 
308
326
  template<typename T, typename S, typename A>
@@ -39,7 +39,7 @@ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
39
39
  var_opt_test_sketch sk1(10, var_opt_test_sketch::DEFAULT_RESIZE_FACTOR, 0);
40
40
  for (int i = 0; i < 100; ++i) sk1.update(i);
41
41
  auto bytes1 = sk1.serialize();
42
- auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), 0);
42
+ auto sk2 = var_opt_test_sketch::deserialize(bytes1.data(), bytes1.size(), test_type_serde(), 0);
43
43
 
44
44
  std::stringstream ss;
45
45
  sk1.serialize(ss);
@@ -51,7 +51,7 @@ TEST_CASE("varopt allocation test", "[var_opt_sketch]") {
51
51
  u1.update(sk3);
52
52
 
53
53
  auto bytes2 = u1.serialize();
54
- auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), 0);
54
+ auto u2 = var_opt_test_union::deserialize(bytes2.data(), bytes2.size(), test_type_serde(), 0);
55
55
  }
56
56
  REQUIRE(test_allocator_total_bytes == 0);
57
57
  REQUIRE(test_allocator_net_allocations == 0);
@@ -27,6 +27,7 @@
27
27
  #include <fstream>
28
28
  #include <cmath>
29
29
  #include <random>
30
+ #include <stdexcept>
30
31
 
31
32
  #ifdef TEST_BINARY_INPUT_PATH
32
33
  static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
@@ -27,6 +27,7 @@
27
27
  #include <fstream>
28
28
  #include <cmath>
29
29
  #include <random>
30
+ #include <stdexcept>
30
31
 
31
32
  #ifdef TEST_BINARY_INPUT_PATH
32
33
  static std::string testBinaryInputPath = TEST_BINARY_INPUT_PATH;
@@ -81,7 +81,7 @@ class CMakeBuild(build_ext):
81
81
 
82
82
  setup(
83
83
  name='datasketches',
84
- version='3.2.0.1',
84
+ version='3.5.0',
85
85
  author='Apache Software Foundation',
86
86
  author_email='dev@datasketches.apache.org',
87
87
  description='The Apache DataSketches Library for Python',
@@ -32,53 +32,34 @@ target_include_directories(theta
32
32
  target_link_libraries(theta INTERFACE common)
33
33
  target_compile_features(theta INTERFACE cxx_std_11)
34
34
 
35
- set(theta_HEADERS "")
36
- list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
37
- list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
38
- list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
39
- list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
40
- list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
41
- list(APPEND theta_HEADERS "include/theta_comparators.hpp")
42
- list(APPEND theta_HEADERS "include/theta_constants.hpp")
43
- list(APPEND theta_HEADERS "include/theta_helpers.hpp")
44
- list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
45
- list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
46
- list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
47
- list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
48
- list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
49
- list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
50
- list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
51
-
52
35
  install(TARGETS theta
53
36
  EXPORT ${PROJECT_NAME}
54
37
  )
55
38
 
56
- install(FILES ${theta_HEADERS}
39
+ install(FILES
40
+ include/theta_sketch.hpp
41
+ include/theta_sketch_impl.hpp
42
+ include/theta_union.hpp
43
+ include/theta_union_impl.hpp
44
+ include/theta_intersection.hpp
45
+ include/theta_intersection_impl.hpp
46
+ include/theta_a_not_b.hpp
47
+ include/theta_a_not_b_impl.hpp
48
+ include/theta_jaccard_similarity.hpp
49
+ include/theta_comparators.hpp
50
+ include/theta_constants.hpp
51
+ include/theta_helpers.hpp
52
+ include/theta_update_sketch_base.hpp
53
+ include/theta_update_sketch_base_impl.hpp
54
+ include/theta_union_base.hpp
55
+ include/theta_union_base_impl.hpp
56
+ include/theta_intersection_base.hpp
57
+ include/theta_intersection_base_impl.hpp
58
+ include/theta_set_difference_base.hpp
59
+ include/theta_set_difference_base_impl.hpp
60
+ include/theta_jaccard_similarity_base.hpp
61
+ include/bounds_on_ratios_in_sampled_sets.hpp
62
+ include/bounds_on_ratios_in_theta_sketched_sets.hpp
63
+ include/compact_theta_sketch_parser.hpp
64
+ include/compact_theta_sketch_parser_impl.hpp
57
65
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
58
-
59
- target_sources(theta
60
- INTERFACE
61
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch.hpp
62
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union.hpp
63
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection.hpp
64
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b.hpp
65
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_impl.hpp
66
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
67
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
68
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
69
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
70
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
71
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
72
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
73
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
74
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
75
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
76
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
77
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
78
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
79
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
80
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
81
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
82
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
83
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
84
- )
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <cstdint>
24
24
  #include <string>
25
+ #include <stdexcept>
25
26
 
26
27
  #include "bounds_binomial_proportions.hpp"
27
28