datasketches 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +4 -0
  3. data/README.md +7 -7
  4. data/ext/datasketches/theta_wrapper.cpp +20 -4
  5. data/lib/datasketches/version.rb +1 -1
  6. data/vendor/datasketches-cpp/CMakeLists.txt +22 -3
  7. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  8. data/vendor/datasketches-cpp/README.md +76 -9
  9. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  10. data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
  11. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  12. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  13. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -6
  14. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  15. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  16. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +4 -2
  17. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  18. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  19. data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
  20. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +4 -2
  21. data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
  22. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +4 -2
  23. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  24. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +13 -7
  25. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +8 -6
  26. data/vendor/datasketches-cpp/setup.py +1 -1
  27. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  28. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +89 -22
  29. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  30. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
  31. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
  32. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  33. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +146 -51
  34. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  35. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  36. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +8 -2
  37. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  38. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
  39. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -9
  40. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  41. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  42. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  43. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  44. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  45. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  46. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
  47. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +400 -0
  48. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +23 -11
  49. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  50. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  51. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  52. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  53. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  54. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
  55. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -14
  56. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  57. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  58. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  59. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  60. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +7 -0
  61. metadata +11 -6
  62. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  63. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -32,27 +32,17 @@ target_include_directories(kll
32
32
  target_link_libraries(kll INTERFACE common)
33
33
  target_compile_features(kll INTERFACE cxx_std_11)
34
34
 
35
- set(kll_HEADERS "")
36
- list(APPEND kll_HEADERS "include/kll_sketch.hpp")
37
- list(APPEND kll_HEADERS "include/kll_sketch_impl.hpp")
38
- list(APPEND kll_HEADERS "include/kll_helper.hpp")
39
- list(APPEND kll_HEADERS "include/kll_helper_impl.hpp")
40
- list(APPEND kll_HEADERS "include/kll_quantile_calculator.hpp")
41
- list(APPEND kll_HEADERS "include/kll_quantile_calculator_impl.hpp")
42
-
43
35
  install(TARGETS kll
44
36
  EXPORT ${PROJECT_NAME}
45
37
  )
46
38
 
47
- install(FILES ${kll_HEADERS}
39
+ install(FILES
40
+ include/kll_sketch.hpp
41
+ include/kll_sketch_impl.hpp
42
+ include/kll_helper.hpp
43
+ include/kll_helper_impl.hpp
44
+ include/kll_quantile_calculator.hpp
45
+ include/kll_quantile_calculator_impl.hpp
46
+ include/kolmogorov_smirnov.hpp
47
+ include/kolmogorov_smirnov_impl.hpp
48
48
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
49
-
50
- target_sources(kll
51
- INTERFACE
52
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper.hpp
53
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper_impl.hpp
54
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch.hpp
55
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch_impl.hpp
56
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator.hpp
57
- ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator_impl.hpp
58
- )
@@ -1023,7 +1023,9 @@ void kll_sketch<T, C, S, A>::check_family_id(uint8_t family_id) {
1023
1023
 
1024
1024
  template <typename T, typename C, typename S, typename A>
1025
1025
  string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
1026
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
1026
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
1027
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
1028
+ std::ostringstream os;
1027
1029
  os << "### KLL sketch summary:" << std::endl;
1028
1030
  os << " K : " << k_ << std::endl;
1029
1031
  os << " min K : " << min_k_ << std::endl;
@@ -1069,7 +1071,7 @@ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
1069
1071
  }
1070
1072
  os << "### End sketch data" << std::endl;
1071
1073
  }
1072
- return os.str();
1074
+ return string<A>(os.str().c_str(), allocator_);
1073
1075
  }
1074
1076
 
1075
1077
  template <typename T, typename C, typename S, typename A>
@@ -32,29 +32,16 @@ target_include_directories(req
32
32
  target_link_libraries(req INTERFACE common)
33
33
  target_compile_features(req INTERFACE cxx_std_11)
34
34
 
35
- set(req_HEADERS "")
36
- list(APPEND req_HEADERS "include/req_common.hpp")
37
- list(APPEND req_HEADERS "include/req_sketch.hpp")
38
- list(APPEND req_HEADERS "include/req_sketch_impl.hpp")
39
- list(APPEND req_HEADERS "include/req_compactor.hpp")
40
- list(APPEND req_HEADERS "include/req_compactor_impl.hpp")
41
- list(APPEND req_HEADERS "include/req_quantile_calculator.hpp")
42
- list(APPEND req_HEADERS "include/req_quantile_calculator_impl.hpp")
43
-
44
35
  install(TARGETS req
45
36
  EXPORT ${PROJECT_NAME}
46
37
  )
47
38
 
48
- install(FILES ${req_HEADERS}
39
+ install(FILES
40
+ include/req_common.hpp
41
+ include/req_sketch.hpp
42
+ include/req_sketch_impl.hpp
43
+ include/req_compactor.hpp
44
+ include/req_compactor_impl.hpp
45
+ include/req_quantile_calculator.hpp
46
+ include/req_quantile_calculator_impl.hpp
49
47
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
50
-
51
- target_sources(req
52
- INTERFACE
53
- ${CMAKE_CURRENT_SOURCE_DIR}/include/req_common.hpp
54
- ${CMAKE_CURRENT_SOURCE_DIR}/include/req_sketch.hpp
55
- ${CMAKE_CURRENT_SOURCE_DIR}/include/req_sketch_impl.hpp
56
- ${CMAKE_CURRENT_SOURCE_DIR}/include/req_compactor.hpp
57
- ${CMAKE_CURRENT_SOURCE_DIR}/include/req_compactor_impl.hpp
58
- ${CMAKE_CURRENT_SOURCE_DIR}/include/req_quantile_calculator.hpp
59
- ${CMAKE_CURRENT_SOURCE_DIR}/include/req_quantile_calculator_impl.hpp
60
- )
@@ -653,7 +653,9 @@ void req_sketch<T, C, S, A>::compress() {
653
653
 
654
654
  template<typename T, typename C, typename S, typename A>
655
655
  string<A> req_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
656
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
656
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
657
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
658
+ std::ostringstream os;
657
659
  os << "### REQ sketch summary:" << std::endl;
658
660
  os << " K : " << k_ << std::endl;
659
661
  os << " High Rank Acc : " << (hra_ ? "true" : "false") << std::endl;
@@ -693,7 +695,7 @@ string<A> req_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
693
695
  }
694
696
  os << "### End sketch data" << std::endl;
695
697
  }
696
- return os.str();
698
+ return string<A>(os.str().c_str(), allocator_);
697
699
  }
698
700
 
699
701
  template<typename T, typename C, typename S, typename A>
@@ -32,17 +32,13 @@ target_include_directories(sampling
32
32
  target_link_libraries(sampling INTERFACE common)
33
33
  target_compile_features(sampling INTERFACE cxx_std_11)
34
34
 
35
- set(sampling_HEADERS "include/var_opt_sketch.hpp;include/var_opt_sketch_impl.hpp")
36
-
37
35
  install(TARGETS sampling
38
36
  EXPORT ${PROJECT_NAME}
39
37
  )
40
38
 
41
- install(FILES ${sampling_HEADERS}
39
+ install(FILES
40
+ include/var_opt_sketch.hpp
41
+ include/var_opt_sketch_impl.hpp
42
+ include/var_opt_union.hpp
43
+ include/var_opt_union_impl.hpp
42
44
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
43
-
44
- target_sources(sampling
45
- INTERFACE
46
- ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch.hpp
47
- ${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch_impl.hpp
48
- )
@@ -731,8 +731,10 @@ void var_opt_sketch<T,S,A>::update(T&& item, double weight) {
731
731
 
732
732
  template<typename T, typename S, typename A>
733
733
  string<A> var_opt_sketch<T,S,A>::to_string() const {
734
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
735
- os << "### VarOpt SUMMARY: " << std::endl;
734
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
735
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
736
+ std::ostringstream os;
737
+ os << "### VarOpt SUMMARY:" << std::endl;
736
738
  os << " k : " << k_ << std::endl;
737
739
  os << " h : " << h_ << std::endl;
738
740
  os << " r : " << r_ << std::endl;
@@ -740,24 +742,28 @@ string<A> var_opt_sketch<T,S,A>::to_string() const {
740
742
  os << " Current size : " << curr_items_alloc_ << std::endl;
741
743
  os << " Resize factor: " << (1 << rf_) << std::endl;
742
744
  os << "### END SKETCH SUMMARY" << std::endl;
743
- return os.str();
745
+ return string<A>(os.str().c_str(), allocator_);
744
746
  }
745
747
 
746
748
  template<typename T, typename S, typename A>
747
749
  string<A> var_opt_sketch<T,S,A>::items_to_string() const {
748
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
750
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
751
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
752
+ std::ostringstream os;
749
753
  os << "### Sketch Items" << std::endl;
750
754
  int idx = 0;
751
755
  for (auto record : *this) {
752
756
  os << idx << ": " << record.first << "\twt = " << record.second << std::endl;
753
757
  ++idx;
754
758
  }
755
- return os.str();
759
+ return string<A>(os.str().c_str(), allocator_);
756
760
  }
757
761
 
758
762
  template<typename T, typename S, typename A>
759
763
  string<A> var_opt_sketch<T,S,A>::items_to_string(bool print_gap) const {
760
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
764
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
765
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
766
+ std::ostringstream os;
761
767
  os << "### Sketch Items" << std::endl;
762
768
  const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
763
769
  for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
@@ -774,7 +780,7 @@ string<A> var_opt_sketch<T,S,A>::items_to_string(bool print_gap) const {
774
780
  ++display_idx;
775
781
  }
776
782
  }
777
- return os.str();
783
+ return string<A>(os.str().c_str(), allocator_);
778
784
  }
779
785
 
780
786
  template<typename T, typename S, typename A>
@@ -295,14 +295,16 @@ void var_opt_union<T,S,A>::reset() {
295
295
 
296
296
  template<typename T, typename S, typename A>
297
297
  string<A> var_opt_union<T,S,A>::to_string() const {
298
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
299
- os << "### VarOpt Union SUMMARY: " << std::endl;
300
- os << " . n : " << n_ << std::endl;
298
+ // Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
299
+ // The stream does not support passing an allocator instance, and alternatives are complicated.
300
+ std::ostringstream os;
301
+ os << "### VarOpt Union SUMMARY:" << std::endl;
302
+ os << " n : " << n_ << std::endl;
301
303
  os << " Max k : " << max_k_ << std::endl;
302
- os << " Gadget Summary: " << std::endl;
304
+ os << " Gadget Summary:" << std::endl;
303
305
  os << gadget_.to_string();
304
- os << "### END VarOpt Union SUMMARY: " << std::endl;
305
- return os.str();
306
+ os << "### END VarOpt Union SUMMARY" << std::endl;
307
+ return string<A>(os.str().c_str(), gadget_.allocator_);
306
308
  }
307
309
 
308
310
  template<typename T, typename S, typename A>
@@ -81,7 +81,7 @@ class CMakeBuild(build_ext):
81
81
 
82
82
  setup(
83
83
  name='datasketches',
84
- version='3.2.0.1',
84
+ version='3.3.0',
85
85
  author='Apache Software Foundation',
86
86
  author_email='dev@datasketches.apache.org',
87
87
  description='The Apache DataSketches Library for Python',
@@ -32,53 +32,34 @@ target_include_directories(theta
32
32
  target_link_libraries(theta INTERFACE common)
33
33
  target_compile_features(theta INTERFACE cxx_std_11)
34
34
 
35
- set(theta_HEADERS "")
36
- list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
37
- list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
38
- list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
39
- list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
40
- list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
41
- list(APPEND theta_HEADERS "include/theta_comparators.hpp")
42
- list(APPEND theta_HEADERS "include/theta_constants.hpp")
43
- list(APPEND theta_HEADERS "include/theta_helpers.hpp")
44
- list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
45
- list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
46
- list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
47
- list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
48
- list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
49
- list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
50
- list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
51
-
52
35
  install(TARGETS theta
53
36
  EXPORT ${PROJECT_NAME}
54
37
  )
55
38
 
56
- install(FILES ${theta_HEADERS}
39
+ install(FILES
40
+ include/theta_sketch.hpp
41
+ include/theta_sketch_impl.hpp
42
+ include/theta_union.hpp
43
+ include/theta_union_impl.hpp
44
+ include/theta_intersection.hpp
45
+ include/theta_intersection_impl.hpp
46
+ include/theta_a_not_b.hpp
47
+ include/theta_a_not_b_impl.hpp
48
+ include/theta_jaccard_similarity.hpp
49
+ include/theta_comparators.hpp
50
+ include/theta_constants.hpp
51
+ include/theta_helpers.hpp
52
+ include/theta_update_sketch_base.hpp
53
+ include/theta_update_sketch_base_impl.hpp
54
+ include/theta_union_base.hpp
55
+ include/theta_union_base_impl.hpp
56
+ include/theta_intersection_base.hpp
57
+ include/theta_intersection_base_impl.hpp
58
+ include/theta_set_difference_base.hpp
59
+ include/theta_set_difference_base_impl.hpp
60
+ include/theta_jaccard_similarity_base.hpp
61
+ include/bounds_on_ratios_in_sampled_sets.hpp
62
+ include/bounds_on_ratios_in_theta_sketched_sets.hpp
63
+ include/compact_theta_sketch_parser.hpp
64
+ include/compact_theta_sketch_parser_impl.hpp
57
65
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
58
-
59
- target_sources(theta
60
- INTERFACE
61
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch.hpp
62
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union.hpp
63
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection.hpp
64
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b.hpp
65
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_impl.hpp
66
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
67
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
68
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
69
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
70
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
71
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
72
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
73
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
74
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
75
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
76
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
77
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
78
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
79
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
80
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
81
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
82
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
83
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
84
- )
@@ -29,32 +29,99 @@ template<bool dummy>
29
29
  auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
30
30
  if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size)
31
31
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
32
- checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
33
- checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
34
- uint64_t theta = theta_constants::MAX_THETA;
35
- const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
36
- checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
37
- if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
38
- return {true, true, seed_hash, 0, theta, nullptr};
32
+
33
+ uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
34
+
35
+ switch(serial_version) {
36
+ case COMPACT_SKETCH_SERIAL_VERSION: {
37
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
38
+ uint64_t theta = theta_constants::MAX_THETA;
39
+ const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
40
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
41
+ return {true, true, seed_hash, 0, theta, nullptr};
42
+ }
43
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
44
+ const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
45
+ if (has_theta) {
46
+ if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
47
+ theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
48
+ }
49
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
50
+ return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
51
+ }
52
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
53
+ const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
54
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
55
+ const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
56
+ if (size < expected_size_bytes) {
57
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
58
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
59
+ }
60
+ const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
61
+ return {false, is_ordered, seed_hash, num_entries, theta, entries};
39
62
  }
40
- const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
41
- if (has_theta) {
42
- if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
43
- theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
63
+ case 1: {
64
+ uint16_t seed_hash = compute_seed_hash(seed);
65
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
66
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
67
+ uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
68
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
69
+ if (is_empty) {
70
+ return {true, true, seed_hash, 0, theta, nullptr};
71
+ }
72
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
73
+ const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
74
+ if (size < expected_size_bytes) {
75
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
76
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
77
+ }
78
+ return {false, true, seed_hash, num_entries, theta, entries};
44
79
  }
45
- if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
46
- return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
80
+ case 2: {
81
+ uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
82
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
83
+ const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
84
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
85
+ if (preamble_size == 1) {
86
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
87
+ } else if (preamble_size == 2) {
88
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
89
+ if (num_entries == 0) {
90
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
91
+ } else {
92
+ const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
93
+ if (size < expected_size_bytes) {
94
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
95
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
96
+ }
97
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
98
+ return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries};
99
+ }
100
+ } else if (preamble_size == 3) {
101
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
102
+ uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
103
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
104
+ if (is_empty) {
105
+ return {true, true, seed_hash, 0, theta, nullptr};
106
+ }
107
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
108
+ const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
109
+ if (size < expected_size_bytes) {
110
+ throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
111
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
112
+ }
113
+ return {false, true, seed_hash, num_entries, theta, entries};
114
+ } else {
115
+ throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
116
+ }
47
117
  }
48
- const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
49
- const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
50
- const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
51
- const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
52
- if (size < expected_size_bytes) {
53
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
54
- + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
118
+ default:
119
+ // this should always fail since the valid cases are handled above
120
+ checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
121
+ // this throw is never reached, because check_serial_version will throw an informative exception.
122
+ // This is only here to avoid a compiler warning about a path without a return value.
123
+ throw std::invalid_argument("unexpected sketch serialization version");
55
124
  }
56
- const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
57
- return {false, is_ordered, seed_hash, num_entries, theta, entries};
58
125
  }
59
126
 
60
127
  template<bool dummy>
@@ -49,6 +49,21 @@ public:
49
49
  }
50
50
  };
51
51
 
52
+ template<bool dummy>
53
+ class theta_build_helper{
54
+ public:
55
+ // consistent way of initializing theta from p
56
+ // avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
57
+ static uint64_t starting_theta_from_p(float p) {
58
+ if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
59
+ return theta_constants::MAX_THETA;
60
+ }
61
+
62
+ static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
63
+ return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
64
+ }
65
+ };
66
+
52
67
  } /* namespace datasketches */
53
68
 
54
69
  #endif
@@ -29,7 +29,7 @@ template<typename EN, typename EK, typename P, typename S, typename CS, typename
29
29
  theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
30
30
  policy_(policy),
31
31
  is_valid_(false),
32
- table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
32
+ table_(0, 0, resize_factor::X1, 1, theta_constants::MAX_THETA, seed, allocator, false)
33
33
  {}
34
34
 
35
35
  template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
@@ -38,17 +38,17 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
38
38
  if (table_.is_empty_) return;
39
39
  if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
40
40
  table_.is_empty_ |= sketch.is_empty();
41
- table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
41
+ table_.theta_ = table_.is_empty_ ? theta_constants::MAX_THETA : std::min(table_.theta_, sketch.get_theta64());
42
42
  if (is_valid_ && table_.num_entries_ == 0) return;
43
43
  if (sketch.get_num_retained() == 0) {
44
44
  is_valid_ = true;
45
- table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
45
+ table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
46
46
  return;
47
47
  }
48
48
  if (!is_valid_) { // first update, copy or move incoming sketch
49
49
  is_valid_ = true;
50
50
  const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
51
- table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
51
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
52
52
  for (auto& entry: sketch) {
53
53
  auto result = table_.find(EK()(entry));
54
54
  if (result.second) {
@@ -83,11 +83,11 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
83
83
  throw std::invalid_argument(" fewer keys than expected, possibly corrupted input sketch");
84
84
  }
85
85
  if (match_count == 0) {
86
- table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
86
+ table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
87
87
  if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
88
88
  } else {
89
89
  const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
90
- table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
90
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
91
91
  for (uint32_t i = 0; i < match_count; i++) {
92
92
  auto result = table_.find(EK()(matched_entries[i]));
93
93
  table_.insert(result.first, std::move(matched_entries[i]));
@@ -36,7 +36,7 @@ seed_hash_(compute_seed_hash(seed))
36
36
  template<typename EN, typename EK, typename CS, typename A>
37
37
  template<typename FwdSketch, typename Sketch>
38
38
  CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
39
- if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return CS(a, ordered);
39
+ if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered);
40
40
  if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
41
41
  if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
42
42
 
@@ -53,7 +53,7 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
53
53
  conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
54
54
  } else { // hash-based
55
55
  const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
56
- hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
56
+ hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here
57
57
  for (const auto& entry: b) {
58
58
  const uint64_t hash = EK()(entry);
59
59
  if (hash < theta) {
@@ -25,14 +25,10 @@
25
25
  namespace datasketches {
26
26
 
27
27
  template<typename Allocator = std::allocator<uint64_t>>
28
- class theta_sketch_alloc {
28
+ class base_theta_sketch_alloc {
29
29
  public:
30
- using Entry = uint64_t;
31
- using ExtractKey = trivial_extract_key;
32
- using iterator = theta_iterator<Entry, ExtractKey>;
33
- using const_iterator = theta_const_iterator<Entry, ExtractKey>;
34
30
 
35
- virtual ~theta_sketch_alloc() = default;
31
+ virtual ~base_theta_sketch_alloc() = default;
36
32
 
37
33
  /**
38
34
  * @return allocator
@@ -104,6 +100,21 @@ public:
104
100
  */
105
101
  virtual string<Allocator> to_string(bool print_items = false) const;
106
102
 
103
+ protected:
104
+ virtual void print_specifics(std::ostringstream& os) const = 0;
105
+ virtual void print_items(std::ostringstream& os) const = 0;
106
+ };
107
+
108
+ template<typename Allocator = std::allocator<uint64_t>>
109
+ class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
110
+ public:
111
+ using Entry = uint64_t;
112
+ using ExtractKey = trivial_extract_key;
113
+ using iterator = theta_iterator<Entry, ExtractKey>;
114
+ using const_iterator = theta_const_iterator<Entry, ExtractKey>;
115
+
116
+ virtual ~theta_sketch_alloc() = default;
117
+
107
118
  /**
108
119
  * Iterator over hash values in this sketch.
109
120
  * @return begin iterator
@@ -131,8 +142,7 @@ public:
131
142
  virtual const_iterator end() const = 0;
132
143
 
133
144
  protected:
134
- using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
135
- virtual void print_specifics(ostrstream& os) const = 0;
145
+ virtual void print_items(std::ostringstream& os) const;
136
146
  };
137
147
 
138
148
  // forward declaration
@@ -269,6 +279,11 @@ public:
269
279
  */
270
280
  void trim();
271
281
 
282
+ /**
283
+ * Reset the sketch to the initial empty state
284
+ */
285
+ void reset();
286
+
272
287
  /**
273
288
  * Converts this sketch to a compact sketch (ordered or unordered).
274
289
  * @param ordered optional flag to specify if ordered sketch should be produced
@@ -285,11 +300,10 @@ private:
285
300
  theta_table table_;
286
301
 
287
302
  // for builder
288
- update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
289
- uint64_t seed, const Allocator& allocator);
303
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
304
+ uint64_t theta, uint64_t seed, const Allocator& allocator);
290
305
 
291
- using ostrstream = typename Base::ostrstream;
292
- virtual void print_specifics(ostrstream& os) const;
306
+ virtual void print_specifics(std::ostringstream& os) const;
293
307
  };
294
308
 
295
309
  // compact sketch
@@ -377,8 +391,7 @@ private:
377
391
  uint64_t theta_;
378
392
  std::vector<uint64_t, Allocator> entries_;
379
393
 
380
- using ostrstream = typename Base::ostrstream;
381
- virtual void print_specifics(ostrstream& os) const;
394
+ virtual void print_specifics(std::ostringstream& os) const;
382
395
  };
383
396
 
384
397
  template<typename Allocator>
@@ -392,7 +405,7 @@ public:
392
405
  // It does not take the ownership of the buffer.
393
406
 
394
407
  template<typename Allocator = std::allocator<uint64_t>>
395
- class wrapped_compact_theta_sketch_alloc {
408
+ class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
396
409
  public:
397
410
  using const_iterator = const uint64_t*;
398
411
 
@@ -415,6 +428,10 @@ public:
415
428
  */
416
429
  static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
417
430
 
431
+ protected:
432
+ virtual void print_specifics(std::ostringstream& os) const;
433
+ virtual void print_items(std::ostringstream& os) const;
434
+
418
435
  private:
419
436
  bool is_empty_;
420
437
  bool is_ordered_;