datasketches 0.2.2 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/LICENSE +40 -3
  4. data/NOTICE +1 -1
  5. data/README.md +8 -8
  6. data/ext/datasketches/kll_wrapper.cpp +5 -1
  7. data/ext/datasketches/theta_wrapper.cpp +20 -4
  8. data/lib/datasketches/version.rb +1 -1
  9. data/vendor/datasketches-cpp/CMakeLists.txt +27 -5
  10. data/vendor/datasketches-cpp/LICENSE +40 -3
  11. data/vendor/datasketches-cpp/MANIFEST.in +3 -0
  12. data/vendor/datasketches-cpp/NOTICE +1 -1
  13. data/vendor/datasketches-cpp/README.md +76 -9
  14. data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
  15. data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
  16. data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
  17. data/vendor/datasketches-cpp/common/include/common_defs.hpp +16 -0
  18. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
  19. data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
  20. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
  21. data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
  22. data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
  23. data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
  24. data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
  25. data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
  26. data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
  27. data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +1 -1
  28. data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
  29. data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
  30. data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
  31. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
  32. data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
  33. data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
  34. data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
  35. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
  36. data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +30 -12
  37. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
  38. data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
  39. data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
  40. data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
  41. data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
  42. data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
  43. data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
  44. data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
  45. data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
  46. data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
  47. data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
  48. data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
  49. data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
  50. data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
  51. data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
  52. data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
  53. data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
  54. data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
  55. data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
  56. data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
  57. data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +103 -44
  58. data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +110 -130
  59. data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +156 -23
  60. data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
  61. data/vendor/datasketches-cpp/pyproject.toml +4 -2
  62. data/vendor/datasketches-cpp/python/CMakeLists.txt +17 -6
  63. data/vendor/datasketches-cpp/python/README.md +57 -50
  64. data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
  65. data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
  66. data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
  67. data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
  68. data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +49 -14
  69. data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
  70. data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
  71. data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
  72. data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +2 -2
  73. data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +12 -5
  74. data/vendor/datasketches-cpp/python/tests/kll_test.py +12 -6
  75. data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
  76. data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
  77. data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
  78. data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
  79. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
  80. data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
  81. data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
  82. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
  83. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
  84. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
  85. data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
  86. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
  87. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
  88. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
  89. data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
  90. data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
  91. data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
  92. data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
  93. data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
  94. data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
  95. data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
  96. data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
  97. data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +66 -61
  98. data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
  99. data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
  100. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +54 -12
  101. data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +45 -34
  102. data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
  103. data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
  104. data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
  105. data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
  106. data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
  107. data/vendor/datasketches-cpp/setup.py +10 -7
  108. data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
  109. data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
  110. data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
  111. data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
  112. data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
  113. data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
  114. data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
  115. data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
  116. data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
  117. data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
  118. data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
  119. data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
  120. data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
  121. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +9 -5
  122. data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +39 -10
  123. data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
  124. data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
  125. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
  126. data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
  127. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
  128. data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
  129. data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
  130. data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
  131. data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
  132. data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
  133. data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
  134. data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
  135. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
  136. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
  137. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
  138. data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
  139. data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
  140. data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
  141. data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
  142. data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
  143. data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
  144. data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
  145. data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
  146. data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
  147. data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
  148. metadata +34 -12
  149. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
  150. data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
  151. data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
  152. data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
  153. data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
  154. data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -32,53 +32,34 @@ target_include_directories(theta
32
32
  target_link_libraries(theta INTERFACE common)
33
33
  target_compile_features(theta INTERFACE cxx_std_11)
34
34
 
35
- set(theta_HEADERS "")
36
- list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
37
- list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
38
- list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
39
- list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
40
- list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
41
- list(APPEND theta_HEADERS "include/theta_comparators.hpp")
42
- list(APPEND theta_HEADERS "include/theta_constants.hpp")
43
- list(APPEND theta_HEADERS "include/theta_helpers.hpp")
44
- list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
45
- list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
46
- list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
47
- list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
48
- list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
49
- list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
50
- list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
51
-
52
35
  install(TARGETS theta
53
36
  EXPORT ${PROJECT_NAME}
54
37
  )
55
38
 
56
- install(FILES ${theta_HEADERS}
39
+ install(FILES
40
+ include/theta_sketch.hpp
41
+ include/theta_sketch_impl.hpp
42
+ include/theta_union.hpp
43
+ include/theta_union_impl.hpp
44
+ include/theta_intersection.hpp
45
+ include/theta_intersection_impl.hpp
46
+ include/theta_a_not_b.hpp
47
+ include/theta_a_not_b_impl.hpp
48
+ include/theta_jaccard_similarity.hpp
49
+ include/theta_comparators.hpp
50
+ include/theta_constants.hpp
51
+ include/theta_helpers.hpp
52
+ include/theta_update_sketch_base.hpp
53
+ include/theta_update_sketch_base_impl.hpp
54
+ include/theta_union_base.hpp
55
+ include/theta_union_base_impl.hpp
56
+ include/theta_intersection_base.hpp
57
+ include/theta_intersection_base_impl.hpp
58
+ include/theta_set_difference_base.hpp
59
+ include/theta_set_difference_base_impl.hpp
60
+ include/theta_jaccard_similarity_base.hpp
61
+ include/bounds_on_ratios_in_sampled_sets.hpp
62
+ include/bounds_on_ratios_in_theta_sketched_sets.hpp
63
+ include/compact_theta_sketch_parser.hpp
64
+ include/compact_theta_sketch_parser_impl.hpp
57
65
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
58
-
59
- target_sources(theta
60
- INTERFACE
61
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch.hpp
62
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union.hpp
63
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection.hpp
64
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b.hpp
65
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_impl.hpp
66
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
67
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
68
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
69
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
70
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
71
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
72
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
73
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
74
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
75
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
76
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
77
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
78
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
79
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
80
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
81
- ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
82
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
83
- ${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
84
- )
@@ -22,6 +22,7 @@
22
22
 
23
23
  #include <cstdint>
24
24
  #include <string>
25
+ #include <stdexcept>
25
26
 
26
27
  #include "bounds_binomial_proportions.hpp"
27
28
 
@@ -22,39 +22,108 @@
22
22
 
23
23
  #include <iostream>
24
24
  #include <iomanip>
25
+ #include <stdexcept>
25
26
 
26
27
  namespace datasketches {
27
28
 
28
29
  template<bool dummy>
29
30
  auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
30
- if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size)
31
+ if (size < 8) throw std::out_of_range("at least 8 bytes expected, actual " + std::to_string(size)
31
32
  + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
32
- checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
33
- checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
34
- uint64_t theta = theta_constants::MAX_THETA;
35
- const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
36
- checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
37
- if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
38
- return {true, true, seed_hash, 0, theta, nullptr};
33
+
34
+ uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
35
+
36
+ switch(serial_version) {
37
+ case COMPACT_SKETCH_SERIAL_VERSION: {
38
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
39
+ uint64_t theta = theta_constants::MAX_THETA;
40
+ const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
41
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
42
+ return {true, true, seed_hash, 0, theta, nullptr};
43
+ }
44
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
45
+ const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
46
+ if (has_theta) {
47
+ if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
48
+ theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
49
+ }
50
+ if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
51
+ if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
52
+ return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
53
+ }
54
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
55
+ const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
56
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
57
+ const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
58
+ if (size < expected_size_bytes) {
59
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
60
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
61
+ }
62
+ const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
63
+ return {false, is_ordered, seed_hash, num_entries, theta, entries};
39
64
  }
40
- const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
41
- if (has_theta) {
42
- if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
43
- theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
65
+ case 1: {
66
+ uint16_t seed_hash = compute_seed_hash(seed);
67
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
68
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
69
+ uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
70
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
71
+ if (is_empty) {
72
+ return {true, true, seed_hash, 0, theta, nullptr};
73
+ }
74
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
75
+ const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
76
+ if (size < expected_size_bytes) {
77
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
78
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
79
+ }
80
+ return {false, true, seed_hash, num_entries, theta, entries};
44
81
  }
45
- if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
46
- return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
82
+ case 2: {
83
+ uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
84
+ checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
85
+ const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
86
+ checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
87
+ if (preamble_size == 1) {
88
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
89
+ } else if (preamble_size == 2) {
90
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
91
+ if (num_entries == 0) {
92
+ return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
93
+ } else {
94
+ const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
95
+ if (size < expected_size_bytes) {
96
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
97
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
98
+ }
99
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
100
+ return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries};
101
+ }
102
+ } else if (preamble_size == 3) {
103
+ const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
104
+ uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
105
+ bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
106
+ if (is_empty) {
107
+ return {true, true, seed_hash, 0, theta, nullptr};
108
+ }
109
+ const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
110
+ const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
111
+ if (size < expected_size_bytes) {
112
+ throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
113
+ + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
114
+ }
115
+ return {false, true, seed_hash, num_entries, theta, entries};
116
+ } else {
117
+ throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
118
+ }
47
119
  }
48
- const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
49
- const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
50
- const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
51
- const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
52
- if (size < expected_size_bytes) {
53
- throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
54
- + (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
120
+ default:
121
+ // this should always fail since the valid cases are handled above
122
+ checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
123
+ // this throw is never reached, because check_serial_version will throw an informative exception.
124
+ // This is only here to avoid a compiler warning about a path without a return value.
125
+ throw std::invalid_argument("unexpected sketch serialization version");
55
126
  }
56
- const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
57
- return {false, is_ordered, seed_hash, num_entries, theta, entries};
58
127
  }
59
128
 
60
129
  template<bool dummy>
@@ -21,14 +21,19 @@
21
21
  #define THETA_CONSTANTS_HPP_
22
22
 
23
23
  #include <climits>
24
+ #include "common_defs.hpp"
24
25
 
25
26
  namespace datasketches {
26
27
 
27
28
  namespace theta_constants {
28
- enum resize_factor { X1, X2, X4, X8 };
29
- static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
30
- static const uint8_t MIN_LG_K = 5;
31
- static const uint8_t MAX_LG_K = 26;
29
+ using resize_factor = datasketches::resize_factor;
30
+ //enum resize_factor { X1, X2, X4, X8 };
31
+ const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
32
+ const uint8_t MIN_LG_K = 5;
33
+ const uint8_t MAX_LG_K = 26;
34
+
35
+ const uint8_t DEFAULT_LG_K = 12;
36
+ const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
32
37
  }
33
38
 
34
39
  } /* namespace datasketches */
@@ -49,6 +49,21 @@ public:
49
49
  }
50
50
  };
51
51
 
52
+ template<bool dummy>
53
+ class theta_build_helper{
54
+ public:
55
+ // consistent way of initializing theta from p
56
+ // avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
57
+ static uint64_t starting_theta_from_p(float p) {
58
+ if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
59
+ return theta_constants::MAX_THETA;
60
+ }
61
+
62
+ static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
63
+ return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
64
+ }
65
+ };
66
+
52
67
  } /* namespace datasketches */
53
68
 
54
69
  #endif
@@ -20,6 +20,7 @@
20
20
  #include <iostream>
21
21
  #include <sstream>
22
22
  #include <algorithm>
23
+ #include <stdexcept>
23
24
 
24
25
  #include "conditional_forward.hpp"
25
26
 
@@ -29,7 +30,7 @@ template<typename EN, typename EK, typename P, typename S, typename CS, typename
29
30
  theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
30
31
  policy_(policy),
31
32
  is_valid_(false),
32
- table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
33
+ table_(0, 0, resize_factor::X1, 1, theta_constants::MAX_THETA, seed, allocator, false)
33
34
  {}
34
35
 
35
36
  template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
@@ -38,17 +39,17 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
38
39
  if (table_.is_empty_) return;
39
40
  if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
40
41
  table_.is_empty_ |= sketch.is_empty();
41
- table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
42
+ table_.theta_ = table_.is_empty_ ? theta_constants::MAX_THETA : std::min(table_.theta_, sketch.get_theta64());
42
43
  if (is_valid_ && table_.num_entries_ == 0) return;
43
44
  if (sketch.get_num_retained() == 0) {
44
45
  is_valid_ = true;
45
- table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
46
+ table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
46
47
  return;
47
48
  }
48
49
  if (!is_valid_) { // first update, copy or move incoming sketch
49
50
  is_valid_ = true;
50
51
  const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
51
- table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
52
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
52
53
  for (auto& entry: sketch) {
53
54
  auto result = table_.find(EK()(entry));
54
55
  if (result.second) {
@@ -83,11 +84,11 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
83
84
  throw std::invalid_argument(" fewer keys than expected, possibly corrupted input sketch");
84
85
  }
85
86
  if (match_count == 0) {
86
- table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
87
+ table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
87
88
  if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
88
89
  } else {
89
90
  const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
90
- table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
91
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
91
92
  for (uint32_t i = 0; i < match_count; i++) {
92
93
  auto result = table_.find(EK()(matched_entries[i]));
93
94
  table_.insert(result.first, std::move(matched_entries[i]));
@@ -21,6 +21,7 @@
21
21
  #define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
22
22
 
23
23
  #include <algorithm>
24
+ #include <stdexcept>
24
25
 
25
26
  #include "conditional_back_inserter.hpp"
26
27
  #include "conditional_forward.hpp"
@@ -36,7 +37,7 @@ seed_hash_(compute_seed_hash(seed))
36
37
  template<typename EN, typename EK, typename CS, typename A>
37
38
  template<typename FwdSketch, typename Sketch>
38
39
  CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
39
- if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return CS(a, ordered);
40
+ if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered);
40
41
  if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
41
42
  if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
42
43
 
@@ -53,7 +54,7 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
53
54
  conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
54
55
  } else { // hash-based
55
56
  const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
56
- hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
57
+ hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here
57
58
  for (const auto& entry: b) {
58
59
  const uint64_t hash = EK()(entry);
59
60
  if (hash < theta) {
@@ -25,14 +25,10 @@
25
25
  namespace datasketches {
26
26
 
27
27
  template<typename Allocator = std::allocator<uint64_t>>
28
- class theta_sketch_alloc {
28
+ class base_theta_sketch_alloc {
29
29
  public:
30
- using Entry = uint64_t;
31
- using ExtractKey = trivial_extract_key;
32
- using iterator = theta_iterator<Entry, ExtractKey>;
33
- using const_iterator = theta_const_iterator<Entry, ExtractKey>;
34
30
 
35
- virtual ~theta_sketch_alloc() = default;
31
+ virtual ~base_theta_sketch_alloc() = default;
36
32
 
37
33
  /**
38
34
  * @return allocator
@@ -104,6 +100,21 @@ public:
104
100
  */
105
101
  virtual string<Allocator> to_string(bool print_items = false) const;
106
102
 
103
+ protected:
104
+ virtual void print_specifics(std::ostringstream& os) const = 0;
105
+ virtual void print_items(std::ostringstream& os) const = 0;
106
+ };
107
+
108
+ template<typename Allocator = std::allocator<uint64_t>>
109
+ class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
110
+ public:
111
+ using Entry = uint64_t;
112
+ using ExtractKey = trivial_extract_key;
113
+ using iterator = theta_iterator<Entry, ExtractKey>;
114
+ using const_iterator = theta_const_iterator<Entry, ExtractKey>;
115
+
116
+ virtual ~theta_sketch_alloc() = default;
117
+
107
118
  /**
108
119
  * Iterator over hash values in this sketch.
109
120
  * @return begin iterator
@@ -131,8 +142,7 @@ public:
131
142
  virtual const_iterator end() const = 0;
132
143
 
133
144
  protected:
134
- using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
135
- virtual void print_specifics(ostrstream& os) const = 0;
145
+ virtual void print_items(std::ostringstream& os) const;
136
146
  };
137
147
 
138
148
  // forward declaration
@@ -269,6 +279,11 @@ public:
269
279
  */
270
280
  void trim();
271
281
 
282
+ /**
283
+ * Reset the sketch to the initial empty state
284
+ */
285
+ void reset();
286
+
272
287
  /**
273
288
  * Converts this sketch to a compact sketch (ordered or unordered).
274
289
  * @param ordered optional flag to specify if ordered sketch should be produced
@@ -285,11 +300,10 @@ private:
285
300
  theta_table table_;
286
301
 
287
302
  // for builder
288
- update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
289
- uint64_t seed, const Allocator& allocator);
303
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
304
+ uint64_t theta, uint64_t seed, const Allocator& allocator);
290
305
 
291
- using ostrstream = typename Base::ostrstream;
292
- virtual void print_specifics(ostrstream& os) const;
306
+ virtual void print_specifics(std::ostringstream& os) const;
293
307
  };
294
308
 
295
309
  // compact sketch
@@ -377,8 +391,7 @@ private:
377
391
  uint64_t theta_;
378
392
  std::vector<uint64_t, Allocator> entries_;
379
393
 
380
- using ostrstream = typename Base::ostrstream;
381
- virtual void print_specifics(ostrstream& os) const;
394
+ virtual void print_specifics(std::ostringstream& os) const;
382
395
  };
383
396
 
384
397
  template<typename Allocator>
@@ -392,7 +405,7 @@ public:
392
405
  // It does not take the ownership of the buffer.
393
406
 
394
407
  template<typename Allocator = std::allocator<uint64_t>>
395
- class wrapped_compact_theta_sketch_alloc {
408
+ class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
396
409
  public:
397
410
  using const_iterator = const uint64_t*;
398
411
 
@@ -415,6 +428,10 @@ public:
415
428
  */
416
429
  static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
417
430
 
431
+ protected:
432
+ virtual void print_specifics(std::ostringstream& os) const;
433
+ virtual void print_items(std::ostringstream& os) const;
434
+
418
435
  private:
419
436
  bool is_empty_;
420
437
  bool is_ordered_;