datasketches 0.2.2 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/LICENSE +40 -3
- data/NOTICE +1 -1
- data/README.md +8 -8
- data/ext/datasketches/kll_wrapper.cpp +5 -1
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +27 -5
- data/vendor/datasketches-cpp/LICENSE +40 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +18 -13
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +1 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +16 -0
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov.hpp +5 -3
- data/vendor/datasketches-cpp/{kll → common}/include/kolmogorov_smirnov_impl.hpp +13 -16
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view.hpp +121 -0
- data/vendor/datasketches-cpp/common/include/quantile_sketch_sorted_view_impl.hpp +91 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +2 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +10 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +1 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +1 -1
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +10 -6
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +2 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +37 -5
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +30 -12
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +2 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +1 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +2 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +2 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +1 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +59 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +2 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +1 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +5 -19
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +0 -4
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +3 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +103 -44
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +110 -130
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +156 -23
- data/vendor/datasketches-cpp/kll/test/kolmogorov_smirnov_test.cpp +1 -1
- data/vendor/datasketches-cpp/pyproject.toml +4 -2
- data/vendor/datasketches-cpp/python/CMakeLists.txt +17 -6
- data/vendor/datasketches-cpp/python/README.md +57 -50
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +3 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +1 -1
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +4 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +6 -1
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +49 -14
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +68 -0
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +240 -0
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +9 -2
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +2 -2
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +12 -5
- data/vendor/datasketches-cpp/python/tests/kll_test.py +12 -6
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +126 -0
- data/vendor/datasketches-cpp/python/tests/req_test.py +2 -2
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +4 -4
- data/vendor/datasketches-cpp/quantiles/CMakeLists.txt +42 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +641 -0
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +1309 -0
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n1000_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.3.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.6.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.0.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/Qk128_n50_v0.8.3.sk +0 -0
- data/vendor/datasketches-cpp/quantiles/test/kolmogorov_smirnov_test.cpp +110 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_compatibility_test.cpp +129 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +912 -0
- data/vendor/datasketches-cpp/req/CMakeLists.txt +6 -21
- data/vendor/datasketches-cpp/req/include/req_common.hpp +0 -5
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +3 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +62 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +66 -61
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +5 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +54 -12
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +45 -34
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +41 -6
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +33 -15
- data/vendor/datasketches-cpp/sampling/test/var_opt_allocation_test.cpp +2 -2
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +1 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +1 -0
- data/vendor/datasketches-cpp/setup.py +10 -7
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +92 -23
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +9 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +3 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +150 -93
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +9 -2
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +9 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +39 -10
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +2 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +446 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +429 -1
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +25 -11
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +34 -14
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +1 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +8 -0
- metadata +34 -12
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +0 -75
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +0 -184
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator.hpp +0 -69
- data/vendor/datasketches-cpp/req/include/req_quantile_calculator_impl.hpp +0 -60
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
|
@@ -32,53 +32,34 @@ target_include_directories(theta
|
|
|
32
32
|
target_link_libraries(theta INTERFACE common)
|
|
33
33
|
target_compile_features(theta INTERFACE cxx_std_11)
|
|
34
34
|
|
|
35
|
-
set(theta_HEADERS "")
|
|
36
|
-
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
|
|
37
|
-
list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
|
|
38
|
-
list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
|
|
39
|
-
list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
|
|
40
|
-
list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
|
|
41
|
-
list(APPEND theta_HEADERS "include/theta_comparators.hpp")
|
|
42
|
-
list(APPEND theta_HEADERS "include/theta_constants.hpp")
|
|
43
|
-
list(APPEND theta_HEADERS "include/theta_helpers.hpp")
|
|
44
|
-
list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
|
|
45
|
-
list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
|
|
46
|
-
list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
|
|
47
|
-
list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
|
|
48
|
-
list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
|
|
49
|
-
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
|
|
50
|
-
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
|
|
51
|
-
|
|
52
35
|
install(TARGETS theta
|
|
53
36
|
EXPORT ${PROJECT_NAME}
|
|
54
37
|
)
|
|
55
38
|
|
|
56
|
-
install(FILES
|
|
39
|
+
install(FILES
|
|
40
|
+
include/theta_sketch.hpp
|
|
41
|
+
include/theta_sketch_impl.hpp
|
|
42
|
+
include/theta_union.hpp
|
|
43
|
+
include/theta_union_impl.hpp
|
|
44
|
+
include/theta_intersection.hpp
|
|
45
|
+
include/theta_intersection_impl.hpp
|
|
46
|
+
include/theta_a_not_b.hpp
|
|
47
|
+
include/theta_a_not_b_impl.hpp
|
|
48
|
+
include/theta_jaccard_similarity.hpp
|
|
49
|
+
include/theta_comparators.hpp
|
|
50
|
+
include/theta_constants.hpp
|
|
51
|
+
include/theta_helpers.hpp
|
|
52
|
+
include/theta_update_sketch_base.hpp
|
|
53
|
+
include/theta_update_sketch_base_impl.hpp
|
|
54
|
+
include/theta_union_base.hpp
|
|
55
|
+
include/theta_union_base_impl.hpp
|
|
56
|
+
include/theta_intersection_base.hpp
|
|
57
|
+
include/theta_intersection_base_impl.hpp
|
|
58
|
+
include/theta_set_difference_base.hpp
|
|
59
|
+
include/theta_set_difference_base_impl.hpp
|
|
60
|
+
include/theta_jaccard_similarity_base.hpp
|
|
61
|
+
include/bounds_on_ratios_in_sampled_sets.hpp
|
|
62
|
+
include/bounds_on_ratios_in_theta_sketched_sets.hpp
|
|
63
|
+
include/compact_theta_sketch_parser.hpp
|
|
64
|
+
include/compact_theta_sketch_parser_impl.hpp
|
|
57
65
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
|
58
|
-
|
|
59
|
-
target_sources(theta
|
|
60
|
-
INTERFACE
|
|
61
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch.hpp
|
|
62
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union.hpp
|
|
63
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection.hpp
|
|
64
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b.hpp
|
|
65
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_impl.hpp
|
|
66
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
|
|
67
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
|
|
68
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
|
|
69
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
|
|
70
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
|
|
71
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
|
|
72
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
|
|
73
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
|
|
74
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
|
|
75
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
|
|
76
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
|
|
77
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
|
|
78
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
|
|
79
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
|
|
80
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
|
|
81
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
|
|
82
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
|
|
83
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
|
|
84
|
-
)
|
|
@@ -22,39 +22,108 @@
|
|
|
22
22
|
|
|
23
23
|
#include <iostream>
|
|
24
24
|
#include <iomanip>
|
|
25
|
+
#include <stdexcept>
|
|
25
26
|
|
|
26
27
|
namespace datasketches {
|
|
27
28
|
|
|
28
29
|
template<bool dummy>
|
|
29
30
|
auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
|
|
30
|
-
if (size < 8) throw std::
|
|
31
|
+
if (size < 8) throw std::out_of_range("at least 8 bytes expected, actual " + std::to_string(size)
|
|
31
32
|
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
33
|
+
|
|
34
|
+
uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
|
|
35
|
+
|
|
36
|
+
switch(serial_version) {
|
|
37
|
+
case COMPACT_SKETCH_SERIAL_VERSION: {
|
|
38
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
|
39
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
|
40
|
+
const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
|
|
41
|
+
if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
|
|
42
|
+
return {true, true, seed_hash, 0, theta, nullptr};
|
|
43
|
+
}
|
|
44
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
45
|
+
const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
|
|
46
|
+
if (has_theta) {
|
|
47
|
+
if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
|
|
48
|
+
theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
49
|
+
}
|
|
50
|
+
if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
|
|
51
|
+
if (size < 16) throw std::out_of_range("at least 16 bytes expected, actual " + std::to_string(size));
|
|
52
|
+
return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
|
|
53
|
+
}
|
|
54
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
55
|
+
const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
|
|
56
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
|
|
57
|
+
const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
|
|
58
|
+
if (size < expected_size_bytes) {
|
|
59
|
+
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
60
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
61
|
+
}
|
|
62
|
+
const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
|
|
63
|
+
return {false, is_ordered, seed_hash, num_entries, theta, entries};
|
|
39
64
|
}
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
65
|
+
case 1: {
|
|
66
|
+
uint16_t seed_hash = compute_seed_hash(seed);
|
|
67
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
|
68
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
69
|
+
uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
70
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
71
|
+
if (is_empty) {
|
|
72
|
+
return {true, true, seed_hash, 0, theta, nullptr};
|
|
73
|
+
}
|
|
74
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
|
75
|
+
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
|
76
|
+
if (size < expected_size_bytes) {
|
|
77
|
+
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
78
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
79
|
+
}
|
|
80
|
+
return {false, true, seed_hash, num_entries, theta, entries};
|
|
44
81
|
}
|
|
45
|
-
|
|
46
|
-
|
|
82
|
+
case 2: {
|
|
83
|
+
uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
|
|
84
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
|
85
|
+
const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
|
|
86
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
|
87
|
+
if (preamble_size == 1) {
|
|
88
|
+
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
|
|
89
|
+
} else if (preamble_size == 2) {
|
|
90
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
91
|
+
if (num_entries == 0) {
|
|
92
|
+
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
|
|
93
|
+
} else {
|
|
94
|
+
const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
|
|
95
|
+
if (size < expected_size_bytes) {
|
|
96
|
+
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
97
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
98
|
+
}
|
|
99
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
|
|
100
|
+
return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries};
|
|
101
|
+
}
|
|
102
|
+
} else if (preamble_size == 3) {
|
|
103
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
|
104
|
+
uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
|
105
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
|
106
|
+
if (is_empty) {
|
|
107
|
+
return {true, true, seed_hash, 0, theta, nullptr};
|
|
108
|
+
}
|
|
109
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
|
110
|
+
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
|
111
|
+
if (size < expected_size_bytes) {
|
|
112
|
+
throw std::out_of_range(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
|
113
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
114
|
+
}
|
|
115
|
+
return {false, true, seed_hash, num_entries, theta, entries};
|
|
116
|
+
} else {
|
|
117
|
+
throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
|
|
118
|
+
}
|
|
47
119
|
}
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
|
120
|
+
default:
|
|
121
|
+
// this should always fail since the valid cases are handled above
|
|
122
|
+
checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
|
|
123
|
+
// this throw is never reached, because check_serial_version will throw an informative exception.
|
|
124
|
+
// This is only here to avoid a compiler warning about a path without a return value.
|
|
125
|
+
throw std::invalid_argument("unexpected sketch serialization version");
|
|
55
126
|
}
|
|
56
|
-
const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
|
|
57
|
-
return {false, is_ordered, seed_hash, num_entries, theta, entries};
|
|
58
127
|
}
|
|
59
128
|
|
|
60
129
|
template<bool dummy>
|
|
@@ -21,14 +21,19 @@
|
|
|
21
21
|
#define THETA_CONSTANTS_HPP_
|
|
22
22
|
|
|
23
23
|
#include <climits>
|
|
24
|
+
#include "common_defs.hpp"
|
|
24
25
|
|
|
25
26
|
namespace datasketches {
|
|
26
27
|
|
|
27
28
|
namespace theta_constants {
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
29
|
+
using resize_factor = datasketches::resize_factor;
|
|
30
|
+
//enum resize_factor { X1, X2, X4, X8 };
|
|
31
|
+
const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
|
|
32
|
+
const uint8_t MIN_LG_K = 5;
|
|
33
|
+
const uint8_t MAX_LG_K = 26;
|
|
34
|
+
|
|
35
|
+
const uint8_t DEFAULT_LG_K = 12;
|
|
36
|
+
const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
|
|
32
37
|
}
|
|
33
38
|
|
|
34
39
|
} /* namespace datasketches */
|
|
@@ -49,6 +49,21 @@ public:
|
|
|
49
49
|
}
|
|
50
50
|
};
|
|
51
51
|
|
|
52
|
+
template<bool dummy>
|
|
53
|
+
class theta_build_helper{
|
|
54
|
+
public:
|
|
55
|
+
// consistent way of initializing theta from p
|
|
56
|
+
// avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
|
|
57
|
+
static uint64_t starting_theta_from_p(float p) {
|
|
58
|
+
if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
|
|
59
|
+
return theta_constants::MAX_THETA;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
|
63
|
+
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
|
64
|
+
}
|
|
65
|
+
};
|
|
66
|
+
|
|
52
67
|
} /* namespace datasketches */
|
|
53
68
|
|
|
54
69
|
#endif
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
#include <iostream>
|
|
21
21
|
#include <sstream>
|
|
22
22
|
#include <algorithm>
|
|
23
|
+
#include <stdexcept>
|
|
23
24
|
|
|
24
25
|
#include "conditional_forward.hpp"
|
|
25
26
|
|
|
@@ -29,7 +30,7 @@ template<typename EN, typename EK, typename P, typename S, typename CS, typename
|
|
|
29
30
|
theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
|
|
30
31
|
policy_(policy),
|
|
31
32
|
is_valid_(false),
|
|
32
|
-
table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
|
|
33
|
+
table_(0, 0, resize_factor::X1, 1, theta_constants::MAX_THETA, seed, allocator, false)
|
|
33
34
|
{}
|
|
34
35
|
|
|
35
36
|
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
|
@@ -38,17 +39,17 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
38
39
|
if (table_.is_empty_) return;
|
|
39
40
|
if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
|
|
40
41
|
table_.is_empty_ |= sketch.is_empty();
|
|
41
|
-
table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
|
|
42
|
+
table_.theta_ = table_.is_empty_ ? theta_constants::MAX_THETA : std::min(table_.theta_, sketch.get_theta64());
|
|
42
43
|
if (is_valid_ && table_.num_entries_ == 0) return;
|
|
43
44
|
if (sketch.get_num_retained() == 0) {
|
|
44
45
|
is_valid_ = true;
|
|
45
|
-
table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
46
|
+
table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
46
47
|
return;
|
|
47
48
|
}
|
|
48
49
|
if (!is_valid_) { // first update, copy or move incoming sketch
|
|
49
50
|
is_valid_ = true;
|
|
50
51
|
const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
|
51
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
52
|
+
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
52
53
|
for (auto& entry: sketch) {
|
|
53
54
|
auto result = table_.find(EK()(entry));
|
|
54
55
|
if (result.second) {
|
|
@@ -83,11 +84,11 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
|
83
84
|
throw std::invalid_argument(" fewer keys than expected, possibly corrupted input sketch");
|
|
84
85
|
}
|
|
85
86
|
if (match_count == 0) {
|
|
86
|
-
table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
87
|
+
table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
87
88
|
if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
|
|
88
89
|
} else {
|
|
89
90
|
const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
|
90
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
91
|
+
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
|
91
92
|
for (uint32_t i = 0; i < match_count; i++) {
|
|
92
93
|
auto result = table_.find(EK()(matched_entries[i]));
|
|
93
94
|
table_.insert(result.first, std::move(matched_entries[i]));
|
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
#define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
|
|
22
22
|
|
|
23
23
|
#include <algorithm>
|
|
24
|
+
#include <stdexcept>
|
|
24
25
|
|
|
25
26
|
#include "conditional_back_inserter.hpp"
|
|
26
27
|
#include "conditional_forward.hpp"
|
|
@@ -36,7 +37,7 @@ seed_hash_(compute_seed_hash(seed))
|
|
|
36
37
|
template<typename EN, typename EK, typename CS, typename A>
|
|
37
38
|
template<typename FwdSketch, typename Sketch>
|
|
38
39
|
CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
|
|
39
|
-
if (a.is_empty() || a.get_num_retained()
|
|
40
|
+
if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered);
|
|
40
41
|
if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
|
|
41
42
|
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
|
|
42
43
|
|
|
@@ -53,7 +54,7 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
|
|
|
53
54
|
conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
|
|
54
55
|
} else { // hash-based
|
|
55
56
|
const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
|
|
56
|
-
hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
|
|
57
|
+
hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here
|
|
57
58
|
for (const auto& entry: b) {
|
|
58
59
|
const uint64_t hash = EK()(entry);
|
|
59
60
|
if (hash < theta) {
|
|
@@ -25,14 +25,10 @@
|
|
|
25
25
|
namespace datasketches {
|
|
26
26
|
|
|
27
27
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
28
|
-
class
|
|
28
|
+
class base_theta_sketch_alloc {
|
|
29
29
|
public:
|
|
30
|
-
using Entry = uint64_t;
|
|
31
|
-
using ExtractKey = trivial_extract_key;
|
|
32
|
-
using iterator = theta_iterator<Entry, ExtractKey>;
|
|
33
|
-
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
|
34
30
|
|
|
35
|
-
virtual ~
|
|
31
|
+
virtual ~base_theta_sketch_alloc() = default;
|
|
36
32
|
|
|
37
33
|
/**
|
|
38
34
|
* @return allocator
|
|
@@ -104,6 +100,21 @@ public:
|
|
|
104
100
|
*/
|
|
105
101
|
virtual string<Allocator> to_string(bool print_items = false) const;
|
|
106
102
|
|
|
103
|
+
protected:
|
|
104
|
+
virtual void print_specifics(std::ostringstream& os) const = 0;
|
|
105
|
+
virtual void print_items(std::ostringstream& os) const = 0;
|
|
106
|
+
};
|
|
107
|
+
|
|
108
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
|
109
|
+
class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
|
|
110
|
+
public:
|
|
111
|
+
using Entry = uint64_t;
|
|
112
|
+
using ExtractKey = trivial_extract_key;
|
|
113
|
+
using iterator = theta_iterator<Entry, ExtractKey>;
|
|
114
|
+
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
|
115
|
+
|
|
116
|
+
virtual ~theta_sketch_alloc() = default;
|
|
117
|
+
|
|
107
118
|
/**
|
|
108
119
|
* Iterator over hash values in this sketch.
|
|
109
120
|
* @return begin iterator
|
|
@@ -131,8 +142,7 @@ public:
|
|
|
131
142
|
virtual const_iterator end() const = 0;
|
|
132
143
|
|
|
133
144
|
protected:
|
|
134
|
-
|
|
135
|
-
virtual void print_specifics(ostrstream& os) const = 0;
|
|
145
|
+
virtual void print_items(std::ostringstream& os) const;
|
|
136
146
|
};
|
|
137
147
|
|
|
138
148
|
// forward declaration
|
|
@@ -269,6 +279,11 @@ public:
|
|
|
269
279
|
*/
|
|
270
280
|
void trim();
|
|
271
281
|
|
|
282
|
+
/**
|
|
283
|
+
* Reset the sketch to the initial empty state
|
|
284
|
+
*/
|
|
285
|
+
void reset();
|
|
286
|
+
|
|
272
287
|
/**
|
|
273
288
|
* Converts this sketch to a compact sketch (ordered or unordered).
|
|
274
289
|
* @param ordered optional flag to specify if ordered sketch should be produced
|
|
@@ -285,11 +300,10 @@ private:
|
|
|
285
300
|
theta_table table_;
|
|
286
301
|
|
|
287
302
|
// for builder
|
|
288
|
-
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
|
289
|
-
uint64_t seed, const Allocator& allocator);
|
|
303
|
+
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
|
|
304
|
+
uint64_t theta, uint64_t seed, const Allocator& allocator);
|
|
290
305
|
|
|
291
|
-
|
|
292
|
-
virtual void print_specifics(ostrstream& os) const;
|
|
306
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
|
293
307
|
};
|
|
294
308
|
|
|
295
309
|
// compact sketch
|
|
@@ -377,8 +391,7 @@ private:
|
|
|
377
391
|
uint64_t theta_;
|
|
378
392
|
std::vector<uint64_t, Allocator> entries_;
|
|
379
393
|
|
|
380
|
-
|
|
381
|
-
virtual void print_specifics(ostrstream& os) const;
|
|
394
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
|
382
395
|
};
|
|
383
396
|
|
|
384
397
|
template<typename Allocator>
|
|
@@ -392,7 +405,7 @@ public:
|
|
|
392
405
|
// It does not take the ownership of the buffer.
|
|
393
406
|
|
|
394
407
|
template<typename Allocator = std::allocator<uint64_t>>
|
|
395
|
-
class wrapped_compact_theta_sketch_alloc {
|
|
408
|
+
class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
|
|
396
409
|
public:
|
|
397
410
|
using const_iterator = const uint64_t*;
|
|
398
411
|
|
|
@@ -415,6 +428,10 @@ public:
|
|
|
415
428
|
*/
|
|
416
429
|
static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
|
|
417
430
|
|
|
431
|
+
protected:
|
|
432
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
|
433
|
+
virtual void print_items(std::ostringstream& os) const;
|
|
434
|
+
|
|
418
435
|
private:
|
|
419
436
|
bool is_empty_;
|
|
420
437
|
bool is_ordered_;
|