datasketches 0.2.3 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +7 -7
- data/ext/datasketches/theta_wrapper.cpp +20 -4
- data/lib/datasketches/version.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +22 -3
- data/vendor/datasketches-cpp/MANIFEST.in +3 -0
- data/vendor/datasketches-cpp/README.md +76 -9
- data/vendor/datasketches-cpp/cmake/DataSketchesConfig.cmake.in +10 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +14 -13
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +15 -35
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +5 -3
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -6
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_allocation_test.cpp +17 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +5 -15
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +4 -2
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +33 -56
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +6 -4
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +9 -19
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +4 -2
- data/vendor/datasketches-cpp/req/CMakeLists.txt +8 -21
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +4 -2
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +5 -9
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +13 -7
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +8 -6
- data/vendor/datasketches-cpp/setup.py +1 -1
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +26 -45
- data/vendor/datasketches-cpp/theta/include/compact_theta_sketch_parser_impl.hpp +89 -22
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +15 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +6 -6
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +2 -2
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +32 -15
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +146 -51
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +6 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base.hpp +3 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +8 -2
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +8 -5
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -4
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +33 -9
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v1.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java_v2.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_setop_test.cpp +445 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +400 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +23 -11
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +18 -33
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +1 -1
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +13 -9
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +33 -14
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +6 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +8 -3
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +16 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +46 -8
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +7 -0
- metadata +11 -6
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
@@ -32,27 +32,17 @@ target_include_directories(kll
|
|
32
32
|
target_link_libraries(kll INTERFACE common)
|
33
33
|
target_compile_features(kll INTERFACE cxx_std_11)
|
34
34
|
|
35
|
-
set(kll_HEADERS "")
|
36
|
-
list(APPEND kll_HEADERS "include/kll_sketch.hpp")
|
37
|
-
list(APPEND kll_HEADERS "include/kll_sketch_impl.hpp")
|
38
|
-
list(APPEND kll_HEADERS "include/kll_helper.hpp")
|
39
|
-
list(APPEND kll_HEADERS "include/kll_helper_impl.hpp")
|
40
|
-
list(APPEND kll_HEADERS "include/kll_quantile_calculator.hpp")
|
41
|
-
list(APPEND kll_HEADERS "include/kll_quantile_calculator_impl.hpp")
|
42
|
-
|
43
35
|
install(TARGETS kll
|
44
36
|
EXPORT ${PROJECT_NAME}
|
45
37
|
)
|
46
38
|
|
47
|
-
install(FILES
|
39
|
+
install(FILES
|
40
|
+
include/kll_sketch.hpp
|
41
|
+
include/kll_sketch_impl.hpp
|
42
|
+
include/kll_helper.hpp
|
43
|
+
include/kll_helper_impl.hpp
|
44
|
+
include/kll_quantile_calculator.hpp
|
45
|
+
include/kll_quantile_calculator_impl.hpp
|
46
|
+
include/kolmogorov_smirnov.hpp
|
47
|
+
include/kolmogorov_smirnov_impl.hpp
|
48
48
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
49
|
-
|
50
|
-
target_sources(kll
|
51
|
-
INTERFACE
|
52
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper.hpp
|
53
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper_impl.hpp
|
54
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch.hpp
|
55
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch_impl.hpp
|
56
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator.hpp
|
57
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator_impl.hpp
|
58
|
-
)
|
@@ -1023,7 +1023,9 @@ void kll_sketch<T, C, S, A>::check_family_id(uint8_t family_id) {
|
|
1023
1023
|
|
1024
1024
|
template <typename T, typename C, typename S, typename A>
|
1025
1025
|
string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
|
1026
|
-
|
1026
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
1027
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
1028
|
+
std::ostringstream os;
|
1027
1029
|
os << "### KLL sketch summary:" << std::endl;
|
1028
1030
|
os << " K : " << k_ << std::endl;
|
1029
1031
|
os << " min K : " << min_k_ << std::endl;
|
@@ -1069,7 +1071,7 @@ string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
|
|
1069
1071
|
}
|
1070
1072
|
os << "### End sketch data" << std::endl;
|
1071
1073
|
}
|
1072
|
-
return os.str();
|
1074
|
+
return string<A>(os.str().c_str(), allocator_);
|
1073
1075
|
}
|
1074
1076
|
|
1075
1077
|
template <typename T, typename C, typename S, typename A>
|
@@ -32,29 +32,16 @@ target_include_directories(req
|
|
32
32
|
target_link_libraries(req INTERFACE common)
|
33
33
|
target_compile_features(req INTERFACE cxx_std_11)
|
34
34
|
|
35
|
-
set(req_HEADERS "")
|
36
|
-
list(APPEND req_HEADERS "include/req_common.hpp")
|
37
|
-
list(APPEND req_HEADERS "include/req_sketch.hpp")
|
38
|
-
list(APPEND req_HEADERS "include/req_sketch_impl.hpp")
|
39
|
-
list(APPEND req_HEADERS "include/req_compactor.hpp")
|
40
|
-
list(APPEND req_HEADERS "include/req_compactor_impl.hpp")
|
41
|
-
list(APPEND req_HEADERS "include/req_quantile_calculator.hpp")
|
42
|
-
list(APPEND req_HEADERS "include/req_quantile_calculator_impl.hpp")
|
43
|
-
|
44
35
|
install(TARGETS req
|
45
36
|
EXPORT ${PROJECT_NAME}
|
46
37
|
)
|
47
38
|
|
48
|
-
install(FILES
|
39
|
+
install(FILES
|
40
|
+
include/req_common.hpp
|
41
|
+
include/req_sketch.hpp
|
42
|
+
include/req_sketch_impl.hpp
|
43
|
+
include/req_compactor.hpp
|
44
|
+
include/req_compactor_impl.hpp
|
45
|
+
include/req_quantile_calculator.hpp
|
46
|
+
include/req_quantile_calculator_impl.hpp
|
49
47
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
50
|
-
|
51
|
-
target_sources(req
|
52
|
-
INTERFACE
|
53
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/req_common.hpp
|
54
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/req_sketch.hpp
|
55
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/req_sketch_impl.hpp
|
56
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/req_compactor.hpp
|
57
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/req_compactor_impl.hpp
|
58
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/req_quantile_calculator.hpp
|
59
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/req_quantile_calculator_impl.hpp
|
60
|
-
)
|
@@ -653,7 +653,9 @@ void req_sketch<T, C, S, A>::compress() {
|
|
653
653
|
|
654
654
|
template<typename T, typename C, typename S, typename A>
|
655
655
|
string<A> req_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
|
656
|
-
|
656
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
657
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
658
|
+
std::ostringstream os;
|
657
659
|
os << "### REQ sketch summary:" << std::endl;
|
658
660
|
os << " K : " << k_ << std::endl;
|
659
661
|
os << " High Rank Acc : " << (hra_ ? "true" : "false") << std::endl;
|
@@ -693,7 +695,7 @@ string<A> req_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items)
|
|
693
695
|
}
|
694
696
|
os << "### End sketch data" << std::endl;
|
695
697
|
}
|
696
|
-
return os.str();
|
698
|
+
return string<A>(os.str().c_str(), allocator_);
|
697
699
|
}
|
698
700
|
|
699
701
|
template<typename T, typename C, typename S, typename A>
|
@@ -32,17 +32,13 @@ target_include_directories(sampling
|
|
32
32
|
target_link_libraries(sampling INTERFACE common)
|
33
33
|
target_compile_features(sampling INTERFACE cxx_std_11)
|
34
34
|
|
35
|
-
set(sampling_HEADERS "include/var_opt_sketch.hpp;include/var_opt_sketch_impl.hpp")
|
36
|
-
|
37
35
|
install(TARGETS sampling
|
38
36
|
EXPORT ${PROJECT_NAME}
|
39
37
|
)
|
40
38
|
|
41
|
-
install(FILES
|
39
|
+
install(FILES
|
40
|
+
include/var_opt_sketch.hpp
|
41
|
+
include/var_opt_sketch_impl.hpp
|
42
|
+
include/var_opt_union.hpp
|
43
|
+
include/var_opt_union_impl.hpp
|
42
44
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
43
|
-
|
44
|
-
target_sources(sampling
|
45
|
-
INTERFACE
|
46
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch.hpp
|
47
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/var_opt_sketch_impl.hpp
|
48
|
-
)
|
@@ -731,8 +731,10 @@ void var_opt_sketch<T,S,A>::update(T&& item, double weight) {
|
|
731
731
|
|
732
732
|
template<typename T, typename S, typename A>
|
733
733
|
string<A> var_opt_sketch<T,S,A>::to_string() const {
|
734
|
-
|
735
|
-
|
734
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
735
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
736
|
+
std::ostringstream os;
|
737
|
+
os << "### VarOpt SUMMARY:" << std::endl;
|
736
738
|
os << " k : " << k_ << std::endl;
|
737
739
|
os << " h : " << h_ << std::endl;
|
738
740
|
os << " r : " << r_ << std::endl;
|
@@ -740,24 +742,28 @@ string<A> var_opt_sketch<T,S,A>::to_string() const {
|
|
740
742
|
os << " Current size : " << curr_items_alloc_ << std::endl;
|
741
743
|
os << " Resize factor: " << (1 << rf_) << std::endl;
|
742
744
|
os << "### END SKETCH SUMMARY" << std::endl;
|
743
|
-
return os.str();
|
745
|
+
return string<A>(os.str().c_str(), allocator_);
|
744
746
|
}
|
745
747
|
|
746
748
|
template<typename T, typename S, typename A>
|
747
749
|
string<A> var_opt_sketch<T,S,A>::items_to_string() const {
|
748
|
-
|
750
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
751
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
752
|
+
std::ostringstream os;
|
749
753
|
os << "### Sketch Items" << std::endl;
|
750
754
|
int idx = 0;
|
751
755
|
for (auto record : *this) {
|
752
756
|
os << idx << ": " << record.first << "\twt = " << record.second << std::endl;
|
753
757
|
++idx;
|
754
758
|
}
|
755
|
-
return os.str();
|
759
|
+
return string<A>(os.str().c_str(), allocator_);
|
756
760
|
}
|
757
761
|
|
758
762
|
template<typename T, typename S, typename A>
|
759
763
|
string<A> var_opt_sketch<T,S,A>::items_to_string(bool print_gap) const {
|
760
|
-
|
764
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
765
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
766
|
+
std::ostringstream os;
|
761
767
|
os << "### Sketch Items" << std::endl;
|
762
768
|
const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1);
|
763
769
|
for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) {
|
@@ -774,7 +780,7 @@ string<A> var_opt_sketch<T,S,A>::items_to_string(bool print_gap) const {
|
|
774
780
|
++display_idx;
|
775
781
|
}
|
776
782
|
}
|
777
|
-
return os.str();
|
783
|
+
return string<A>(os.str().c_str(), allocator_);
|
778
784
|
}
|
779
785
|
|
780
786
|
template<typename T, typename S, typename A>
|
@@ -295,14 +295,16 @@ void var_opt_union<T,S,A>::reset() {
|
|
295
295
|
|
296
296
|
template<typename T, typename S, typename A>
|
297
297
|
string<A> var_opt_union<T,S,A>::to_string() const {
|
298
|
-
|
299
|
-
|
300
|
-
|
298
|
+
// Using a temporary stream for implementation here does not comply with AllocatorAwareContainer requirements.
|
299
|
+
// The stream does not support passing an allocator instance, and alternatives are complicated.
|
300
|
+
std::ostringstream os;
|
301
|
+
os << "### VarOpt Union SUMMARY:" << std::endl;
|
302
|
+
os << " n : " << n_ << std::endl;
|
301
303
|
os << " Max k : " << max_k_ << std::endl;
|
302
|
-
os << " Gadget Summary:
|
304
|
+
os << " Gadget Summary:" << std::endl;
|
303
305
|
os << gadget_.to_string();
|
304
|
-
os << "### END VarOpt Union SUMMARY
|
305
|
-
return os.str();
|
306
|
+
os << "### END VarOpt Union SUMMARY" << std::endl;
|
307
|
+
return string<A>(os.str().c_str(), gadget_.allocator_);
|
306
308
|
}
|
307
309
|
|
308
310
|
template<typename T, typename S, typename A>
|
@@ -32,53 +32,34 @@ target_include_directories(theta
|
|
32
32
|
target_link_libraries(theta INTERFACE common)
|
33
33
|
target_compile_features(theta INTERFACE cxx_std_11)
|
34
34
|
|
35
|
-
set(theta_HEADERS "")
|
36
|
-
list(APPEND theta_HEADERS "include/theta_sketch.hpp;include/theta_sketch_impl.hpp")
|
37
|
-
list(APPEND theta_HEADERS "include/theta_union.hpp;include/theta_union_impl.hpp")
|
38
|
-
list(APPEND theta_HEADERS "include/theta_intersection.hpp;include/theta_intersection_impl.hpp")
|
39
|
-
list(APPEND theta_HEADERS "include/theta_a_not_b.hpp;include/theta_a_not_b_impl.hpp")
|
40
|
-
list(APPEND theta_HEADERS "include/theta_jaccard_similarity.hpp")
|
41
|
-
list(APPEND theta_HEADERS "include/theta_comparators.hpp")
|
42
|
-
list(APPEND theta_HEADERS "include/theta_constants.hpp")
|
43
|
-
list(APPEND theta_HEADERS "include/theta_helpers.hpp")
|
44
|
-
list(APPEND theta_HEADERS "include/theta_update_sketch_base.hpp;include/theta_update_sketch_base_impl.hpp")
|
45
|
-
list(APPEND theta_HEADERS "include/theta_union_base.hpp;include/theta_union_base_impl.hpp")
|
46
|
-
list(APPEND theta_HEADERS "include/theta_intersection_base.hpp;include/theta_intersection_base_impl.hpp")
|
47
|
-
list(APPEND theta_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
|
48
|
-
list(APPEND theta_HEADERS "include/theta_jaccard_similarity_base.hpp")
|
49
|
-
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_sampled_sets.hpp")
|
50
|
-
list(APPEND theta_HEADERS "include/bounds_on_ratios_in_theta_sketched_sets.hpp")
|
51
|
-
|
52
35
|
install(TARGETS theta
|
53
36
|
EXPORT ${PROJECT_NAME}
|
54
37
|
)
|
55
38
|
|
56
|
-
install(FILES
|
39
|
+
install(FILES
|
40
|
+
include/theta_sketch.hpp
|
41
|
+
include/theta_sketch_impl.hpp
|
42
|
+
include/theta_union.hpp
|
43
|
+
include/theta_union_impl.hpp
|
44
|
+
include/theta_intersection.hpp
|
45
|
+
include/theta_intersection_impl.hpp
|
46
|
+
include/theta_a_not_b.hpp
|
47
|
+
include/theta_a_not_b_impl.hpp
|
48
|
+
include/theta_jaccard_similarity.hpp
|
49
|
+
include/theta_comparators.hpp
|
50
|
+
include/theta_constants.hpp
|
51
|
+
include/theta_helpers.hpp
|
52
|
+
include/theta_update_sketch_base.hpp
|
53
|
+
include/theta_update_sketch_base_impl.hpp
|
54
|
+
include/theta_union_base.hpp
|
55
|
+
include/theta_union_base_impl.hpp
|
56
|
+
include/theta_intersection_base.hpp
|
57
|
+
include/theta_intersection_base_impl.hpp
|
58
|
+
include/theta_set_difference_base.hpp
|
59
|
+
include/theta_set_difference_base_impl.hpp
|
60
|
+
include/theta_jaccard_similarity_base.hpp
|
61
|
+
include/bounds_on_ratios_in_sampled_sets.hpp
|
62
|
+
include/bounds_on_ratios_in_theta_sketched_sets.hpp
|
63
|
+
include/compact_theta_sketch_parser.hpp
|
64
|
+
include/compact_theta_sketch_parser_impl.hpp
|
57
65
|
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")
|
58
|
-
|
59
|
-
target_sources(theta
|
60
|
-
INTERFACE
|
61
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch.hpp
|
62
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union.hpp
|
63
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection.hpp
|
64
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b.hpp
|
65
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_impl.hpp
|
66
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_impl.hpp
|
67
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_impl.hpp
|
68
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_a_not_b_impl.hpp
|
69
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity.hpp
|
70
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_comparators.hpp
|
71
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_constants.hpp
|
72
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_helpers.hpp
|
73
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base.hpp
|
74
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_update_sketch_base_impl.hpp
|
75
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base.hpp
|
76
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_base_impl.hpp
|
77
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base.hpp
|
78
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_intersection_base_impl.hpp
|
79
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base.hpp
|
80
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_set_difference_base_impl.hpp
|
81
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/theta_jaccard_similarity_base.hpp
|
82
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_sampled_sets.hpp
|
83
|
-
${CMAKE_CURRENT_SOURCE_DIR}/include/bounds_on_ratios_in_theta_sketched_sets.hpp
|
84
|
-
)
|
@@ -29,32 +29,99 @@ template<bool dummy>
|
|
29
29
|
auto compact_theta_sketch_parser<dummy>::parse(const void* ptr, size_t size, uint64_t seed, bool dump_on_error) -> compact_theta_sketch_data {
|
30
30
|
if (size < 8) throw std::invalid_argument("at least 8 bytes expected, actual " + std::to_string(size)
|
31
31
|
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
32
|
+
|
33
|
+
uint8_t serial_version = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE];
|
34
|
+
|
35
|
+
switch(serial_version) {
|
36
|
+
case COMPACT_SKETCH_SERIAL_VERSION: {
|
37
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
38
|
+
uint64_t theta = theta_constants::MAX_THETA;
|
39
|
+
const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
|
40
|
+
if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_EMPTY_FLAG)) {
|
41
|
+
return {true, true, seed_hash, 0, theta, nullptr};
|
42
|
+
}
|
43
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
44
|
+
const bool has_theta = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] > 2;
|
45
|
+
if (has_theta) {
|
46
|
+
if (size < 16) throw std::invalid_argument("at least 16 bytes expected, actual " + std::to_string(size));
|
47
|
+
theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
48
|
+
}
|
49
|
+
if (reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE] == 1) {
|
50
|
+
return {false, true, seed_hash, 1, theta, reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_SINGLE_ENTRY_U64};
|
51
|
+
}
|
52
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
53
|
+
const size_t entries_start_u64 = has_theta ? COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 : COMPACT_SKETCH_ENTRIES_EXACT_U64;
|
54
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + entries_start_u64;
|
55
|
+
const size_t expected_size_bytes = (entries_start_u64 + num_entries) * sizeof(uint64_t);
|
56
|
+
if (size < expected_size_bytes) {
|
57
|
+
throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
58
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
59
|
+
}
|
60
|
+
const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
|
61
|
+
return {false, is_ordered, seed_hash, num_entries, theta, entries};
|
39
62
|
}
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
63
|
+
case 1: {
|
64
|
+
uint16_t seed_hash = compute_seed_hash(seed);
|
65
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
66
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
67
|
+
uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
68
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
69
|
+
if (is_empty) {
|
70
|
+
return {true, true, seed_hash, 0, theta, nullptr};
|
71
|
+
}
|
72
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
73
|
+
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
74
|
+
if (size < expected_size_bytes) {
|
75
|
+
throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
76
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
77
|
+
}
|
78
|
+
return {false, true, seed_hash, num_entries, theta, entries};
|
44
79
|
}
|
45
|
-
|
46
|
-
|
80
|
+
case 2: {
|
81
|
+
uint8_t preamble_size = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_PRE_LONGS_BYTE];
|
82
|
+
checker<true>::check_sketch_type(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_TYPE_BYTE], COMPACT_SKETCH_TYPE);
|
83
|
+
const uint16_t seed_hash = reinterpret_cast<const uint16_t*>(ptr)[COMPACT_SKETCH_SEED_HASH_U16];
|
84
|
+
checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
|
85
|
+
if (preamble_size == 1) {
|
86
|
+
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
|
87
|
+
} else if (preamble_size == 2) {
|
88
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
89
|
+
if (num_entries == 0) {
|
90
|
+
return {true, true, seed_hash, 0, theta_constants::MAX_THETA, nullptr};
|
91
|
+
} else {
|
92
|
+
const size_t expected_size_bytes = (preamble_size + num_entries) << 3;
|
93
|
+
if (size < expected_size_bytes) {
|
94
|
+
throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
95
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
96
|
+
}
|
97
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_EXACT_U64;
|
98
|
+
return {false, true, seed_hash, num_entries, theta_constants::MAX_THETA, entries};
|
99
|
+
}
|
100
|
+
} else if (preamble_size == 3) {
|
101
|
+
const uint32_t num_entries = reinterpret_cast<const uint32_t*>(ptr)[COMPACT_SKETCH_NUM_ENTRIES_U32];
|
102
|
+
uint64_t theta = reinterpret_cast<const uint64_t*>(ptr)[COMPACT_SKETCH_THETA_U64];
|
103
|
+
bool is_empty = (num_entries == 0) && (theta == theta_constants::MAX_THETA);
|
104
|
+
if (is_empty) {
|
105
|
+
return {true, true, seed_hash, 0, theta, nullptr};
|
106
|
+
}
|
107
|
+
const uint64_t* entries = reinterpret_cast<const uint64_t*>(ptr) + COMPACT_SKETCH_ENTRIES_ESTIMATION_U64;
|
108
|
+
const size_t expected_size_bytes = (COMPACT_SKETCH_ENTRIES_ESTIMATION_U64 + num_entries) * sizeof(uint64_t);
|
109
|
+
if (size < expected_size_bytes) {
|
110
|
+
throw std::invalid_argument(std::to_string(expected_size_bytes) + " bytes expected, actual " + std::to_string(size)
|
111
|
+
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
112
|
+
}
|
113
|
+
return {false, true, seed_hash, num_entries, theta, entries};
|
114
|
+
} else {
|
115
|
+
throw std::invalid_argument(std::to_string(preamble_size) + " longs of premable, but expected 1, 2, or 3");
|
116
|
+
}
|
47
117
|
}
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
+ (dump_on_error ? (", sketch dump: " + hex_dump(reinterpret_cast<const uint8_t*>(ptr), size)) : ""));
|
118
|
+
default:
|
119
|
+
// this should always fail since the valid cases are handled above
|
120
|
+
checker<true>::check_serial_version(reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_SERIAL_VERSION_BYTE], COMPACT_SKETCH_SERIAL_VERSION);
|
121
|
+
// this throw is never reached, because check_serial_version will throw an informative exception.
|
122
|
+
// This is only here to avoid a compiler warning about a path without a return value.
|
123
|
+
throw std::invalid_argument("unexpected sketch serialization version");
|
55
124
|
}
|
56
|
-
const bool is_ordered = reinterpret_cast<const uint8_t*>(ptr)[COMPACT_SKETCH_FLAGS_BYTE] & (1 << COMPACT_SKETCH_IS_ORDERED_FLAG);
|
57
|
-
return {false, is_ordered, seed_hash, num_entries, theta, entries};
|
58
125
|
}
|
59
126
|
|
60
127
|
template<bool dummy>
|
@@ -49,6 +49,21 @@ public:
|
|
49
49
|
}
|
50
50
|
};
|
51
51
|
|
52
|
+
template<bool dummy>
|
53
|
+
class theta_build_helper{
|
54
|
+
public:
|
55
|
+
// consistent way of initializing theta from p
|
56
|
+
// avoids multiplication if p == 1 since it might not yield MAX_THETA exactly
|
57
|
+
static uint64_t starting_theta_from_p(float p) {
|
58
|
+
if (p < 1) return static_cast<uint64_t>(theta_constants::MAX_THETA * p);
|
59
|
+
return theta_constants::MAX_THETA;
|
60
|
+
}
|
61
|
+
|
62
|
+
static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
63
|
+
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
64
|
+
}
|
65
|
+
};
|
66
|
+
|
52
67
|
} /* namespace datasketches */
|
53
68
|
|
54
69
|
#endif
|
@@ -29,7 +29,7 @@ template<typename EN, typename EK, typename P, typename S, typename CS, typename
|
|
29
29
|
theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
|
30
30
|
policy_(policy),
|
31
31
|
is_valid_(false),
|
32
|
-
table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
|
32
|
+
table_(0, 0, resize_factor::X1, 1, theta_constants::MAX_THETA, seed, allocator, false)
|
33
33
|
{}
|
34
34
|
|
35
35
|
template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
|
@@ -38,17 +38,17 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
38
38
|
if (table_.is_empty_) return;
|
39
39
|
if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
|
40
40
|
table_.is_empty_ |= sketch.is_empty();
|
41
|
-
table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
|
41
|
+
table_.theta_ = table_.is_empty_ ? theta_constants::MAX_THETA : std::min(table_.theta_, sketch.get_theta64());
|
42
42
|
if (is_valid_ && table_.num_entries_ == 0) return;
|
43
43
|
if (sketch.get_num_retained() == 0) {
|
44
44
|
is_valid_ = true;
|
45
|
-
table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
45
|
+
table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
46
46
|
return;
|
47
47
|
}
|
48
48
|
if (!is_valid_) { // first update, copy or move incoming sketch
|
49
49
|
is_valid_ = true;
|
50
50
|
const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
51
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
51
|
+
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
52
52
|
for (auto& entry: sketch) {
|
53
53
|
auto result = table_.find(EK()(entry));
|
54
54
|
if (result.second) {
|
@@ -83,11 +83,11 @@ void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
|
|
83
83
|
throw std::invalid_argument(" fewer keys than expected, possibly corrupted input sketch");
|
84
84
|
}
|
85
85
|
if (match_count == 0) {
|
86
|
-
table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
86
|
+
table_ = hash_table(0, 0, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
87
87
|
if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
|
88
88
|
} else {
|
89
89
|
const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
|
90
|
-
table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
90
|
+
table_ = hash_table(lg_size, lg_size, resize_factor::X1, 1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
|
91
91
|
for (uint32_t i = 0; i < match_count; i++) {
|
92
92
|
auto result = table_.find(EK()(matched_entries[i]));
|
93
93
|
table_.insert(result.first, std::move(matched_entries[i]));
|
@@ -36,7 +36,7 @@ seed_hash_(compute_seed_hash(seed))
|
|
36
36
|
template<typename EN, typename EK, typename CS, typename A>
|
37
37
|
template<typename FwdSketch, typename Sketch>
|
38
38
|
CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
|
39
|
-
if (a.is_empty() || a.get_num_retained()
|
39
|
+
if (a.is_empty() || (a.get_num_retained() > 0 && b.is_empty())) return CS(a, ordered);
|
40
40
|
if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
|
41
41
|
if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
|
42
42
|
|
@@ -53,7 +53,7 @@ CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch
|
|
53
53
|
conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
|
54
54
|
} else { // hash-based
|
55
55
|
const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
|
56
|
-
hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
|
56
|
+
hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 1, 0, 0, allocator_); // theta and seed are not used here
|
57
57
|
for (const auto& entry: b) {
|
58
58
|
const uint64_t hash = EK()(entry);
|
59
59
|
if (hash < theta) {
|
@@ -25,14 +25,10 @@
|
|
25
25
|
namespace datasketches {
|
26
26
|
|
27
27
|
template<typename Allocator = std::allocator<uint64_t>>
|
28
|
-
class
|
28
|
+
class base_theta_sketch_alloc {
|
29
29
|
public:
|
30
|
-
using Entry = uint64_t;
|
31
|
-
using ExtractKey = trivial_extract_key;
|
32
|
-
using iterator = theta_iterator<Entry, ExtractKey>;
|
33
|
-
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
34
30
|
|
35
|
-
virtual ~
|
31
|
+
virtual ~base_theta_sketch_alloc() = default;
|
36
32
|
|
37
33
|
/**
|
38
34
|
* @return allocator
|
@@ -104,6 +100,21 @@ public:
|
|
104
100
|
*/
|
105
101
|
virtual string<Allocator> to_string(bool print_items = false) const;
|
106
102
|
|
103
|
+
protected:
|
104
|
+
virtual void print_specifics(std::ostringstream& os) const = 0;
|
105
|
+
virtual void print_items(std::ostringstream& os) const = 0;
|
106
|
+
};
|
107
|
+
|
108
|
+
template<typename Allocator = std::allocator<uint64_t>>
|
109
|
+
class theta_sketch_alloc: public base_theta_sketch_alloc<Allocator> {
|
110
|
+
public:
|
111
|
+
using Entry = uint64_t;
|
112
|
+
using ExtractKey = trivial_extract_key;
|
113
|
+
using iterator = theta_iterator<Entry, ExtractKey>;
|
114
|
+
using const_iterator = theta_const_iterator<Entry, ExtractKey>;
|
115
|
+
|
116
|
+
virtual ~theta_sketch_alloc() = default;
|
117
|
+
|
107
118
|
/**
|
108
119
|
* Iterator over hash values in this sketch.
|
109
120
|
* @return begin iterator
|
@@ -131,8 +142,7 @@ public:
|
|
131
142
|
virtual const_iterator end() const = 0;
|
132
143
|
|
133
144
|
protected:
|
134
|
-
|
135
|
-
virtual void print_specifics(ostrstream& os) const = 0;
|
145
|
+
virtual void print_items(std::ostringstream& os) const;
|
136
146
|
};
|
137
147
|
|
138
148
|
// forward declaration
|
@@ -269,6 +279,11 @@ public:
|
|
269
279
|
*/
|
270
280
|
void trim();
|
271
281
|
|
282
|
+
/**
|
283
|
+
* Reset the sketch to the initial empty state
|
284
|
+
*/
|
285
|
+
void reset();
|
286
|
+
|
272
287
|
/**
|
273
288
|
* Converts this sketch to a compact sketch (ordered or unordered).
|
274
289
|
* @param ordered optional flag to specify if ordered sketch should be produced
|
@@ -285,11 +300,10 @@ private:
|
|
285
300
|
theta_table table_;
|
286
301
|
|
287
302
|
// for builder
|
288
|
-
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
|
289
|
-
uint64_t seed, const Allocator& allocator);
|
303
|
+
update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p,
|
304
|
+
uint64_t theta, uint64_t seed, const Allocator& allocator);
|
290
305
|
|
291
|
-
|
292
|
-
virtual void print_specifics(ostrstream& os) const;
|
306
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
293
307
|
};
|
294
308
|
|
295
309
|
// compact sketch
|
@@ -377,8 +391,7 @@ private:
|
|
377
391
|
uint64_t theta_;
|
378
392
|
std::vector<uint64_t, Allocator> entries_;
|
379
393
|
|
380
|
-
|
381
|
-
virtual void print_specifics(ostrstream& os) const;
|
394
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
382
395
|
};
|
383
396
|
|
384
397
|
template<typename Allocator>
|
@@ -392,7 +405,7 @@ public:
|
|
392
405
|
// It does not take the ownership of the buffer.
|
393
406
|
|
394
407
|
template<typename Allocator = std::allocator<uint64_t>>
|
395
|
-
class wrapped_compact_theta_sketch_alloc {
|
408
|
+
class wrapped_compact_theta_sketch_alloc : public base_theta_sketch_alloc<Allocator> {
|
396
409
|
public:
|
397
410
|
using const_iterator = const uint64_t*;
|
398
411
|
|
@@ -415,6 +428,10 @@ public:
|
|
415
428
|
*/
|
416
429
|
static const wrapped_compact_theta_sketch_alloc wrap(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, bool dump_on_error = false);
|
417
430
|
|
431
|
+
protected:
|
432
|
+
virtual void print_specifics(std::ostringstream& os) const;
|
433
|
+
virtual void print_items(std::ostringstream& os) const;
|
434
|
+
|
418
435
|
private:
|
419
436
|
bool is_empty_;
|
420
437
|
bool is_ordered_;
|