datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -26,45 +26,39 @@
|
|
|
26
26
|
|
|
27
27
|
namespace datasketches {
|
|
28
28
|
|
|
29
|
+
/// CPC constants
|
|
29
30
|
namespace cpc_constants {
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
31
|
+
/// min log2 of K
|
|
32
|
+
const uint8_t MIN_LG_K = 4;
|
|
33
|
+
/// max log2 of K
|
|
34
|
+
const uint8_t MAX_LG_K = 26;
|
|
35
|
+
/// default log2 of K
|
|
36
|
+
const uint8_t DEFAULT_LG_K = 11;
|
|
33
37
|
}
|
|
34
38
|
|
|
35
|
-
// TODO: Redundant and deprecated. Will be removed in next major version release.
|
|
36
|
-
static const uint8_t CPC_MIN_LG_K = cpc_constants::MIN_LG_K;
|
|
37
|
-
static const uint8_t CPC_MAX_LG_K = cpc_constants::MAX_LG_K;
|
|
38
|
-
static const uint8_t CPC_DEFAULT_LG_K = cpc_constants::DEFAULT_LG_K;
|
|
39
|
-
|
|
40
|
-
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
|
41
|
-
template<typename A> using AllocU16 = typename std::allocator_traits<A>::template rebind_alloc<uint16_t>;
|
|
42
|
-
template<typename A> using AllocU32 = typename std::allocator_traits<A>::template rebind_alloc<uint32_t>;
|
|
43
|
-
template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
|
|
44
|
-
|
|
45
|
-
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
|
46
|
-
template<typename A> using vector_u32 = std::vector<uint32_t, AllocU32<A>>;
|
|
47
|
-
template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
|
|
48
|
-
|
|
49
39
|
// forward declaration
|
|
50
40
|
template<typename A> class u32_table;
|
|
51
41
|
|
|
52
42
|
template<typename A>
|
|
53
43
|
struct compressed_state {
|
|
44
|
+
using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
|
|
45
|
+
|
|
54
46
|
explicit compressed_state(const A& allocator): table_data(allocator), table_data_words(0), table_num_entries(0),
|
|
55
47
|
window_data(allocator), window_data_words(0) {}
|
|
56
|
-
vector_u32<A> table_data;
|
|
48
|
+
vector_u32 table_data;
|
|
57
49
|
uint32_t table_data_words;
|
|
58
50
|
uint32_t table_num_entries; // can be different from the number of entries in the sketch in hybrid mode
|
|
59
|
-
vector_u32<A> window_data;
|
|
51
|
+
vector_u32 window_data;
|
|
60
52
|
uint32_t window_data_words;
|
|
61
53
|
};
|
|
62
54
|
|
|
63
55
|
template<typename A>
|
|
64
56
|
struct uncompressed_state {
|
|
57
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
|
58
|
+
|
|
65
59
|
explicit uncompressed_state(const A& allocator): table(allocator), window(allocator) {}
|
|
66
60
|
u32_table<A> table;
|
|
67
|
-
|
|
61
|
+
vector_bytes window;
|
|
68
62
|
};
|
|
69
63
|
|
|
70
64
|
} /* namespace datasketches */
|
|
@@ -47,6 +47,9 @@ inline cpc_compressor<A>& get_compressor();
|
|
|
47
47
|
template<typename A>
|
|
48
48
|
class cpc_compressor {
|
|
49
49
|
public:
|
|
50
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
|
51
|
+
using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
|
|
52
|
+
|
|
50
53
|
void compress(const cpc_sketch_alloc<A>& source, compressed_state<A>& target) const;
|
|
51
54
|
void uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const;
|
|
52
55
|
|
|
@@ -126,17 +129,17 @@ private:
|
|
|
126
129
|
uint16_t* make_decoding_table(const uint16_t* encoding_table, unsigned num_byte_values);
|
|
127
130
|
void validate_decoding_table(const uint16_t* decoding_table, const uint16_t* encoding_table) const;
|
|
128
131
|
|
|
129
|
-
void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
|
|
132
|
+
void compress_surprising_values(const vector_u32& pairs, uint8_t lg_k, compressed_state<A>& result) const;
|
|
130
133
|
void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
|
|
131
134
|
|
|
132
|
-
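vector_u32<A> uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs, uint8_t lg_k, const A& allocator) const;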
|
|
133
|
-
void uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
|
|
135
|
+
vector_u32 uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs, uint8_t lg_k, const A& allocator) const;
|
|
136
|
+
void uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_bytes& window, uint8_t lg_k, uint32_t num_coupons) const;
|
|
134
137
|
|
|
135
138
|
static size_t safe_length_for_compressed_pair_buf(uint32_t k, uint32_t num_pairs, uint8_t num_base_bits);
|
|
136
139
|
static size_t safe_length_for_compressed_window_buf(uint32_t k);
|
|
137
140
|
static uint8_t determine_pseudo_phase(uint8_t lg_k, uint32_t c);
|
|
138
141
|
|
|
139
|
-
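static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);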
|
|
142
|
+
static inline vector_u32 tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
|
|
140
143
|
static inline uint8_t golomb_choose_number_of_base_bits(uint32_t k, uint64_t count);
|
|
141
144
|
};
|
|
142
145
|
|
|
@@ -183,7 +183,7 @@ void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompress
|
|
|
183
183
|
template<typename A>
|
|
184
184
|
void cpc_compressor<A>::compress_sparse_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& result) const {
|
|
185
185
|
if (source.sliding_window.size() > 0) throw std::logic_error("unexpected sliding window");
|
|
186
|
-
vector_u32<A> pairs = source.surprising_value_table.unwrapping_get_items();
|
|
186
|
+
vector_u32 pairs = source.surprising_value_table.unwrapping_get_items();
|
|
187
187
|
u32_table<A>::introspective_insertion_sort(pairs.data(), 0, pairs.size());
|
|
188
188
|
compress_surprising_values(pairs, source.get_lg_k(), result);
|
|
189
189
|
}
|
|
@@ -192,7 +192,7 @@ template<typename A>
|
|
|
192
192
|
void cpc_compressor<A>::uncompress_sparse_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
|
|
193
193
|
if (source.window_data.size() > 0) throw std::logic_error("unexpected sliding window");
|
|
194
194
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
|
195
|
-
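vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,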
|
|
195
|
+
vector_u32 pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
|
196
196
|
lg_k, source.table_data.get_allocator());
|
|
197
197
|
target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k, pairs.get_allocator());
|
|
198
198
|
}
|
|
@@ -204,12 +204,12 @@ void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source
|
|
|
204
204
|
if (source.sliding_window.size() == 0) throw std::logic_error("no sliding window");
|
|
205
205
|
if (source.window_offset != 0) throw std::logic_error("window_offset != 0");
|
|
206
206
|
const uint32_t k = 1 << source.get_lg_k();
|
|
207
|
-
vector_u32<A> pairs_from_table = source.surprising_value_table.unwrapping_get_items();
|
|
207
|
+
vector_u32 pairs_from_table = source.surprising_value_table.unwrapping_get_items();
|
|
208
208
|
const uint32_t num_pairs_from_table = static_cast<uint32_t>(pairs_from_table.size());
|
|
209
209
|
if (num_pairs_from_table > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, num_pairs_from_table);
|
|
210
210
|
const uint32_t num_pairs_from_window = source.get_num_coupons() - num_pairs_from_table; // because the window offset is zero
|
|
211
211
|
|
|
212
|
-
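vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, num_pairs_from_table, source.get_allocator());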
|
|
212
|
+
vector_u32 all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, num_pairs_from_table, source.get_allocator());
|
|
213
213
|
|
|
214
214
|
u32_table<A>::merge(
|
|
215
215
|
pairs_from_table.data(), 0, pairs_from_table.size(),
|
|
@@ -224,7 +224,7 @@ template<typename A>
|
|
|
224
224
|
void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
|
|
225
225
|
if (source.window_data.size() > 0) throw std::logic_error("window is not expected");
|
|
226
226
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
|
227
|
-
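vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,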
|
|
227
|
+
vector_u32 pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
|
|
228
228
|
lg_k, source.table_data.get_allocator());
|
|
229
229
|
|
|
230
230
|
// In the hybrid flavor, some of these pairs actually
|
|
@@ -250,7 +250,7 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
|
|
|
250
250
|
template<typename A>
|
|
251
251
|
void cpc_compressor<A>::compress_pinned_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& result) const {
|
|
252
252
|
compress_sliding_window(source.sliding_window.data(), source.get_lg_k(), source.get_num_coupons(), result);
|
|
253
|
-
vector_u32<A> pairs = source.surprising_value_table.unwrapping_get_items();
|
|
253
|
+
vector_u32 pairs = source.surprising_value_table.unwrapping_get_items();
|
|
254
254
|
if (pairs.size() > 0) {
|
|
255
255
|
// Here we subtract 8 from the column indices. Because they are stored in the low 6 bits
|
|
256
256
|
// of each row_col pair, and because no column index is less than 8 for a "Pinned" sketch,
|
|
@@ -277,7 +277,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
|
|
|
277
277
|
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
|
278
278
|
} else {
|
|
279
279
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
|
280
|
-
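vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,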
|
|
280
|
+
vector_u32 pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
|
281
281
|
lg_k, source.table_data.get_allocator());
|
|
282
282
|
// undo the compressor's 8-column shift
|
|
283
283
|
for (uint32_t i = 0; i < num_pairs; i++) {
|
|
@@ -291,7 +291,7 @@ void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& sour
|
|
|
291
291
|
template<typename A>
|
|
292
292
|
void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& source, compressed_state<A>& result) const {
|
|
293
293
|
compress_sliding_window(source.sliding_window.data(), source.get_lg_k(), source.get_num_coupons(), result);
|
|
294
|
-
vector_u32<A> pairs = source.surprising_value_table.unwrapping_get_items();
|
|
294
|
+
vector_u32 pairs = source.surprising_value_table.unwrapping_get_items();
|
|
295
295
|
if (pairs.size() > 0) {
|
|
296
296
|
// Here we apply a complicated transformation to the column indices, which
|
|
297
297
|
// changes the implied ordering of the pairs, so we must do it before sorting.
|
|
@@ -330,7 +330,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
|
|
|
330
330
|
target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
|
|
331
331
|
} else {
|
|
332
332
|
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
|
|
333
|
-
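vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,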
|
|
333
|
+
vector_u32 pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
|
|
334
334
|
lg_k, source.table_data.get_allocator());
|
|
335
335
|
|
|
336
336
|
const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
|
|
@@ -356,7 +356,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
|
|
|
356
356
|
}
|
|
357
357
|
|
|
358
358
|
template<typename A>
|
|
359
|
-
void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const {
|
|
359
|
+
void cpc_compressor<A>::compress_surprising_values(const vector_u32& pairs, uint8_t lg_k, compressed_state<A>& result) const {
|
|
360
360
|
const uint32_t k = 1 << lg_k;
|
|
361
361
|
const uint32_t num_pairs = static_cast<uint32_t>(pairs.size());
|
|
362
362
|
const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
|
|
@@ -374,10 +374,10 @@ void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, u
|
|
|
374
374
|
}
|
|
375
375
|
|
|
376
376
|
template<typename A>
|
|
377
|
-
|
|
378
|
-
uint8_t lg_k, const A& allocator) const {
|
|
377
|
+
auto cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, uint32_t data_words, uint32_t num_pairs,
|
|
378
|
+
uint8_t lg_k, const A& allocator) const -> vector_u32 {
|
|
379
379
|
const uint32_t k = 1 << lg_k;
|
|
380
|
-
vector_u32<A> pairs(num_pairs, 0, allocator);
|
|
380
|
+
vector_u32 pairs(num_pairs, 0, allocator);
|
|
381
381
|
const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
|
|
382
382
|
low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
|
|
383
383
|
return pairs;
|
|
@@ -399,7 +399,7 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
|
|
|
399
399
|
}
|
|
400
400
|
|
|
401
401
|
template<typename A>
|
|
402
|
-
void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_u8<A>& window,
|
|
402
|
+
void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, uint32_t data_words, vector_bytes& window,
|
|
403
403
|
uint8_t lg_k, uint32_t num_coupons) const {
|
|
404
404
|
const uint32_t k = 1 << lg_k;
|
|
405
405
|
window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
|
|
@@ -722,10 +722,10 @@ void write_unary(
|
|
|
722
722
|
// The empty space that this leaves at the beginning of the output array
|
|
723
723
|
// will be filled in later by the caller.
|
|
724
724
|
template<typename A>
|
|
725
|
-
|
|
726
|
-
uint32_t empty_space, const A& allocator) {
|
|
725
|
+
auto cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
|
|
726
|
+
uint32_t empty_space, const A& allocator) -> vector_u32 {
|
|
727
727
|
const size_t output_length = empty_space + num_pairs_to_get;
|
|
728
|
-
vector_u32<A> pairs(output_length, 0, allocator);
|
|
728
|
+
vector_u32 pairs(output_length, 0, allocator);
|
|
729
729
|
size_t pair_index = empty_space;
|
|
730
730
|
for (unsigned row_index = 0; row_index < k; row_index++) {
|
|
731
731
|
uint8_t byte = window[row_index];
|
|
@@ -33,58 +33,58 @@
|
|
|
33
33
|
|
|
34
34
|
namespace datasketches {
|
|
35
35
|
|
|
36
|
-
|
|
37
|
-
* High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch
|
|
38
|
-
*
|
|
39
|
-
* This is a very compact (in serialized form) distinct counting sketch.
|
|
40
|
-
* The theory is described in the following paper:
|
|
41
|
-
* https://arxiv.org/abs/1708.06839
|
|
42
|
-
*
|
|
43
|
-
* author Kevin Lang
|
|
44
|
-
* author Alexander Saydakov
|
|
45
|
-
*/
|
|
46
|
-
|
|
47
|
-
// forward-declarations
|
|
36
|
+
// forward declarations
|
|
48
37
|
template<typename A> class cpc_sketch_alloc;
|
|
49
38
|
template<typename A> class cpc_union_alloc;
|
|
50
39
|
|
|
51
|
-
|
|
40
|
+
/// CPC sketch alias with default allocator
|
|
52
41
|
using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>;
|
|
53
42
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
43
|
+
/**
|
|
44
|
+
* Allocation and initialization of global decompression (decoding) tables.
|
|
45
|
+
* Call this before anything else if you want to control the initialization time.
|
|
46
|
+
* For instance, to have this happen outside of a transaction context.
|
|
47
|
+
* Otherwise initialization happens on the first use (serialization or deserialization).
|
|
48
|
+
* It is safe to call more than once assuming no race conditions.
|
|
49
|
+
* This is not thread safe! Neither is the rest of the library.
|
|
50
|
+
*/
|
|
60
51
|
template<typename A> void cpc_init();
|
|
61
52
|
|
|
53
|
+
/**
|
|
54
|
+
* High performance C++ implementation of Compressed Probabilistic Counting (CPC) Sketch
|
|
55
|
+
*
|
|
56
|
+
* This is a very compact (in serialized form) distinct counting sketch.
|
|
57
|
+
* The theory is described in the following paper:
|
|
58
|
+
* https://arxiv.org/abs/1708.06839
|
|
59
|
+
*
|
|
60
|
+
* @author Kevin Lang
|
|
61
|
+
* @author Alexander Saydakov
|
|
62
|
+
*/
|
|
62
63
|
template<typename A>
|
|
63
64
|
class cpc_sketch_alloc {
|
|
64
65
|
public:
|
|
66
|
+
using allocator_type = A;
|
|
67
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
|
68
|
+
using vector_u64 = std::vector<uint64_t, typename std::allocator_traits<A>::template rebind_alloc<uint64_t>>;
|
|
69
|
+
|
|
65
70
|
/**
|
|
66
71
|
* Creates an instance of the sketch given the lg_k parameter and hash seed.
|
|
67
72
|
* @param lg_k base 2 logarithm of the number of bins in the sketch
|
|
68
73
|
* @param seed for hash function
|
|
74
|
+
* @param allocator instance of an allocator
|
|
69
75
|
*/
|
|
70
76
|
explicit cpc_sketch_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
|
71
77
|
|
|
72
|
-
|
|
78
|
+
/// @return allocator
|
|
73
79
|
A get_allocator() const;
|
|
74
80
|
|
|
75
|
-
|
|
76
|
-
* @return configured lg_k of this sketch
|
|
77
|
-
*/
|
|
81
|
+
/// @return configured lg_k of this sketch
|
|
78
82
|
uint8_t get_lg_k() const;
|
|
79
83
|
|
|
80
|
-
|
|
81
|
-
* @return true if this sketch represents an empty set
|
|
82
|
-
*/
|
|
84
|
+
/// @return true if this sketch represents an empty set
|
|
83
85
|
bool is_empty() const;
|
|
84
86
|
|
|
85
|
-
|
|
86
|
-
* @return estimate of the distinct count of the input stream
|
|
87
|
-
*/
|
|
87
|
+
/// @return estimate of the distinct count of the input stream
|
|
88
88
|
double get_estimate() const;
|
|
89
89
|
|
|
90
90
|
/**
|
|
@@ -189,13 +189,14 @@ public:
|
|
|
189
189
|
* Otherwise two sketches that should represent overlapping sets will be disjoint
|
|
190
190
|
* For instance, for signed 32-bit values call update(int32_t) method above,
|
|
191
191
|
* which does widening conversion to int64_t, if compatibility with Java is expected
|
|
192
|
-
* @param
|
|
193
|
-
* @param
|
|
192
|
+
* @param value pointer to the data
|
|
193
|
+
* @param size of the data in bytes
|
|
194
194
|
*/
|
|
195
195
|
void update(const void* value, size_t size);
|
|
196
196
|
|
|
197
197
|
/**
|
|
198
198
|
* Returns a human-readable summary of this sketch
|
|
199
|
+
* @return a human-readable summary of this sketch
|
|
199
200
|
*/
|
|
200
201
|
string<A> to_string() const;
|
|
201
202
|
|
|
@@ -205,16 +206,13 @@ public:
|
|
|
205
206
|
*/
|
|
206
207
|
void serialize(std::ostream& os) const;
|
|
207
208
|
|
|
208
|
-
// This is a convenience alias for users
|
|
209
|
-
// The type returned by the following serialize method
|
|
210
|
-
using vector_bytes = vector_u8<A>;
|
|
211
|
-
|
|
212
209
|
/**
|
|
213
210
|
* This method serializes the sketch as a vector of bytes.
|
|
214
211
|
* An optional header can be reserved in front of the sketch.
|
|
215
212
|
* It is an uninitialized space of a given size.
|
|
216
213
|
* This header is used in Datasketches PostgreSQL extension.
|
|
217
214
|
* @param header_size_bytes space to reserve in front of the sketch
|
|
215
|
+
* @return serialized sketch as a vector of bytes
|
|
218
216
|
*/
|
|
219
217
|
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
220
218
|
|
|
@@ -222,6 +220,7 @@ public:
|
|
|
222
220
|
* This method deserializes a sketch from a given stream.
|
|
223
221
|
* @param is input stream
|
|
224
222
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
223
|
+
* @param allocator instance of an Allocator
|
|
225
224
|
* @return an instance of a sketch
|
|
226
225
|
*/
|
|
227
226
|
static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
|
@@ -231,6 +230,7 @@ public:
|
|
|
231
230
|
* @param bytes pointer to the array of bytes
|
|
232
231
|
* @param size the size of the array
|
|
233
232
|
* @param seed the seed for the hash function that was used to create the sketch
|
|
233
|
+
* @param allocator instance of an Allocator
|
|
234
234
|
* @return an instance of the sketch
|
|
235
235
|
*/
|
|
236
236
|
static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
|
@@ -246,10 +246,10 @@ public:
|
|
|
246
246
|
*/
|
|
247
247
|
static size_t get_max_serialized_size_bytes(uint8_t lg_k);
|
|
248
248
|
|
|
249
|
-
|
|
249
|
+
/// @private for internal use
|
|
250
250
|
uint32_t get_num_coupons() const;
|
|
251
251
|
|
|
252
|
-
|
|
252
|
+
/// @private for debugging
|
|
253
253
|
// this should catch some forms of corruption during serialization-deserialization
|
|
254
254
|
bool validate() const;
|
|
255
255
|
|
|
@@ -276,7 +276,7 @@ private:
|
|
|
276
276
|
uint32_t num_coupons; // the number of coupons collected so far
|
|
277
277
|
|
|
278
278
|
u32_table<A> surprising_value_table;
|
|
279
|
-
|
|
279
|
+
vector_bytes sliding_window;
|
|
280
280
|
uint8_t window_offset; // derivable from num_coupons, but made explicit for speed
|
|
281
281
|
uint8_t first_interesting_column; // This is part of a speed optimization
|
|
282
282
|
|
|
@@ -285,7 +285,7 @@ private:
|
|
|
285
285
|
|
|
286
286
|
// for deserialization and cpc_union::get_result()
|
|
287
287
|
cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column, u32_table<A>&& table,
|
|
288
|
-
|
|
288
|
+
vector_bytes&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed);
|
|
289
289
|
|
|
290
290
|
inline void row_col_update(uint32_t row_col);
|
|
291
291
|
inline void update_sparse(uint32_t row_col);
|
|
@@ -308,7 +308,7 @@ private:
|
|
|
308
308
|
static inline uint8_t determine_correct_offset(uint8_t lg_k, uint64_t c);
|
|
309
309
|
|
|
310
310
|
// this produces a full-size k-by-64 bit matrix
|
|
311
|
-
vector_u64<A> build_bit_matrix() const;
|
|
311
|
+
vector_u64 build_bit_matrix() const;
|
|
312
312
|
|
|
313
313
|
static uint8_t get_preamble_ints(uint32_t num_coupons, bool has_hip, bool has_table, bool has_window);
|
|
314
314
|
inline void write_hip(std::ostream& os) const;
|
|
@@ -315,7 +315,7 @@ void cpc_sketch_alloc<A>::move_window() {
|
|
|
315
315
|
const uint32_t k = 1 << lg_k;
|
|
316
316
|
|
|
317
317
|
// Construct the full-sized bit matrix that corresponds to the sketch
|
|
318
|
-
vector_u64<A> bit_matrix = build_bit_matrix();
|
|
318
|
+
vector_u64 bit_matrix = build_bit_matrix();
|
|
319
319
|
|
|
320
320
|
// refresh the KXP register on every 8th window shift.
|
|
321
321
|
if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data());
|
|
@@ -458,7 +458,7 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
|
458
458
|
}
|
|
459
459
|
|
|
460
460
|
template<typename A>
|
|
461
|
-
|
|
461
|
+
auto cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
|
|
462
462
|
compressed_state<A> compressed(sliding_window.get_allocator());
|
|
463
463
|
compressed.table_data_words = 0;
|
|
464
464
|
compressed.table_num_entries = 0;
|
|
@@ -469,7 +469,7 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
|
|
|
469
469
|
const bool has_window = compressed.window_data.size() > 0;
|
|
470
470
|
const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
|
|
471
471
|
const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
|
|
472
|
-
|
|
472
|
+
vector_bytes bytes(size, 0, sliding_window.get_allocator());
|
|
473
473
|
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
474
474
|
ptr += copy_to_mem(preamble_ints, ptr);
|
|
475
475
|
const uint8_t serial_version = SERIAL_VERSION;
|
|
@@ -712,15 +712,18 @@ static const size_t CPC_MAX_PREAMBLE_SIZE_BYTES = 40;
|
|
|
712
712
|
template<typename A>
|
|
713
713
|
size_t cpc_sketch_alloc<A>::get_max_serialized_size_bytes(uint8_t lg_k) {
|
|
714
714
|
check_lg_k(lg_k);
|
|
715
|
-
if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK)
|
|
715
|
+
if (lg_k <= CPC_EMPIRICAL_SIZE_MAX_LGK) {
|
|
716
|
+
return CPC_EMPIRICAL_MAX_SIZE_BYTES[lg_k - cpc_constants::MIN_LG_K] + CPC_MAX_PREAMBLE_SIZE_BYTES;
|
|
717
|
+
}
|
|
716
718
|
const uint32_t k = 1 << lg_k;
|
|
717
719
|
return (int) (CPC_EMPIRICAL_MAX_SIZE_FACTOR * k) + CPC_MAX_PREAMBLE_SIZE_BYTES;
|
|
718
720
|
}
|
|
719
721
|
|
|
720
722
|
template<typename A>
|
|
721
723
|
void cpc_sketch_alloc<A>::check_lg_k(uint8_t lg_k) {
|
|
722
|
-
if (lg_k <
|
|
723
|
-
throw std::invalid_argument("lg_k must be >= " + std::to_string(
|
|
724
|
+
if (lg_k < cpc_constants::MIN_LG_K || lg_k > cpc_constants::MAX_LG_K) {
|
|
725
|
+
throw std::invalid_argument("lg_k must be >= " + std::to_string(cpc_constants::MIN_LG_K) + " and <= "
|
|
726
|
+
+ std::to_string(cpc_constants::MAX_LG_K) + ": " + std::to_string(lg_k));
|
|
724
727
|
}
|
|
725
728
|
}
|
|
726
729
|
|
|
@@ -731,14 +734,14 @@ uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
|
|
|
731
734
|
|
|
732
735
|
template<typename A>
|
|
733
736
|
bool cpc_sketch_alloc<A>::validate() const {
|
|
734
|
-
vector_u64<A> bit_matrix = build_bit_matrix();
|
|
737
|
+
vector_u64 bit_matrix = build_bit_matrix();
|
|
735
738
|
const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1ULL << lg_k);
|
|
736
739
|
return num_bits_set == num_coupons;
|
|
737
740
|
}
|
|
738
741
|
|
|
739
742
|
template<typename A>
|
|
740
743
|
cpc_sketch_alloc<A>::cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column,
|
|
741
|
-
u32_table<A>&& table, vector_u8<A>&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed):
|
|
744
|
+
u32_table<A>&& table, vector_bytes&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed):
|
|
742
745
|
lg_k(lg_k),
|
|
743
746
|
seed(seed),
|
|
744
747
|
was_merged(!has_hip),
|
|
@@ -800,14 +803,14 @@ uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c)
|
|
|
800
803
|
}
|
|
801
804
|
|
|
802
805
|
template<typename A>
|
|
803
|
-
|
|
806
|
+
auto cpc_sketch_alloc<A>::build_bit_matrix() const -> vector_u64 {
|
|
804
807
|
const uint32_t k = 1 << lg_k;
|
|
805
808
|
if (window_offset > 56) throw std::logic_error("offset > 56");
|
|
806
809
|
|
|
807
810
|
// Fill the matrix with default rows in which the "early zone" is filled with ones.
|
|
808
811
|
// This is essential for the routine's O(k) time cost (as opposed to O(C)).
|
|
809
812
|
const uint64_t default_row = (static_cast<uint64_t>(1) << window_offset) - 1;
|
|
810
|
-
vector_u64<A> matrix(k, default_row, sliding_window.get_allocator());
|
|
813
|
+
vector_u64 matrix(k, default_row, sliding_window.get_allocator());
|
|
811
814
|
|
|
812
815
|
if (num_coupons == 0) return matrix;
|
|
813
816
|
|
|
@@ -27,31 +27,55 @@
|
|
|
27
27
|
|
|
28
28
|
namespace datasketches {
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
/// CPC union alias with default allocator
|
|
31
|
+
using cpc_union = cpc_union_alloc<std::allocator<uint8_t>>;
|
|
32
|
+
|
|
33
|
+
/**
|
|
31
34
|
* High performance C++ implementation of Compressed Probabilistic Counting (CPC) Union
|
|
32
35
|
*
|
|
33
36
|
* author Kevin Lang
|
|
34
37
|
* author Alexander Saydakov
|
|
35
38
|
*/
|
|
36
|
-
|
|
37
|
-
// alias with default allocator for convenience
|
|
38
|
-
using cpc_union = cpc_union_alloc<std::allocator<uint8_t>>;
|
|
39
|
-
|
|
40
39
|
template<typename A>
|
|
41
40
|
class cpc_union_alloc {
|
|
42
41
|
public:
|
|
42
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
|
43
|
+
using vector_u64 = std::vector<uint64_t, typename std::allocator_traits<A>::template rebind_alloc<uint64_t>>;
|
|
44
|
+
|
|
43
45
|
/**
|
|
44
46
|
* Creates an instance of the union given the lg_k parameter and hash seed.
|
|
45
47
|
* @param lg_k base 2 logarithm of the number of bins in the sketch
|
|
46
48
|
* @param seed for hash function
|
|
49
|
+
* @param allocator instance of an allocator
|
|
47
50
|
*/
|
|
48
51
|
explicit cpc_union_alloc(uint8_t lg_k = cpc_constants::DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
|
|
49
52
|
|
|
53
|
+
/**
|
|
54
|
+
* Copy constructor
|
|
55
|
+
* @param other union to be copied
|
|
56
|
+
*/
|
|
50
57
|
cpc_union_alloc(const cpc_union_alloc<A>& other);
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Move constructor
|
|
61
|
+
* @param other union to be moved
|
|
62
|
+
*/
|
|
51
63
|
cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
|
|
64
|
+
|
|
52
65
|
~cpc_union_alloc();
|
|
53
66
|
|
|
67
|
+
/**
|
|
68
|
+
* Copy assignment
|
|
69
|
+
* @param other union to be copied
|
|
70
|
+
* @return reference to this union
|
|
71
|
+
*/
|
|
54
72
|
cpc_union_alloc<A>& operator=(const cpc_union_alloc<A>& other);
|
|
73
|
+
|
|
74
|
+
/**
|
|
75
|
+
* Move assignment
|
|
76
|
+
* @param other union to be moved
|
|
77
|
+
* @return reference to this union
|
|
78
|
+
*/
|
|
55
79
|
cpc_union_alloc<A>& operator=(cpc_union_alloc<A>&& other) noexcept;
|
|
56
80
|
|
|
57
81
|
/**
|
|
@@ -73,14 +97,14 @@ public:
|
|
|
73
97
|
cpc_sketch_alloc<A> get_result() const;
|
|
74
98
|
|
|
75
99
|
private:
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
100
|
+
using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
|
101
|
+
using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
|
|
102
|
+
using AllocCpc = typename std::allocator_traits<A>::template rebind_alloc<cpc_sketch_alloc<A>>;
|
|
79
103
|
|
|
80
104
|
uint8_t lg_k;
|
|
81
105
|
uint64_t seed;
|
|
82
106
|
cpc_sketch_alloc<A>* accumulator;
|
|
83
|
-
vector_u64<A> bit_matrix;
|
|
107
|
+
vector_u64 bit_matrix;
|
|
84
108
|
|
|
85
109
|
template<typename S> void internal_update(S&& sketch); // to support both rvalue and lvalue
|
|
86
110
|
|
|
@@ -90,8 +114,8 @@ private:
|
|
|
90
114
|
void switch_to_bit_matrix();
|
|
91
115
|
void walk_table_updating_sketch(const u32_table<A>& table);
|
|
92
116
|
void or_table_into_matrix(const u32_table<A>& table);
|
|
93
|
-
void or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k);
|
|
94
|
-
void or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k);
|
|
117
|
+
void or_window_into_matrix(const vector_bytes& sliding_window, uint8_t offset, uint8_t src_lg_k);
|
|
118
|
+
void or_matrix_into_matrix(const vector_u64& src_matrix, uint8_t src_lg_k);
|
|
95
119
|
void reduce_k(uint8_t new_lg_k);
|
|
96
120
|
};
|
|
97
121
|
|
|
@@ -33,8 +33,8 @@ seed(seed),
|
|
|
33
33
|
accumulator(nullptr),
|
|
34
34
|
bit_matrix(allocator)
|
|
35
35
|
{
|
|
36
|
-
if (lg_k <
|
|
37
|
-
throw std::invalid_argument("lg_k must be >= " + std::to_string(
|
|
36
|
+
if (lg_k < cpc_constants::MIN_LG_K || lg_k > cpc_constants::MAX_LG_K) {
|
|
37
|
+
throw std::invalid_argument("lg_k must be >= " + std::to_string(cpc_constants::MIN_LG_K) + " and <= " + std::to_string(cpc_constants::MAX_LG_K) + ": " + std::to_string(lg_k));
|
|
38
38
|
}
|
|
39
39
|
accumulator = new (AllocCpc(allocator).allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
|
|
40
40
|
}
|
|
@@ -166,7 +166,7 @@ void cpc_union_alloc<A>::internal_update(S&& sketch) {
|
|
|
166
166
|
// SLIDING mode involves inverted logic, so we can't just walk the source sketch.
|
|
167
167
|
// Instead, we convert it to a bitMatrix that can be OR'ed into the destination.
|
|
168
168
|
if (cpc_sketch_alloc<A>::flavor::SLIDING != src_flavor) throw std::logic_error("wrong flavor"); // Case D
|
|
169
|
-
vector_u64<A> src_matrix = sketch.build_bit_matrix();
|
|
169
|
+
vector_u64 src_matrix = sketch.build_bit_matrix();
|
|
170
170
|
or_matrix_into_matrix(src_matrix, sketch.get_lg_k());
|
|
171
171
|
}
|
|
172
172
|
|
|
@@ -203,7 +203,7 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
|
|
|
203
203
|
|
|
204
204
|
const uint8_t offset = cpc_sketch_alloc<A>::determine_correct_offset(lg_k, num_coupons);
|
|
205
205
|
|
|
206
|
-
|
|
206
|
+
vector_bytes sliding_window(k, 0, bit_matrix.get_allocator());
|
|
207
207
|
// don't need to zero the window's memory
|
|
208
208
|
|
|
209
209
|
// dynamically growing caused snowplow effect
|
|
@@ -289,7 +289,7 @@ void cpc_union_alloc<A>::or_table_into_matrix(const u32_table<A>& table) {
|
|
|
289
289
|
}
|
|
290
290
|
|
|
291
291
|
template<typename A>
|
|
292
|
-
void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_window, uint8_t offset, uint8_t src_lg_k) {
|
|
292
|
+
void cpc_union_alloc<A>::or_window_into_matrix(const vector_bytes& sliding_window, uint8_t offset, uint8_t src_lg_k) {
|
|
293
293
|
if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
|
|
294
294
|
const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
|
|
295
295
|
const uint32_t src_k = 1 << src_lg_k;
|
|
@@ -299,7 +299,7 @@ void cpc_union_alloc<A>::or_window_into_matrix(const vector_u8<A>& sliding_windo
|
|
|
299
299
|
}
|
|
300
300
|
|
|
301
301
|
template<typename A>
|
|
302
|
-
void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64<A>& src_matrix, uint8_t src_lg_k) {
|
|
302
|
+
void cpc_union_alloc<A>::or_matrix_into_matrix(const vector_u64& src_matrix, uint8_t src_lg_k) {
|
|
303
303
|
if (lg_k > src_lg_k) throw std::logic_error("dst LgK > src LgK");
|
|
304
304
|
const uint64_t dst_mask = (1 << lg_k) - 1; // downsamples when dst lgK < src LgK
|
|
305
305
|
const uint32_t src_k = 1 << src_lg_k;
|
|
@@ -315,10 +315,10 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
|
|
|
315
315
|
|
|
316
316
|
if (bit_matrix.size() > 0) { // downsample the unioner's bit matrix
|
|
317
317
|
if (accumulator != nullptr) throw std::logic_error("accumulator is not null");
|
|
318
|
-
vector_u64<A> old_matrix = std::move(bit_matrix);
|
|
318
|
+
vector_u64 old_matrix = std::move(bit_matrix);
|
|
319
319
|
const uint8_t old_lg_k = lg_k;
|
|
320
320
|
const uint32_t new_k = 1 << new_lg_k;
|
|
321
|
-
bit_matrix = vector_u64<A>(new_k, 0, old_matrix.get_allocator());
|
|
321
|
+
bit_matrix = vector_u64(new_k, 0, old_matrix.get_allocator());
|
|
322
322
|
lg_k = new_lg_k;
|
|
323
323
|
or_matrix_into_matrix(old_matrix, old_lg_k);
|
|
324
324
|
return;
|
|
@@ -38,6 +38,7 @@ static const uint32_t U32_TABLE_DOWNSIZE_DENOM = 4LL;
|
|
|
38
38
|
template<typename A>
|
|
39
39
|
class u32_table {
|
|
40
40
|
public:
|
|
41
|
+
using vector_u32 = std::vector<uint32_t, typename std::allocator_traits<A>::template rebind_alloc<uint32_t>>;
|
|
41
42
|
|
|
42
43
|
u32_table(const A& allocator);
|
|
43
44
|
u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator);
|
|
@@ -54,7 +55,7 @@ public:
|
|
|
54
55
|
|
|
55
56
|
static u32_table make_from_pairs(const uint32_t* pairs, uint32_t num_pairs, uint8_t lg_k, const A& allocator);
|
|
56
57
|
|
|
57
|
-
vector_u32<A> unwrapping_get_items() const;
|
|
58
|
+
vector_u32 unwrapping_get_items() const;
|
|
58
59
|
|
|
59
60
|
static void merge(
|
|
60
61
|
const uint32_t* arr_a, size_t start_a, size_t length_a, // input
|
|
@@ -70,7 +71,7 @@ private:
|
|
|
70
71
|
uint8_t lg_size; // log2 of number of slots
|
|
71
72
|
uint8_t num_valid_bits;
|
|
72
73
|
uint32_t num_items;
|
|
73
|
-
vector_u32<A> slots;
|
|
74
|
+
vector_u32 slots;
|
|
74
75
|
|
|
75
76
|
inline uint32_t lookup(uint32_t item) const;
|
|
76
77
|
inline void must_insert(uint32_t item);
|