datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -38,78 +38,77 @@ _num_hashes(num_hashes),
|
|
|
38
38
|
_num_buckets(num_buckets),
|
|
39
39
|
_sketch_array((num_hashes*num_buckets < 1<<30) ? num_hashes*num_buckets : 0, 0, _allocator),
|
|
40
40
|
_seed(seed),
|
|
41
|
-
_total_weight(0){
|
|
42
|
-
if(num_buckets < 3) throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1.")
|
|
41
|
+
_total_weight(0) {
|
|
42
|
+
if (num_buckets < 3) throw std::invalid_argument("Using fewer than 3 buckets incurs relative error greater than 1.");
|
|
43
43
|
|
|
44
44
|
// This check is to ensure later compatibility with a Java implementation whose maximum size can only
|
|
45
45
|
// be 2^31-1. We check only against 2^30 for simplicity.
|
|
46
|
-
if(num_buckets*num_hashes >= 1<<30) {
|
|
46
|
+
if (num_buckets * num_hashes >= 1 << 30) {
|
|
47
47
|
throw std::invalid_argument("These parameters generate a sketch that exceeds 2^30 elements."
|
|
48
|
-
"Try reducing either the number of buckets or the number of hash functions.")
|
|
48
|
+
"Try reducing either the number of buckets or the number of hash functions.");
|
|
49
49
|
}
|
|
50
50
|
|
|
51
51
|
std::default_random_engine rng(_seed);
|
|
52
52
|
std::uniform_int_distribution<uint64_t> extra_hash_seeds(0, std::numeric_limits<uint64_t>::max());
|
|
53
|
-
hash_seeds.reserve(num_hashes)
|
|
53
|
+
hash_seeds.reserve(num_hashes);
|
|
54
54
|
|
|
55
|
-
for(uint64_t i=0
|
|
55
|
+
for (uint64_t i=0; i < num_hashes; ++i) {
|
|
56
56
|
hash_seeds.push_back(extra_hash_seeds(rng) + _seed); // Adds the global seed to all hash functions.
|
|
57
57
|
}
|
|
58
58
|
}
|
|
59
59
|
|
|
60
60
|
template<typename W, typename A>
|
|
61
|
-
uint8_t count_min_sketch<W,A>::get_num_hashes() const{
|
|
62
|
-
|
|
61
|
+
uint8_t count_min_sketch<W,A>::get_num_hashes() const {
|
|
62
|
+
return _num_hashes;
|
|
63
63
|
}
|
|
64
64
|
|
|
65
65
|
template<typename W, typename A>
|
|
66
|
-
uint32_t count_min_sketch<W,A>::get_num_buckets() const{
|
|
67
|
-
|
|
66
|
+
uint32_t count_min_sketch<W,A>::get_num_buckets() const {
|
|
67
|
+
return _num_buckets;
|
|
68
68
|
}
|
|
69
69
|
|
|
70
70
|
template<typename W, typename A>
|
|
71
71
|
uint64_t count_min_sketch<W,A>::get_seed() const {
|
|
72
|
-
|
|
72
|
+
return _seed;
|
|
73
73
|
}
|
|
74
74
|
|
|
75
75
|
template<typename W, typename A>
|
|
76
|
-
double count_min_sketch<W,A>::get_relative_error() const{
|
|
77
|
-
return exp(1.0) / double(_num_buckets)
|
|
76
|
+
double count_min_sketch<W,A>::get_relative_error() const {
|
|
77
|
+
return exp(1.0) / double(_num_buckets);
|
|
78
78
|
}
|
|
79
79
|
|
|
80
80
|
template<typename W, typename A>
|
|
81
|
-
W count_min_sketch<W,A>::get_total_weight() const{
|
|
82
|
-
return _total_weight
|
|
81
|
+
W count_min_sketch<W,A>::get_total_weight() const {
|
|
82
|
+
return _total_weight;
|
|
83
83
|
}
|
|
84
84
|
|
|
85
85
|
template<typename W, typename A>
|
|
86
|
-
uint32_t count_min_sketch<W,A>::suggest_num_buckets(double relative_error){
|
|
86
|
+
uint32_t count_min_sketch<W,A>::suggest_num_buckets(double relative_error) {
|
|
87
87
|
/*
|
|
88
88
|
* Function to help users select a number of buckets for a given error.
|
|
89
89
|
* TODO: Change this when we use only power of 2 buckets.
|
|
90
|
-
*
|
|
91
90
|
*/
|
|
92
|
-
if(relative_error < 0.){
|
|
93
|
-
throw std::invalid_argument(
|
|
91
|
+
if (relative_error < 0.) {
|
|
92
|
+
throw std::invalid_argument("Relative error must be at least 0.");
|
|
94
93
|
}
|
|
95
|
-
return ceil(exp(1.0) / relative_error);
|
|
94
|
+
return static_cast<uint32_t>(ceil(exp(1.0) / relative_error));
|
|
96
95
|
}
|
|
97
96
|
|
|
98
97
|
template<typename W, typename A>
|
|
99
|
-
uint8_t count_min_sketch<W,A>::suggest_num_hashes(double confidence){
|
|
98
|
+
uint8_t count_min_sketch<W,A>::suggest_num_hashes(double confidence) {
|
|
100
99
|
/*
|
|
101
100
|
* Function to help users select a number of hashes for a given confidence
|
|
102
101
|
* e.g. confidence = 1 - failure probability
|
|
103
102
|
* failure probability == delta in the literature.
|
|
104
103
|
*/
|
|
105
|
-
if(confidence < 0. || confidence > 1.0){
|
|
106
|
-
throw std::invalid_argument(
|
|
104
|
+
if (confidence < 0. || confidence > 1.0) {
|
|
105
|
+
throw std::invalid_argument("Confidence must be between 0 and 1.0 (inclusive).");
|
|
107
106
|
}
|
|
108
|
-
return std::min<uint8_t>(
|
|
107
|
+
return std::min<uint8_t>(ceil(log(1.0 / (1.0 - confidence))), UINT8_MAX);
|
|
109
108
|
}
|
|
110
109
|
|
|
111
110
|
template<typename W, typename A>
|
|
112
|
-
std::vector<uint64_t> count_min_sketch<W,A>::get_hashes(const void* item, size_t size) const{
|
|
111
|
+
std::vector<uint64_t> count_min_sketch<W,A>::get_hashes(const void* item, size_t size) const {
|
|
113
112
|
/*
|
|
114
113
|
* Returns the hash locations for the input item using the original hashing
|
|
115
114
|
* scheme from [1].
|
|
@@ -124,20 +123,20 @@ std::vector<uint64_t> count_min_sketch<W,A>::get_hashes(const void* item, size_t
|
|
|
124
123
|
* https://github.com/Claudenw/BloomFilter/wiki/Bloom-Filters----An-overview
|
|
125
124
|
* https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
|
|
126
125
|
*/
|
|
127
|
-
uint64_t bucket_index
|
|
128
|
-
std::vector<uint64_t> sketch_update_locations;
|
|
129
|
-
sketch_update_locations.reserve(_num_hashes)
|
|
126
|
+
uint64_t bucket_index;
|
|
127
|
+
std::vector<uint64_t> sketch_update_locations;
|
|
128
|
+
sketch_update_locations.reserve(_num_hashes);
|
|
130
129
|
|
|
131
|
-
uint64_t hash_seed_index = 0
|
|
132
|
-
for(const auto &it
|
|
130
|
+
uint64_t hash_seed_index = 0;
|
|
131
|
+
for (const auto &it: hash_seeds) {
|
|
133
132
|
HashState hashes;
|
|
134
133
|
MurmurHash3_x64_128(item, size, it, hashes); // ? BEWARE OVERFLOW.
|
|
135
|
-
uint64_t hash = hashes.h1
|
|
136
|
-
bucket_index = hash % _num_buckets
|
|
137
|
-
sketch_update_locations.push_back((hash_seed_index * _num_buckets) + bucket_index)
|
|
138
|
-
hash_seed_index += 1
|
|
134
|
+
uint64_t hash = hashes.h1;
|
|
135
|
+
bucket_index = hash % _num_buckets;
|
|
136
|
+
sketch_update_locations.push_back((hash_seed_index * _num_buckets) + bucket_index);
|
|
137
|
+
hash_seed_index += 1;
|
|
139
138
|
}
|
|
140
|
-
return sketch_update_locations
|
|
139
|
+
return sketch_update_locations;
|
|
141
140
|
}
|
|
142
141
|
|
|
143
142
|
template<typename W, typename A>
|
|
@@ -148,7 +147,7 @@ W count_min_sketch<W,A>::get_estimate(int64_t item) const {return get_estimate(&
|
|
|
148
147
|
|
|
149
148
|
template<typename W, typename A>
|
|
150
149
|
W count_min_sketch<W,A>::get_estimate(const std::string& item) const {
|
|
151
|
-
if (item.empty()) return 0
|
|
150
|
+
if (item.empty()) return 0; // Empty strings are not inserted into the sketch.
|
|
152
151
|
return get_estimate(item.c_str(), item.length());
|
|
153
152
|
}
|
|
154
153
|
|
|
@@ -157,13 +156,12 @@ W count_min_sketch<W,A>::get_estimate(const void* item, size_t size) const {
|
|
|
157
156
|
/*
|
|
158
157
|
* Returns the estimated frequency of the item
|
|
159
158
|
*/
|
|
160
|
-
std::vector<uint64_t> hash_locations = get_hashes(item, size)
|
|
161
|
-
std::vector<W> estimates
|
|
162
|
-
for (auto h: hash_locations){
|
|
163
|
-
estimates.push_back(_sketch_array[h])
|
|
159
|
+
std::vector<uint64_t> hash_locations = get_hashes(item, size);
|
|
160
|
+
std::vector<W> estimates;
|
|
161
|
+
for (const auto h: hash_locations) {
|
|
162
|
+
estimates.push_back(_sketch_array[h]);
|
|
164
163
|
}
|
|
165
|
-
|
|
166
|
-
return result ;
|
|
164
|
+
return *std::min_element(estimates.begin(), estimates.end());
|
|
167
165
|
}
|
|
168
166
|
|
|
169
167
|
template<typename W, typename A>
|
|
@@ -171,44 +169,27 @@ void count_min_sketch<W,A>::update(uint64_t item, W weight) {
|
|
|
171
169
|
update(&item, sizeof(item), weight);
|
|
172
170
|
}
|
|
173
171
|
|
|
174
|
-
template<typename W, typename A>
|
|
175
|
-
void count_min_sketch<W,A>::update(uint64_t item) {
|
|
176
|
-
update(&item, sizeof(item), 1);
|
|
177
|
-
}
|
|
178
|
-
|
|
179
172
|
template<typename W, typename A>
|
|
180
173
|
void count_min_sketch<W,A>::update(int64_t item, W weight) {
|
|
181
174
|
update(&item, sizeof(item), weight);
|
|
182
175
|
}
|
|
183
176
|
|
|
184
|
-
template<typename W, typename A>
|
|
185
|
-
void count_min_sketch<W,A>::update(int64_t item) {
|
|
186
|
-
update(&item, sizeof(item), 1);
|
|
187
|
-
}
|
|
188
|
-
|
|
189
177
|
template<typename W, typename A>
|
|
190
178
|
void count_min_sketch<W,A>::update(const std::string& item, W weight) {
|
|
191
179
|
if (item.empty()) return;
|
|
192
180
|
update(item.c_str(), item.length(), weight);
|
|
193
181
|
}
|
|
194
182
|
|
|
195
|
-
template<typename W, typename A>
|
|
196
|
-
void count_min_sketch<W,A>::update(const std::string& item) {
|
|
197
|
-
if (item.empty()) return;
|
|
198
|
-
update(item.c_str(), item.length(), 1);
|
|
199
|
-
}
|
|
200
|
-
|
|
201
183
|
template<typename W, typename A>
|
|
202
184
|
void count_min_sketch<W,A>::update(const void* item, size_t size, W weight) {
|
|
203
185
|
/*
|
|
204
186
|
* Gets the item's hash locations and then increments the sketch in those
|
|
205
187
|
* locations by the weight.
|
|
206
188
|
*/
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
_sketch_array[h] += weight ;
|
|
189
|
+
_total_weight += weight >= 0 ? weight : -weight;
|
|
190
|
+
std::vector<uint64_t> hash_locations = get_hashes(item, size);
|
|
191
|
+
for (const auto h: hash_locations) {
|
|
192
|
+
_sketch_array[h] += weight;
|
|
212
193
|
}
|
|
213
194
|
}
|
|
214
195
|
|
|
@@ -220,13 +201,13 @@ W count_min_sketch<W,A>::get_upper_bound(int64_t item) const {return get_upper_b
|
|
|
220
201
|
|
|
221
202
|
template<typename W, typename A>
|
|
222
203
|
W count_min_sketch<W,A>::get_upper_bound(const std::string& item) const {
|
|
223
|
-
if (item.empty()) return 0
|
|
204
|
+
if (item.empty()) return 0; // Empty strings are not inserted into the sketch.
|
|
224
205
|
return get_upper_bound(item.c_str(), item.length());
|
|
225
206
|
}
|
|
226
207
|
|
|
227
208
|
template<typename W, typename A>
|
|
228
209
|
W count_min_sketch<W,A>::get_upper_bound(const void* item, size_t size) const {
|
|
229
|
-
return get_estimate(item, size) + get_relative_error()*get_total_weight()
|
|
210
|
+
return static_cast<W>(get_estimate(item, size) + get_relative_error() * get_total_weight());
|
|
230
211
|
}
|
|
231
212
|
|
|
232
213
|
template<typename W, typename A>
|
|
@@ -237,41 +218,41 @@ W count_min_sketch<W,A>::get_lower_bound(int64_t item) const {return get_lower_b
|
|
|
237
218
|
|
|
238
219
|
template<typename W, typename A>
|
|
239
220
|
W count_min_sketch<W,A>::get_lower_bound(const std::string& item) const {
|
|
240
|
-
if (item.empty()) return 0
|
|
221
|
+
if (item.empty()) return 0; // Empty strings are not inserted into the sketch.
|
|
241
222
|
return get_lower_bound(item.c_str(), item.length());
|
|
242
223
|
}
|
|
243
224
|
|
|
244
225
|
template<typename W, typename A>
|
|
245
226
|
W count_min_sketch<W,A>::get_lower_bound(const void* item, size_t size) const {
|
|
246
|
-
return get_estimate(item, size)
|
|
227
|
+
return get_estimate(item, size);
|
|
247
228
|
}
|
|
248
229
|
|
|
249
230
|
template<typename W, typename A>
|
|
250
|
-
void count_min_sketch<W,A>::merge(const count_min_sketch &other_sketch){
|
|
231
|
+
void count_min_sketch<W,A>::merge(const count_min_sketch &other_sketch) {
|
|
251
232
|
/*
|
|
252
233
|
* Merges other_sketch into this sketch by elementwise summing of buckets
|
|
253
234
|
*/
|
|
254
|
-
if(this == &other_sketch){
|
|
235
|
+
if (this == &other_sketch) {
|
|
255
236
|
throw std::invalid_argument( "Cannot merge a sketch with itself." );
|
|
256
237
|
}
|
|
257
238
|
|
|
258
239
|
bool acceptable_config =
|
|
259
240
|
(get_num_hashes() == other_sketch.get_num_hashes()) &&
|
|
260
241
|
(get_num_buckets() == other_sketch.get_num_buckets()) &&
|
|
261
|
-
(get_seed() == other_sketch.get_seed())
|
|
262
|
-
if(!acceptable_config){
|
|
242
|
+
(get_seed() == other_sketch.get_seed());
|
|
243
|
+
if (!acceptable_config) {
|
|
263
244
|
throw std::invalid_argument( "Incompatible sketch configuration." );
|
|
264
245
|
}
|
|
265
246
|
|
|
266
247
|
// Merge step - iterate over the other vector and add the weights to this sketch
|
|
267
|
-
auto it = _sketch_array.begin()
|
|
268
|
-
auto other_it = other_sketch.begin()
|
|
269
|
-
while(it != _sketch_array.end()){
|
|
270
|
-
*it += *other_it
|
|
271
|
-
++it
|
|
272
|
-
++other_it
|
|
248
|
+
auto it = _sketch_array.begin(); // This is a std::vector iterator.
|
|
249
|
+
auto other_it = other_sketch.begin(); //This is a const iterator over the other sketch.
|
|
250
|
+
while (it != _sketch_array.end()) {
|
|
251
|
+
*it += *other_it;
|
|
252
|
+
++it;
|
|
253
|
+
++other_it;
|
|
273
254
|
}
|
|
274
|
-
_total_weight += other_sketch.get_total_weight()
|
|
255
|
+
_total_weight += other_sketch.get_total_weight();
|
|
275
256
|
}
|
|
276
257
|
|
|
277
258
|
// Iterators
|
|
@@ -291,35 +272,34 @@ void count_min_sketch<W,A>::serialize(std::ostream& os) const {
|
|
|
291
272
|
//const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_SHORT : PREAMBLE_LONGS_FULL;
|
|
292
273
|
const uint8_t preamble_longs = PREAMBLE_LONGS_SHORT;
|
|
293
274
|
const uint8_t ser_ver = SERIAL_VERSION_1;
|
|
294
|
-
const uint8_t family_id = FAMILY_ID
|
|
275
|
+
const uint8_t family_id = FAMILY_ID;
|
|
295
276
|
const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
|
|
296
|
-
const uint32_t unused32 = NULL_32
|
|
297
|
-
write(os, preamble_longs)
|
|
298
|
-
write(os, ser_ver)
|
|
299
|
-
write(os, family_id)
|
|
300
|
-
write(os, flags_byte)
|
|
301
|
-
write(os, unused32)
|
|
277
|
+
const uint32_t unused32 = NULL_32;
|
|
278
|
+
write(os, preamble_longs);
|
|
279
|
+
write(os, ser_ver);
|
|
280
|
+
write(os, family_id);
|
|
281
|
+
write(os, flags_byte);
|
|
282
|
+
write(os, unused32);
|
|
302
283
|
|
|
303
284
|
// Long 1
|
|
304
|
-
const uint32_t nbuckets = _num_buckets
|
|
305
|
-
const uint8_t nhashes = _num_hashes
|
|
285
|
+
const uint32_t nbuckets = _num_buckets;
|
|
286
|
+
const uint8_t nhashes = _num_hashes;
|
|
306
287
|
const uint16_t seed_hash(compute_seed_hash(_seed));
|
|
307
288
|
const uint8_t unused8 = NULL_8;
|
|
308
|
-
write(os, nbuckets)
|
|
309
|
-
write(os, nhashes)
|
|
310
|
-
write(os, seed_hash)
|
|
311
|
-
write(os, unused8)
|
|
312
|
-
if (is_empty()) return
|
|
289
|
+
write(os, nbuckets);
|
|
290
|
+
write(os, nhashes);
|
|
291
|
+
write(os, seed_hash);
|
|
292
|
+
write(os, unused8);
|
|
293
|
+
if (is_empty()) return; // sketch is empty, no need to write further bytes.
|
|
313
294
|
|
|
314
295
|
// Long 2
|
|
315
|
-
|
|
316
|
-
write(os, t_weight) ;
|
|
296
|
+
write(os, _total_weight);
|
|
317
297
|
|
|
318
|
-
// Long
|
|
319
|
-
auto it = _sketch_array.begin()
|
|
320
|
-
while(it != _sketch_array.end()){
|
|
321
|
-
write(os, *it)
|
|
322
|
-
++it
|
|
298
|
+
// Long 3 onwards: remaining bytes are consumed by writing the weight and the array values.
|
|
299
|
+
auto it = _sketch_array.begin();
|
|
300
|
+
while (it != _sketch_array.end()) {
|
|
301
|
+
write(os, *it);
|
|
302
|
+
++it;
|
|
323
303
|
}
|
|
324
304
|
}
|
|
325
305
|
|
|
@@ -327,40 +307,40 @@ template<typename W, typename A>
|
|
|
327
307
|
auto count_min_sketch<W,A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) -> count_min_sketch {
|
|
328
308
|
|
|
329
309
|
// First 8 bytes are 4 bytes of preamble and 4 unused bytes.
|
|
330
|
-
const auto preamble_longs = read<uint8_t>(is)
|
|
331
|
-
const auto serial_version = read<uint8_t>(is)
|
|
332
|
-
const auto family_id = read<uint8_t>(is)
|
|
333
|
-
const auto flags_byte = read<uint8_t>(is)
|
|
334
|
-
read<uint32_t>(is)
|
|
310
|
+
const auto preamble_longs = read<uint8_t>(is);
|
|
311
|
+
const auto serial_version = read<uint8_t>(is);
|
|
312
|
+
const auto family_id = read<uint8_t>(is);
|
|
313
|
+
const auto flags_byte = read<uint8_t>(is);
|
|
314
|
+
read<uint32_t>(is); // 4 unused bytes
|
|
335
315
|
|
|
336
316
|
check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
|
|
337
317
|
|
|
338
318
|
// Sketch parameters
|
|
339
|
-
const auto nbuckets = read<uint32_t>(is)
|
|
319
|
+
const auto nbuckets = read<uint32_t>(is);
|
|
340
320
|
const auto nhashes = read<uint8_t>(is);
|
|
341
|
-
const auto seed_hash = read<uint16_t>(is)
|
|
342
|
-
read<uint8_t>(is)
|
|
321
|
+
const auto seed_hash = read<uint16_t>(is);
|
|
322
|
+
read<uint8_t>(is); // 1 unused byte
|
|
343
323
|
|
|
344
324
|
if (seed_hash != compute_seed_hash(seed)) {
|
|
345
325
|
throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
|
|
346
326
|
+ std::to_string(compute_seed_hash(seed)));
|
|
347
327
|
}
|
|
348
|
-
count_min_sketch c(nhashes, nbuckets, seed, allocator)
|
|
328
|
+
count_min_sketch c(nhashes, nbuckets, seed, allocator);
|
|
349
329
|
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
|
|
350
|
-
if (is_empty == 1) return c
|
|
330
|
+
if (is_empty == 1) return c; // sketch is empty, no need to read further.
|
|
351
331
|
|
|
352
332
|
// Set the sketch weight and read in the sketch values
|
|
353
|
-
const auto weight = read<W>(is)
|
|
354
|
-
c._total_weight += weight
|
|
333
|
+
const auto weight = read<W>(is);
|
|
334
|
+
c._total_weight += weight;
|
|
355
335
|
read(is, c._sketch_array.data(), sizeof(W) * c._sketch_array.size());
|
|
356
336
|
|
|
357
|
-
return c
|
|
337
|
+
return c;
|
|
358
338
|
}
|
|
359
339
|
|
|
360
340
|
template<typename W, typename A>
|
|
361
341
|
size_t count_min_sketch<W,A>::get_serialized_size_bytes() const {
|
|
362
342
|
// The header is always 2 longs, whether empty or full
|
|
363
|
-
size_t preamble_longs = PREAMBLE_LONGS_SHORT;
|
|
343
|
+
const size_t preamble_longs = PREAMBLE_LONGS_SHORT;
|
|
364
344
|
|
|
365
345
|
// If the sketch is empty, we're done. Otherwise, we need the total weight
|
|
366
346
|
// held by the sketch as well as a data table of size (num_buckets * num_hashes)
|
|
@@ -377,33 +357,33 @@ auto count_min_sketch<W,A>::serialize(unsigned header_size_bytes) const -> vecto
|
|
|
377
357
|
ptr += copy_to_mem(preamble_longs, ptr);
|
|
378
358
|
const uint8_t ser_ver = SERIAL_VERSION_1;
|
|
379
359
|
ptr += copy_to_mem(ser_ver, ptr);
|
|
380
|
-
const uint8_t family_id = FAMILY_ID
|
|
360
|
+
const uint8_t family_id = FAMILY_ID;
|
|
381
361
|
ptr += copy_to_mem(family_id, ptr);
|
|
382
362
|
const uint8_t flags_byte = (is_empty() ? 1 << flags::IS_EMPTY : 0);
|
|
383
363
|
ptr += copy_to_mem(flags_byte, ptr);
|
|
384
|
-
const uint32_t unused32 = NULL_32
|
|
385
|
-
ptr += copy_to_mem(unused32, ptr)
|
|
364
|
+
const uint32_t unused32 = NULL_32;
|
|
365
|
+
ptr += copy_to_mem(unused32, ptr);
|
|
386
366
|
|
|
387
367
|
// Long 1
|
|
388
|
-
const uint32_t nbuckets = _num_buckets
|
|
389
|
-
const uint8_t nhashes = _num_hashes
|
|
368
|
+
const uint32_t nbuckets = _num_buckets;
|
|
369
|
+
const uint8_t nhashes = _num_hashes;
|
|
390
370
|
const uint16_t seed_hash(compute_seed_hash(_seed));
|
|
391
371
|
const uint8_t null_characters_8 = NULL_8;
|
|
392
|
-
ptr += copy_to_mem(nbuckets, ptr)
|
|
393
|
-
ptr += copy_to_mem(nhashes, ptr)
|
|
394
|
-
ptr += copy_to_mem(seed_hash, ptr)
|
|
395
|
-
ptr += copy_to_mem(null_characters_8, ptr)
|
|
396
|
-
if (is_empty()) return bytes
|
|
372
|
+
ptr += copy_to_mem(nbuckets, ptr);
|
|
373
|
+
ptr += copy_to_mem(nhashes, ptr);
|
|
374
|
+
ptr += copy_to_mem(seed_hash, ptr);
|
|
375
|
+
ptr += copy_to_mem(null_characters_8, ptr);
|
|
376
|
+
if (is_empty()) return bytes; // sketch is empty, no need to write further bytes.
|
|
397
377
|
|
|
398
378
|
// Long 2
|
|
399
|
-
const W t_weight = _total_weight
|
|
400
|
-
ptr += copy_to_mem(t_weight, ptr)
|
|
379
|
+
const W t_weight = _total_weight;
|
|
380
|
+
ptr += copy_to_mem(t_weight, ptr);
|
|
401
381
|
|
|
402
382
|
// Long 3 onwards: remaining bytes are consumed by writing the weight and the array values.
|
|
403
|
-
auto it = _sketch_array.begin()
|
|
404
|
-
while(it != _sketch_array.end()){
|
|
405
|
-
ptr += copy_to_mem(*it, ptr)
|
|
406
|
-
++it
|
|
383
|
+
auto it = _sketch_array.begin();
|
|
384
|
+
while (it != _sketch_array.end()) {
|
|
385
|
+
ptr += copy_to_mem(*it, ptr);
|
|
386
|
+
++it;
|
|
407
387
|
}
|
|
408
388
|
|
|
409
389
|
return bytes;
|
|
@@ -416,45 +396,45 @@ auto count_min_sketch<W,A>::deserialize(const void* bytes, size_t size, uint64_t
|
|
|
416
396
|
const char* ptr = static_cast<const char*>(bytes);
|
|
417
397
|
|
|
418
398
|
// First 8 bytes are 4 bytes of preamble and 4 unused bytes.
|
|
419
|
-
uint8_t preamble_longs
|
|
420
|
-
ptr += copy_from_mem(ptr, preamble_longs)
|
|
421
|
-
uint8_t serial_version
|
|
422
|
-
ptr += copy_from_mem(ptr, serial_version)
|
|
423
|
-
uint8_t family_id
|
|
424
|
-
ptr += copy_from_mem(ptr, family_id)
|
|
425
|
-
uint8_t flags_byte
|
|
426
|
-
ptr += copy_from_mem(ptr, flags_byte)
|
|
399
|
+
uint8_t preamble_longs;
|
|
400
|
+
ptr += copy_from_mem(ptr, preamble_longs);
|
|
401
|
+
uint8_t serial_version;
|
|
402
|
+
ptr += copy_from_mem(ptr, serial_version);
|
|
403
|
+
uint8_t family_id;
|
|
404
|
+
ptr += copy_from_mem(ptr, family_id);
|
|
405
|
+
uint8_t flags_byte;
|
|
406
|
+
ptr += copy_from_mem(ptr, flags_byte);
|
|
427
407
|
ptr += sizeof(uint32_t);
|
|
428
408
|
|
|
429
409
|
check_header_validity(preamble_longs, serial_version, family_id, flags_byte);
|
|
430
410
|
|
|
431
411
|
// Second 8 bytes are the sketch parameters with a final, unused byte.
|
|
432
|
-
uint32_t nbuckets
|
|
433
|
-
uint8_t nhashes
|
|
434
|
-
uint16_t seed_hash
|
|
435
|
-
ptr += copy_from_mem(ptr, nbuckets)
|
|
436
|
-
ptr += copy_from_mem(ptr, nhashes)
|
|
437
|
-
ptr += copy_from_mem(ptr, seed_hash)
|
|
412
|
+
uint32_t nbuckets;
|
|
413
|
+
uint8_t nhashes;
|
|
414
|
+
uint16_t seed_hash;
|
|
415
|
+
ptr += copy_from_mem(ptr, nbuckets);
|
|
416
|
+
ptr += copy_from_mem(ptr, nhashes);
|
|
417
|
+
ptr += copy_from_mem(ptr, seed_hash);
|
|
438
418
|
ptr += sizeof(uint8_t);
|
|
439
419
|
|
|
440
420
|
if (seed_hash != compute_seed_hash(seed)) {
|
|
441
421
|
throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
|
|
442
422
|
+ std::to_string(compute_seed_hash(seed)));
|
|
443
423
|
}
|
|
444
|
-
count_min_sketch c(nhashes, nbuckets, seed, allocator)
|
|
424
|
+
count_min_sketch c(nhashes, nbuckets, seed, allocator);
|
|
445
425
|
const bool is_empty = (flags_byte & (1 << flags::IS_EMPTY)) > 0;
|
|
446
|
-
if (is_empty) return c
|
|
426
|
+
if (is_empty) return c; // sketch is empty, no need to read further.
|
|
447
427
|
|
|
448
428
|
ensure_minimum_memory(size, sizeof(W) * (1 + nbuckets * nhashes));
|
|
449
429
|
|
|
450
430
|
// Long 2 is the weight.
|
|
451
431
|
W weight;
|
|
452
|
-
ptr += copy_from_mem(ptr, weight)
|
|
453
|
-
c._total_weight += weight
|
|
432
|
+
ptr += copy_from_mem(ptr, weight);
|
|
433
|
+
c._total_weight += weight;
|
|
454
434
|
|
|
455
435
|
// All remaining bytes are the sketch table entries.
|
|
456
|
-
for (size_t i = 0; i<c._num_buckets*c._num_hashes
|
|
457
|
-
ptr += copy_from_mem(ptr, c._sketch_array[i])
|
|
436
|
+
for (size_t i = 0; i<c._num_buckets*c._num_hashes; ++i) {
|
|
437
|
+
ptr += copy_from_mem(ptr, c._sketch_array[i]);
|
|
458
438
|
}
|
|
459
439
|
return c;
|
|
460
440
|
}
|
|
@@ -468,7 +448,7 @@ template<typename W, typename A>
|
|
|
468
448
|
string<A> count_min_sketch<W,A>::to_string() const {
|
|
469
449
|
// count the number of used entries in the sketch
|
|
470
450
|
uint64_t num_nonzero = 0;
|
|
471
|
-
for (auto entry
|
|
451
|
+
for (const auto entry: _sketch_array) {
|
|
472
452
|
if (entry != static_cast<W>(0.0))
|
|
473
453
|
++num_nonzero;
|
|
474
454
|
}
|
|
@@ -497,7 +477,7 @@ void count_min_sketch<W,A>::check_header_validity(uint8_t preamble_longs, uint8_
|
|
|
497
477
|
switch (sw) { // exhaustive list and description of all valid cases
|
|
498
478
|
case 138 : break; // !empty, ser_ver==1, family==18, preLongs=2;
|
|
499
479
|
case 139 : break; // empty, ser_ver==1, family==18, preLongs=2;
|
|
500
|
-
//case 170 : break
|
|
480
|
+
//case 170 : break; // !empty, ser_ver==1, family==18, preLongs=3;
|
|
501
481
|
default : // all other case values are invalid
|
|
502
482
|
valid = false;
|
|
503
483
|
}
|
|
@@ -507,7 +487,7 @@ void count_min_sketch<W,A>::check_header_validity(uint8_t preamble_longs, uint8_
|
|
|
507
487
|
os << "Possible sketch corruption. Inconsistent state: "
|
|
508
488
|
<< "preamble_longs = " << static_cast<uint32_t>(preamble_longs)
|
|
509
489
|
<< ", empty = " << (empty ? "true" : "false")
|
|
510
|
-
<< ", serialization_version = " << static_cast<uint32_t>(serial_version)
|
|
490
|
+
<< ", serialization_version = " << static_cast<uint32_t>(serial_version);
|
|
511
491
|
throw std::invalid_argument(os.str());
|
|
512
492
|
}
|
|
513
493
|
}
|
|
@@ -20,24 +20,23 @@ add_executable(count_min_test)
|
|
|
20
20
|
target_link_libraries(count_min_test count common_test_lib)
|
|
21
21
|
|
|
22
22
|
set_target_properties(count_min_test PROPERTIES
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
)
|
|
23
|
+
CXX_STANDARD_REQUIRED YES
|
|
24
|
+
)
|
|
26
25
|
|
|
27
26
|
file(TO_CMAKE_PATH "${CMAKE_CURRENT_SOURCE_DIR}" COUNT_TEST_BINARY_PATH)
|
|
28
27
|
string(APPEND COUNT_TEST_BINARY_PATH "/")
|
|
29
28
|
target_compile_definitions(count_min_test
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
29
|
+
PRIVATE
|
|
30
|
+
TEST_BINARY_INPUT_PATH="${COUNT_TEST_BINARY_PATH}"
|
|
31
|
+
)
|
|
33
32
|
|
|
34
33
|
add_test(
|
|
35
|
-
|
|
36
|
-
|
|
34
|
+
NAME count_min_test
|
|
35
|
+
COMMAND count_min_test
|
|
37
36
|
)
|
|
38
37
|
|
|
39
38
|
target_sources(count_min_test
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
39
|
+
PRIVATE
|
|
40
|
+
count_min_test.cpp
|
|
41
|
+
count_min_allocation_test.cpp
|
|
42
|
+
)
|