datasketches 0.3.2 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +6 -4
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +539 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
@@ -32,120 +32,120 @@ namespace datasketches {
|
|
32
32
|
using count_min_sketch_test_alloc = count_min_sketch<uint64_t, test_allocator<uint64_t>>;
|
33
33
|
using alloc = test_allocator<uint64_t>;
|
34
34
|
|
35
|
-
TEST_CASE("CountMin sketch test allocator: serialize-deserialize empty", "[cm_sketch_alloc]"){
|
35
|
+
TEST_CASE("CountMin sketch test allocator: serialize-deserialize empty", "[cm_sketch_alloc]") {
|
36
36
|
test_allocator_total_bytes = 0;
|
37
37
|
test_allocator_net_allocations = 0;
|
38
38
|
{
|
39
|
-
uint8_t n_hashes = 1
|
40
|
-
uint32_t n_buckets = 5
|
39
|
+
uint8_t n_hashes = 1;
|
40
|
+
uint32_t n_buckets = 5;
|
41
41
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
42
|
-
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0))
|
42
|
+
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
|
43
43
|
c.serialize(s);
|
44
44
|
count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0)) ;
|
45
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
46
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
47
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
45
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
46
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
47
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
48
48
|
uint64_t zero = 0;
|
49
|
-
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero))
|
50
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
49
|
+
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
|
50
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
51
51
|
|
52
52
|
// Check that all entries are equal and 0
|
53
|
-
for(auto di: d){
|
54
|
-
REQUIRE(di == 0)
|
53
|
+
for (auto di: d) {
|
54
|
+
REQUIRE(di == 0);
|
55
55
|
}
|
56
56
|
}
|
57
57
|
REQUIRE(test_allocator_total_bytes == 0);
|
58
58
|
REQUIRE(test_allocator_net_allocations == 0);
|
59
59
|
}
|
60
60
|
|
61
|
-
TEST_CASE("CountMin sketch test allocator: serialize-deserialize non-empty", "[cm_sketch_alloc]"){
|
61
|
+
TEST_CASE("CountMin sketch test allocator: serialize-deserialize non-empty", "[cm_sketch_alloc]") {
|
62
62
|
test_allocator_total_bytes = 0;
|
63
63
|
test_allocator_net_allocations = 0;
|
64
64
|
{
|
65
|
-
uint8_t n_hashes = 3
|
66
|
-
uint32_t n_buckets = 1024
|
65
|
+
uint8_t n_hashes = 3;
|
66
|
+
uint32_t n_buckets = 1024;
|
67
67
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
68
|
-
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0))
|
69
|
-
for(uint64_t i=0
|
68
|
+
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
|
69
|
+
for (uint64_t i = 0; i < 10; ++i) c.update(i, 10 * i * i);
|
70
70
|
c.serialize(s);
|
71
|
-
count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0))
|
72
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
73
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
74
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
75
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
76
|
-
for(uint64_t i=0
|
77
|
-
REQUIRE(c.get_estimate(i) == d.get_estimate(i))
|
71
|
+
count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0));
|
72
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
73
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
74
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
75
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
76
|
+
for (uint64_t i = 0; i < 10; ++i) {
|
77
|
+
REQUIRE(c.get_estimate(i) == d.get_estimate(i));
|
78
78
|
}
|
79
79
|
|
80
|
-
auto c_it = c.begin()
|
81
|
-
auto d_it = d.begin()
|
82
|
-
while(c_it != c.end()){
|
83
|
-
REQUIRE(*c_it == *d_it)
|
84
|
-
++c_it
|
85
|
-
++d_it
|
80
|
+
auto c_it = c.begin();
|
81
|
+
auto d_it = d.begin();
|
82
|
+
while (c_it != c.end()) {
|
83
|
+
REQUIRE(*c_it == *d_it);
|
84
|
+
++c_it;
|
85
|
+
++d_it;
|
86
86
|
}
|
87
87
|
}
|
88
88
|
REQUIRE(test_allocator_total_bytes == 0);
|
89
89
|
REQUIRE(test_allocator_net_allocations == 0);
|
90
90
|
}
|
91
91
|
|
92
|
-
TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize empty", "[cm_sketch_alloc]"){
|
92
|
+
TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize empty", "[cm_sketch_alloc]") {
|
93
93
|
test_allocator_total_bytes = 0;
|
94
94
|
test_allocator_net_allocations = 0;
|
95
95
|
{
|
96
|
-
uint8_t n_hashes = 3
|
97
|
-
uint32_t n_buckets = 32
|
98
|
-
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0))
|
99
|
-
auto bytes = c.serialize()
|
96
|
+
uint8_t n_hashes = 3;
|
97
|
+
uint32_t n_buckets = 32;
|
98
|
+
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
|
99
|
+
auto bytes = c.serialize();
|
100
100
|
|
101
101
|
REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument);
|
102
|
-
auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0))
|
103
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
104
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
105
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
102
|
+
auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0));
|
103
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
104
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
105
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
106
106
|
uint64_t zero = 0;
|
107
|
-
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero))
|
108
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
107
|
+
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
|
108
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
109
109
|
|
110
110
|
// Check that all entries are equal and 0
|
111
|
-
for(auto di: d){
|
112
|
-
REQUIRE(di == 0)
|
111
|
+
for (auto di: d) {
|
112
|
+
REQUIRE(di == 0);
|
113
113
|
}
|
114
114
|
}
|
115
115
|
REQUIRE(test_allocator_total_bytes == 0);
|
116
116
|
REQUIRE(test_allocator_net_allocations == 0);
|
117
117
|
}
|
118
118
|
|
119
|
-
TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize non-empty", "[cm_sketch_alloc]"){
|
119
|
+
TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize non-empty", "[cm_sketch_alloc]") {
|
120
120
|
test_allocator_total_bytes = 0;
|
121
121
|
test_allocator_net_allocations = 0;
|
122
122
|
{
|
123
|
-
uint8_t n_hashes = 5
|
124
|
-
uint32_t n_buckets = 64
|
125
|
-
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0))
|
126
|
-
for(uint64_t i=0
|
123
|
+
uint8_t n_hashes = 5;
|
124
|
+
uint32_t n_buckets = 64;
|
125
|
+
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
|
126
|
+
for (uint64_t i = 0; i < 10; ++i) c.update(i, 10 * i * i);
|
127
127
|
|
128
|
-
auto bytes = c.serialize()
|
128
|
+
auto bytes = c.serialize();
|
129
129
|
REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument);
|
130
|
-
auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0))
|
130
|
+
auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0));
|
131
131
|
|
132
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
133
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
134
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
135
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
132
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
133
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
134
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
135
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
136
136
|
|
137
137
|
// Check that all entries are equal
|
138
|
-
auto c_it = c.begin()
|
139
|
-
auto d_it = d.begin()
|
140
|
-
while(c_it != c.end()){
|
141
|
-
REQUIRE(*c_it == *d_it)
|
142
|
-
++c_it
|
143
|
-
++d_it
|
138
|
+
auto c_it = c.begin();
|
139
|
+
auto d_it = d.begin();
|
140
|
+
while (c_it != c.end()) {
|
141
|
+
REQUIRE(*c_it == *d_it);
|
142
|
+
++c_it;
|
143
|
+
++d_it;
|
144
144
|
}
|
145
145
|
|
146
146
|
// Check that the estimates agree
|
147
|
-
for(uint64_t i=0
|
148
|
-
REQUIRE(c.get_estimate(i) == d.get_estimate(i))
|
147
|
+
for (uint64_t i = 0; i < 10; ++i) {
|
148
|
+
REQUIRE(c.get_estimate(i) == d.get_estimate(i));
|
149
149
|
}
|
150
150
|
}
|
151
151
|
REQUIRE(test_allocator_total_bytes == 0);
|
@@ -26,281 +26,278 @@
|
|
26
26
|
#include "count_min.hpp"
|
27
27
|
#include "common_defs.hpp"
|
28
28
|
|
29
|
-
namespace datasketches{
|
29
|
+
namespace datasketches {
|
30
30
|
|
31
31
|
TEST_CASE("CM init - throws") {
|
32
32
|
REQUIRE_THROWS_AS(count_min_sketch<uint64_t>(5, 1), std::invalid_argument);
|
33
33
|
REQUIRE_THROWS_AS(count_min_sketch<uint64_t>(4, 268435456), std::invalid_argument);
|
34
34
|
}
|
35
35
|
|
36
|
-
TEST_CASE("CM init"){
|
37
|
-
uint8_t n_hashes = 3
|
38
|
-
uint32_t n_buckets = 5
|
39
|
-
uint64_t seed = 1234567
|
40
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed)
|
41
|
-
REQUIRE(c.get_num_hashes() == n_hashes)
|
42
|
-
REQUIRE(c.get_num_buckets() == n_buckets)
|
43
|
-
REQUIRE(c.get_seed() == seed)
|
44
|
-
REQUIRE(c.is_empty())
|
45
|
-
|
46
|
-
for(auto x: c){
|
47
|
-
REQUIRE(x == 0)
|
36
|
+
TEST_CASE("CM init") {
|
37
|
+
uint8_t n_hashes = 3;
|
38
|
+
uint32_t n_buckets = 5;
|
39
|
+
uint64_t seed = 1234567;
|
40
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed);
|
41
|
+
REQUIRE(c.get_num_hashes() == n_hashes);
|
42
|
+
REQUIRE(c.get_num_buckets() == n_buckets);
|
43
|
+
REQUIRE(c.get_seed() == seed);
|
44
|
+
REQUIRE(c.is_empty());
|
45
|
+
|
46
|
+
for (auto x: c) {
|
47
|
+
REQUIRE(x == 0);
|
48
48
|
}
|
49
49
|
|
50
50
|
// Check the default seed is appropriately set.
|
51
|
-
count_min_sketch<uint64_t> c1(n_hashes, n_buckets)
|
52
|
-
REQUIRE(c1.get_seed() == DEFAULT_SEED)
|
51
|
+
count_min_sketch<uint64_t> c1(n_hashes, n_buckets);
|
52
|
+
REQUIRE(c1.get_seed() == DEFAULT_SEED);
|
53
53
|
}
|
54
54
|
|
55
55
|
TEST_CASE("CM parameter suggestions", "[error parameters]") {
|
56
56
|
|
57
57
|
// Bucket suggestions
|
58
|
-
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_buckets(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." )
|
59
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.2) == 14)
|
60
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.1) == 28)
|
61
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.05) == 55)
|
62
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.01) == 272)
|
58
|
+
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_buckets(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." );
|
59
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.2) == 14);
|
60
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.1) == 28);
|
61
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.05) == 55);
|
62
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.01) == 272);
|
63
63
|
|
64
64
|
// Check that the sketch get_epsilon acts inversely to suggest_num_buckets
|
65
|
-
uint8_t n_hashes = 3
|
66
|
-
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 14).get_relative_error() <= 0.2)
|
67
|
-
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 28).get_relative_error() <= 0.1)
|
68
|
-
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 55).get_relative_error() <= 0.05)
|
69
|
-
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 272).get_relative_error() <= 0.01)
|
65
|
+
uint8_t n_hashes = 3;
|
66
|
+
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 14).get_relative_error() <= 0.2);
|
67
|
+
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 28).get_relative_error() <= 0.1);
|
68
|
+
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 55).get_relative_error() <= 0.05);
|
69
|
+
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 272).get_relative_error() <= 0.01);
|
70
70
|
|
71
71
|
// Hash suggestions
|
72
|
-
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." )
|
73
|
-
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." )
|
74
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.682689492) == 2)
|
75
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.954499736) == 4)
|
76
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.997300204) == 6)
|
72
|
+
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." );
|
73
|
+
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." );
|
74
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.682689492) == 2); // 1 STDDEV
|
75
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.954499736) == 4); // 2 STDDEV
|
76
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.997300204) == 6); // 3 STDDEV
|
77
77
|
}
|
78
78
|
|
79
|
-
TEST_CASE("CM one update: uint64_t"){
|
80
|
-
uint8_t n_hashes = 3
|
81
|
-
uint32_t n_buckets = 5
|
82
|
-
uint64_t seed =
|
83
|
-
uint64_t inserted_weight = 0
|
84
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed)
|
85
|
-
std::string x = "x"
|
86
|
-
|
87
|
-
REQUIRE(c.is_empty())
|
88
|
-
REQUIRE(c.get_estimate("x") == 0)
|
89
|
-
c.update(x)
|
90
|
-
REQUIRE(!c.is_empty())
|
91
|
-
REQUIRE(c.get_estimate(x) == 1)
|
92
|
-
inserted_weight += 1
|
93
|
-
|
94
|
-
uint64_t w = 9
|
95
|
-
inserted_weight += w
|
96
|
-
c.update(x, w)
|
97
|
-
REQUIRE(c.get_estimate(x) == inserted_weight)
|
79
|
+
TEST_CASE("CM one update: uint64_t") {
|
80
|
+
uint8_t n_hashes = 3;
|
81
|
+
uint32_t n_buckets = 5;
|
82
|
+
uint64_t seed = 9223372036854775807; //1234567;
|
83
|
+
uint64_t inserted_weight = 0;
|
84
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed);
|
85
|
+
std::string x = "x";
|
86
|
+
|
87
|
+
REQUIRE(c.is_empty());
|
88
|
+
REQUIRE(c.get_estimate("x") == 0); // No items in sketch so estimates should be zero
|
89
|
+
c.update(x);
|
90
|
+
REQUIRE(!c.is_empty());
|
91
|
+
REQUIRE(c.get_estimate(x) == 1);
|
92
|
+
inserted_weight += 1;
|
93
|
+
|
94
|
+
uint64_t w = 9;
|
95
|
+
inserted_weight += w;
|
96
|
+
c.update(x, w);
|
97
|
+
REQUIRE(c.get_estimate(x) == inserted_weight);
|
98
98
|
|
99
99
|
// Doubles are converted to uint64_t
|
100
|
-
double w1 = 10.0
|
101
|
-
inserted_weight += w1
|
102
|
-
c.update(x, w1)
|
103
|
-
REQUIRE(c.get_estimate(x) == inserted_weight)
|
104
|
-
REQUIRE(c.get_total_weight() == inserted_weight)
|
105
|
-
REQUIRE(c.get_estimate(x) <= c.get_upper_bound(x))
|
106
|
-
REQUIRE(c.get_estimate(x) >= c.get_lower_bound(x))
|
100
|
+
double w1 = 10.0;
|
101
|
+
inserted_weight += static_cast<uint64_t>(w1);
|
102
|
+
c.update(x, static_cast<uint64_t>(w1));
|
103
|
+
REQUIRE(c.get_estimate(x) == inserted_weight);
|
104
|
+
REQUIRE(c.get_total_weight() == inserted_weight);
|
105
|
+
REQUIRE(c.get_estimate(x) <= c.get_upper_bound(x));
|
106
|
+
REQUIRE(c.get_estimate(x) >= c.get_lower_bound(x));
|
107
107
|
}
|
108
108
|
|
109
|
-
TEST_CASE("CM frequency cancellation"){
|
110
|
-
count_min_sketch<int64_t> c(1, 5)
|
111
|
-
c.update("x")
|
112
|
-
c.update("y", -1)
|
113
|
-
REQUIRE(c.get_total_weight() == 2)
|
114
|
-
REQUIRE(c.get_estimate("x") == 1)
|
115
|
-
REQUIRE(c.get_estimate("y") == -1)
|
109
|
+
TEST_CASE("CM frequency cancellation") {
|
110
|
+
count_min_sketch<int64_t> c(1, 5);
|
111
|
+
c.update("x");
|
112
|
+
c.update("y", -1);
|
113
|
+
REQUIRE(c.get_total_weight() == 2);
|
114
|
+
REQUIRE(c.get_estimate("x") == 1);
|
115
|
+
REQUIRE(c.get_estimate("y") == -1);
|
116
116
|
}
|
117
117
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
std::vector<uint64_t>
|
122
|
-
std::vector<uint64_t> frequencies(number_of_items) ;
|
118
|
+
TEST_CASE("CM frequency estimates") {
|
119
|
+
int number_of_items = 10;
|
120
|
+
std::vector<uint64_t> data(number_of_items);
|
121
|
+
std::vector<uint64_t> frequencies(number_of_items);
|
123
122
|
|
124
123
|
// Populate data vector
|
125
|
-
for(int i=0; i < number_of_items; i
|
124
|
+
for (int i = 0; i < number_of_items; ++i) {
|
126
125
|
data[i] = i;
|
127
|
-
frequencies[i] =
|
126
|
+
frequencies[i] = 1ULL << (number_of_items - i);
|
128
127
|
}
|
129
128
|
|
130
|
-
double relative_error = 0.1
|
131
|
-
double confidence = 0.99
|
132
|
-
|
133
|
-
|
129
|
+
double relative_error = 0.1;
|
130
|
+
double confidence = 0.99;
|
131
|
+
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error);
|
132
|
+
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence);
|
134
133
|
|
135
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
136
|
-
for(int i=0
|
137
|
-
uint64_t value = data[i]
|
138
|
-
uint64_t freq = frequencies[i]
|
139
|
-
c.update(value, freq)
|
134
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
135
|
+
for (int i = 0; i < number_of_items; ++i) {
|
136
|
+
uint64_t value = data[i];
|
137
|
+
uint64_t freq = frequencies[i];
|
138
|
+
c.update(value, freq);
|
140
139
|
}
|
141
140
|
|
142
|
-
for(const auto i: data){
|
143
|
-
uint64_t est = c.get_estimate(i)
|
144
|
-
uint64_t upp = c.get_upper_bound(i)
|
145
|
-
uint64_t low = c.get_lower_bound(i)
|
146
|
-
REQUIRE(est <= upp)
|
147
|
-
REQUIRE(est >= low)
|
141
|
+
for (const auto i: data) {
|
142
|
+
uint64_t est = c.get_estimate(i);
|
143
|
+
uint64_t upp = c.get_upper_bound(i);
|
144
|
+
uint64_t low = c.get_lower_bound(i);
|
145
|
+
REQUIRE(est <= upp);
|
146
|
+
REQUIRE(est >= low);
|
148
147
|
}
|
149
148
|
}
|
150
149
|
|
151
|
-
TEST_CASE("CM merge - reject", "[reject cases]"){
|
152
|
-
double relative_error = 0.25
|
153
|
-
double confidence = 0.9
|
154
|
-
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error)
|
155
|
-
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence)
|
156
|
-
count_min_sketch<uint64_t> s(n_hashes, n_buckets, 9082435234709287)
|
157
|
-
|
150
|
+
TEST_CASE("CM merge - reject", "[reject cases]") {
|
151
|
+
double relative_error = 0.25;
|
152
|
+
double confidence = 0.9;
|
153
|
+
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error);
|
154
|
+
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence);
|
155
|
+
count_min_sketch<uint64_t> s(n_hashes, n_buckets, 9082435234709287);
|
158
156
|
|
159
157
|
// Generate sketches that we cannot merge into, i.e. they disagree on at least one of the config entries
|
160
|
-
count_min_sketch<uint64_t> s1(n_hashes+1, n_buckets)
|
161
|
-
count_min_sketch<uint64_t> s2(n_hashes, n_buckets+1)
|
162
|
-
count_min_sketch<uint64_t> s3(n_hashes, n_buckets, 1)
|
158
|
+
count_min_sketch<uint64_t> s1(n_hashes+1, n_buckets); // incorrect number of hashes
|
159
|
+
count_min_sketch<uint64_t> s2(n_hashes, n_buckets + 1); // incorrect number of buckets
|
160
|
+
count_min_sketch<uint64_t> s3(n_hashes, n_buckets, 1); // incorrect seed
|
163
161
|
std::vector<count_min_sketch<uint64_t>> sketches = {s1, s2, s3};
|
164
162
|
|
165
163
|
// Fail cases
|
166
|
-
REQUIRE_THROWS(s.merge(s), "Cannot merge a sketch with itself." )
|
167
|
-
for(count_min_sketch<uint64_t> sk : sketches){
|
168
|
-
REQUIRE_THROWS(s.merge(sk), "Incompatible sketch config." )
|
164
|
+
REQUIRE_THROWS(s.merge(s), "Cannot merge a sketch with itself." );
|
165
|
+
for (count_min_sketch<uint64_t> sk : sketches) {
|
166
|
+
REQUIRE_THROWS(s.merge(sk), "Incompatible sketch config." );
|
169
167
|
}
|
170
168
|
}
|
171
169
|
|
172
|
-
TEST_CASE("CM merge - pass", "[acceptable cases]"){
|
173
|
-
double relative_error = 0.25
|
174
|
-
double confidence = 0.9
|
175
|
-
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error)
|
176
|
-
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence)
|
177
|
-
count_min_sketch<uint64_t> s(n_hashes, n_buckets)
|
178
|
-
uint8_t s_hashes = s.get_num_hashes()
|
179
|
-
uint32_t s_buckets = s.get_num_buckets()
|
180
|
-
count_min_sketch<uint64_t> t(s_hashes, s_buckets)
|
170
|
+
TEST_CASE("CM merge - pass", "[acceptable cases]") {
|
171
|
+
double relative_error = 0.25;
|
172
|
+
double confidence = 0.9;
|
173
|
+
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error);
|
174
|
+
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence);
|
175
|
+
count_min_sketch<uint64_t> s(n_hashes, n_buckets);
|
176
|
+
uint8_t s_hashes = s.get_num_hashes();
|
177
|
+
uint32_t s_buckets = s.get_num_buckets();
|
178
|
+
count_min_sketch<uint64_t> t(s_hashes, s_buckets);
|
181
179
|
|
182
180
|
// Merge in an all-zeros sketch t. Should not change the total weight.
|
183
|
-
s.merge(t)
|
184
|
-
REQUIRE(s.get_total_weight() == 0 )
|
181
|
+
s.merge(t);
|
182
|
+
REQUIRE(s.get_total_weight() == 0 );
|
185
183
|
|
186
184
|
std::vector<uint64_t> data = {2,3,5,7};
|
187
|
-
for(auto d: data){
|
188
|
-
s.update(d)
|
189
|
-
t.update(d)
|
185
|
+
for (auto d: data) {
|
186
|
+
s.update(d);
|
187
|
+
t.update(d);
|
190
188
|
}
|
191
189
|
s.merge(t);
|
192
190
|
|
193
|
-
REQUIRE(s.get_total_weight() == 2*t.get_total_weight());
|
191
|
+
REQUIRE(s.get_total_weight() == 2 * t.get_total_weight());
|
194
192
|
|
195
193
|
// Estimator checks.
|
196
|
-
for (auto x
|
197
|
-
REQUIRE(s.get_estimate(x) <= s.get_upper_bound(x))
|
194
|
+
for (auto x: data) {
|
195
|
+
REQUIRE(s.get_estimate(x) <= s.get_upper_bound(x));
|
198
196
|
REQUIRE(s.get_estimate(x) <= 2); // True frequency x == 2 for all x.
|
199
197
|
}
|
200
198
|
}
|
201
199
|
|
202
|
-
TEST_CASE("CountMin sketch: serialize-deserialize empty", "[cm_sketch]"){
|
203
|
-
uint8_t n_hashes = 1
|
204
|
-
uint32_t n_buckets = 5
|
200
|
+
TEST_CASE("CountMin sketch: serialize-deserialize empty", "[cm_sketch]") {
|
201
|
+
uint8_t n_hashes = 1;
|
202
|
+
uint32_t n_buckets = 5;
|
205
203
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
206
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
204
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
207
205
|
c.serialize(s);
|
208
|
-
count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED)
|
209
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
210
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
211
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
206
|
+
count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED);
|
207
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
208
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
209
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
212
210
|
uint64_t zero = 0;
|
213
|
-
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero))
|
214
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
211
|
+
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
|
212
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
215
213
|
|
216
214
|
// Check that all entries are equal and 0
|
217
|
-
for(auto di: d){
|
218
|
-
REQUIRE(di == 0)
|
215
|
+
for (auto di: d) {
|
216
|
+
REQUIRE(di == 0);
|
219
217
|
}
|
220
218
|
std::ofstream os("count_min-empty.bin");
|
221
219
|
c.serialize(os);
|
222
220
|
}
|
223
221
|
|
224
|
-
TEST_CASE("CountMin sketch: serialize-deserialize non-empty", "[cm_sketch]"){
|
225
|
-
uint8_t n_hashes = 3
|
226
|
-
uint32_t n_buckets = 1024
|
222
|
+
TEST_CASE("CountMin sketch: serialize-deserialize non-empty", "[cm_sketch]") {
|
223
|
+
uint8_t n_hashes = 3;
|
224
|
+
uint32_t n_buckets = 1024;
|
227
225
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
228
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
229
|
-
for(uint64_t i=0
|
226
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
227
|
+
for (uint64_t i = 0; i < 10; ++i) c.update(i, 10 * i * i);
|
230
228
|
c.serialize(s);
|
231
|
-
count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED)
|
232
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
233
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
234
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
235
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
236
|
-
for(uint64_t i=0
|
237
|
-
REQUIRE(c.get_estimate(i) == d.get_estimate(i))
|
229
|
+
count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED);
|
230
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
231
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
232
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
233
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
234
|
+
for (uint64_t i = 0; i < 10; ++i) {
|
235
|
+
REQUIRE(c.get_estimate(i) == d.get_estimate(i));
|
238
236
|
}
|
239
237
|
|
240
|
-
auto c_it = c.begin()
|
241
|
-
auto d_it = d.begin()
|
242
|
-
while(c_it != c.end()){
|
243
|
-
REQUIRE(*c_it == *d_it)
|
244
|
-
++c_it
|
245
|
-
++d_it
|
238
|
+
auto c_it = c.begin();
|
239
|
+
auto d_it = d.begin();
|
240
|
+
while (c_it != c.end()) {
|
241
|
+
REQUIRE(*c_it == *d_it);
|
242
|
+
++c_it;
|
243
|
+
++d_it;
|
246
244
|
}
|
247
245
|
|
248
246
|
std::ofstream os("count_min-non-empty.bin");
|
249
247
|
c.serialize(os);
|
250
248
|
}
|
251
249
|
|
252
|
-
TEST_CASE("CountMin sketch: bytes serialize-deserialize empty", "[cm_sketch]"){
|
253
|
-
uint8_t n_hashes = 3
|
254
|
-
uint32_t n_buckets = 32
|
255
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
256
|
-
auto bytes = c.serialize()
|
250
|
+
TEST_CASE("CountMin sketch: bytes serialize-deserialize empty", "[cm_sketch]") {
|
251
|
+
uint8_t n_hashes = 3;
|
252
|
+
uint32_t n_buckets = 32;
|
253
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
254
|
+
auto bytes = c.serialize();
|
257
255
|
|
258
256
|
REQUIRE_THROWS_AS(count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1), std::invalid_argument);
|
259
|
-
auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED)
|
260
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
261
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
262
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
257
|
+
auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED);
|
258
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
259
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
260
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
263
261
|
uint64_t zero = 0;
|
264
|
-
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero))
|
265
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
262
|
+
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
|
263
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
266
264
|
|
267
265
|
// Check that all entries are equal and 0
|
268
|
-
for(auto di: d){
|
269
|
-
REQUIRE(di == 0)
|
266
|
+
for (auto di: d) {
|
267
|
+
REQUIRE(di == 0);
|
270
268
|
}
|
271
269
|
}
|
272
270
|
|
273
271
|
|
274
|
-
TEST_CASE("CountMin sketch: bytes serialize-deserialize non-empty", "[cm_sketch]"){
|
275
|
-
uint8_t n_hashes = 5
|
276
|
-
uint32_t n_buckets = 64
|
277
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
278
|
-
for(uint64_t i=0
|
272
|
+
TEST_CASE("CountMin sketch: bytes serialize-deserialize non-empty", "[cm_sketch]") {
|
273
|
+
uint8_t n_hashes = 5;
|
274
|
+
uint32_t n_buckets = 64;
|
275
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
276
|
+
for(uint64_t i=0; i < 10; ++i) c.update(i,10*i*i);
|
279
277
|
|
280
|
-
auto bytes = c.serialize()
|
278
|
+
auto bytes = c.serialize();
|
281
279
|
REQUIRE_THROWS_AS(count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1), std::invalid_argument);
|
282
|
-
auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED)
|
280
|
+
auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED);
|
283
281
|
|
284
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
285
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
286
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
287
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
282
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
283
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
284
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
285
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
288
286
|
|
289
287
|
// Check that all entries are equal
|
290
|
-
auto c_it = c.begin()
|
291
|
-
auto d_it = d.begin()
|
292
|
-
while(c_it != c.end()){
|
293
|
-
REQUIRE(*c_it == *d_it)
|
294
|
-
++c_it
|
295
|
-
++d_it
|
288
|
+
auto c_it = c.begin();
|
289
|
+
auto d_it = d.begin();
|
290
|
+
while (c_it != c.end()) {
|
291
|
+
REQUIRE(*c_it == *d_it);
|
292
|
+
++c_it;
|
293
|
+
++d_it;
|
296
294
|
}
|
297
295
|
|
298
296
|
// Check that the estimates agree
|
299
|
-
for(uint64_t i=0
|
300
|
-
REQUIRE(c.get_estimate(i) == d.get_estimate(i))
|
297
|
+
for (uint64_t i = 0; i < 10; ++i) {
|
298
|
+
REQUIRE(c.get_estimate(i) == d.get_estimate(i));
|
301
299
|
}
|
302
300
|
|
303
301
|
}
|
304
302
|
|
305
303
|
} /* namespace datasketches */
|
306
|
-
|