datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -32,120 +32,120 @@ namespace datasketches {
|
|
|
32
32
|
using count_min_sketch_test_alloc = count_min_sketch<uint64_t, test_allocator<uint64_t>>;
|
|
33
33
|
using alloc = test_allocator<uint64_t>;
|
|
34
34
|
|
|
35
|
-
TEST_CASE("CountMin sketch test allocator: serialize-deserialize empty", "[cm_sketch_alloc]"){
|
|
35
|
+
TEST_CASE("CountMin sketch test allocator: serialize-deserialize empty", "[cm_sketch_alloc]") {
|
|
36
36
|
test_allocator_total_bytes = 0;
|
|
37
37
|
test_allocator_net_allocations = 0;
|
|
38
38
|
{
|
|
39
|
-
uint8_t n_hashes = 1
|
|
40
|
-
uint32_t n_buckets = 5
|
|
39
|
+
uint8_t n_hashes = 1;
|
|
40
|
+
uint32_t n_buckets = 5;
|
|
41
41
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
42
|
-
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0))
|
|
42
|
+
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
|
|
43
43
|
c.serialize(s);
|
|
44
44
|
count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0)) ;
|
|
45
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
|
46
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
|
47
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
|
45
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
|
46
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
|
47
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
|
48
48
|
uint64_t zero = 0;
|
|
49
|
-
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero))
|
|
50
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
|
49
|
+
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
|
|
50
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
|
51
51
|
|
|
52
52
|
// Check that all entries are equal and 0
|
|
53
|
-
for(auto di: d){
|
|
54
|
-
REQUIRE(di == 0)
|
|
53
|
+
for (auto di: d) {
|
|
54
|
+
REQUIRE(di == 0);
|
|
55
55
|
}
|
|
56
56
|
}
|
|
57
57
|
REQUIRE(test_allocator_total_bytes == 0);
|
|
58
58
|
REQUIRE(test_allocator_net_allocations == 0);
|
|
59
59
|
}
|
|
60
60
|
|
|
61
|
-
TEST_CASE("CountMin sketch test allocator: serialize-deserialize non-empty", "[cm_sketch_alloc]"){
|
|
61
|
+
TEST_CASE("CountMin sketch test allocator: serialize-deserialize non-empty", "[cm_sketch_alloc]") {
|
|
62
62
|
test_allocator_total_bytes = 0;
|
|
63
63
|
test_allocator_net_allocations = 0;
|
|
64
64
|
{
|
|
65
|
-
uint8_t n_hashes = 3
|
|
66
|
-
uint32_t n_buckets = 1024
|
|
65
|
+
uint8_t n_hashes = 3;
|
|
66
|
+
uint32_t n_buckets = 1024;
|
|
67
67
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
68
|
-
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0))
|
|
69
|
-
for(uint64_t i=0
|
|
68
|
+
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
|
|
69
|
+
for (uint64_t i = 0; i < 10; ++i) c.update(i, 10 * i * i);
|
|
70
70
|
c.serialize(s);
|
|
71
|
-
count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0))
|
|
72
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
|
73
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
|
74
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
|
75
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
|
76
|
-
for(uint64_t i=0
|
|
77
|
-
REQUIRE(c.get_estimate(i) == d.get_estimate(i))
|
|
71
|
+
count_min_sketch_test_alloc d = count_min_sketch_test_alloc::deserialize(s, DEFAULT_SEED, alloc(0));
|
|
72
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
|
73
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
|
74
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
|
75
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
|
76
|
+
for (uint64_t i = 0; i < 10; ++i) {
|
|
77
|
+
REQUIRE(c.get_estimate(i) == d.get_estimate(i));
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
-
auto c_it = c.begin()
|
|
81
|
-
auto d_it = d.begin()
|
|
82
|
-
while(c_it != c.end()){
|
|
83
|
-
REQUIRE(*c_it == *d_it)
|
|
84
|
-
++c_it
|
|
85
|
-
++d_it
|
|
80
|
+
auto c_it = c.begin();
|
|
81
|
+
auto d_it = d.begin();
|
|
82
|
+
while (c_it != c.end()) {
|
|
83
|
+
REQUIRE(*c_it == *d_it);
|
|
84
|
+
++c_it;
|
|
85
|
+
++d_it;
|
|
86
86
|
}
|
|
87
87
|
}
|
|
88
88
|
REQUIRE(test_allocator_total_bytes == 0);
|
|
89
89
|
REQUIRE(test_allocator_net_allocations == 0);
|
|
90
90
|
}
|
|
91
91
|
|
|
92
|
-
TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize empty", "[cm_sketch_alloc]"){
|
|
92
|
+
TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize empty", "[cm_sketch_alloc]") {
|
|
93
93
|
test_allocator_total_bytes = 0;
|
|
94
94
|
test_allocator_net_allocations = 0;
|
|
95
95
|
{
|
|
96
|
-
uint8_t n_hashes = 3
|
|
97
|
-
uint32_t n_buckets = 32
|
|
98
|
-
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0))
|
|
99
|
-
auto bytes = c.serialize()
|
|
96
|
+
uint8_t n_hashes = 3;
|
|
97
|
+
uint32_t n_buckets = 32;
|
|
98
|
+
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
|
|
99
|
+
auto bytes = c.serialize();
|
|
100
100
|
|
|
101
101
|
REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument);
|
|
102
|
-
auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0))
|
|
103
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
|
104
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
|
105
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
|
102
|
+
auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0));
|
|
103
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
|
104
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
|
105
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
|
106
106
|
uint64_t zero = 0;
|
|
107
|
-
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero))
|
|
108
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
|
107
|
+
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
|
|
108
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
|
109
109
|
|
|
110
110
|
// Check that all entries are equal and 0
|
|
111
|
-
for(auto di: d){
|
|
112
|
-
REQUIRE(di == 0)
|
|
111
|
+
for (auto di: d) {
|
|
112
|
+
REQUIRE(di == 0);
|
|
113
113
|
}
|
|
114
114
|
}
|
|
115
115
|
REQUIRE(test_allocator_total_bytes == 0);
|
|
116
116
|
REQUIRE(test_allocator_net_allocations == 0);
|
|
117
117
|
}
|
|
118
118
|
|
|
119
|
-
TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize non-empty", "[cm_sketch_alloc]"){
|
|
119
|
+
TEST_CASE("CountMin sketch test allocator: bytes serialize-deserialize non-empty", "[cm_sketch_alloc]") {
|
|
120
120
|
test_allocator_total_bytes = 0;
|
|
121
121
|
test_allocator_net_allocations = 0;
|
|
122
122
|
{
|
|
123
|
-
uint8_t n_hashes = 5
|
|
124
|
-
uint32_t n_buckets = 64
|
|
125
|
-
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0))
|
|
126
|
-
for(uint64_t i=0
|
|
123
|
+
uint8_t n_hashes = 5;
|
|
124
|
+
uint32_t n_buckets = 64;
|
|
125
|
+
count_min_sketch_test_alloc c(n_hashes, n_buckets, DEFAULT_SEED, alloc(0));
|
|
126
|
+
for (uint64_t i = 0; i < 10; ++i) c.update(i, 10 * i * i);
|
|
127
127
|
|
|
128
|
-
auto bytes = c.serialize()
|
|
128
|
+
auto bytes = c.serialize();
|
|
129
129
|
REQUIRE_THROWS_AS(count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1, alloc(0)), std::invalid_argument);
|
|
130
|
-
auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0))
|
|
130
|
+
auto d = count_min_sketch_test_alloc::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED, alloc(0));
|
|
131
131
|
|
|
132
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
|
133
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
|
134
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
|
135
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
|
132
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
|
133
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
|
134
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
|
135
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
|
136
136
|
|
|
137
137
|
// Check that all entries are equal
|
|
138
|
-
auto c_it = c.begin()
|
|
139
|
-
auto d_it = d.begin()
|
|
140
|
-
while(c_it != c.end()){
|
|
141
|
-
REQUIRE(*c_it == *d_it)
|
|
142
|
-
++c_it
|
|
143
|
-
++d_it
|
|
138
|
+
auto c_it = c.begin();
|
|
139
|
+
auto d_it = d.begin();
|
|
140
|
+
while (c_it != c.end()) {
|
|
141
|
+
REQUIRE(*c_it == *d_it);
|
|
142
|
+
++c_it;
|
|
143
|
+
++d_it;
|
|
144
144
|
}
|
|
145
145
|
|
|
146
146
|
// Check that the estimates agree
|
|
147
|
-
for(uint64_t i=0
|
|
148
|
-
REQUIRE(c.get_estimate(i) == d.get_estimate(i))
|
|
147
|
+
for (uint64_t i = 0; i < 10; ++i) {
|
|
148
|
+
REQUIRE(c.get_estimate(i) == d.get_estimate(i));
|
|
149
149
|
}
|
|
150
150
|
}
|
|
151
151
|
REQUIRE(test_allocator_total_bytes == 0);
|
|
@@ -26,281 +26,278 @@
|
|
|
26
26
|
#include "count_min.hpp"
|
|
27
27
|
#include "common_defs.hpp"
|
|
28
28
|
|
|
29
|
-
namespace datasketches{
|
|
29
|
+
namespace datasketches {
|
|
30
30
|
|
|
31
31
|
TEST_CASE("CM init - throws") {
|
|
32
32
|
REQUIRE_THROWS_AS(count_min_sketch<uint64_t>(5, 1), std::invalid_argument);
|
|
33
33
|
REQUIRE_THROWS_AS(count_min_sketch<uint64_t>(4, 268435456), std::invalid_argument);
|
|
34
34
|
}
|
|
35
35
|
|
|
36
|
-
TEST_CASE("CM init"){
|
|
37
|
-
uint8_t n_hashes = 3
|
|
38
|
-
uint32_t n_buckets = 5
|
|
39
|
-
uint64_t seed = 1234567
|
|
40
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed)
|
|
41
|
-
REQUIRE(c.get_num_hashes() == n_hashes)
|
|
42
|
-
REQUIRE(c.get_num_buckets() == n_buckets)
|
|
43
|
-
REQUIRE(c.get_seed() == seed)
|
|
44
|
-
REQUIRE(c.is_empty())
|
|
45
|
-
|
|
46
|
-
for(auto x: c){
|
|
47
|
-
REQUIRE(x == 0)
|
|
36
|
+
TEST_CASE("CM init") {
|
|
37
|
+
uint8_t n_hashes = 3;
|
|
38
|
+
uint32_t n_buckets = 5;
|
|
39
|
+
uint64_t seed = 1234567;
|
|
40
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed);
|
|
41
|
+
REQUIRE(c.get_num_hashes() == n_hashes);
|
|
42
|
+
REQUIRE(c.get_num_buckets() == n_buckets);
|
|
43
|
+
REQUIRE(c.get_seed() == seed);
|
|
44
|
+
REQUIRE(c.is_empty());
|
|
45
|
+
|
|
46
|
+
for (auto x: c) {
|
|
47
|
+
REQUIRE(x == 0);
|
|
48
48
|
}
|
|
49
49
|
|
|
50
50
|
// Check the default seed is appropriately set.
|
|
51
|
-
count_min_sketch<uint64_t> c1(n_hashes, n_buckets)
|
|
52
|
-
REQUIRE(c1.get_seed() == DEFAULT_SEED)
|
|
51
|
+
count_min_sketch<uint64_t> c1(n_hashes, n_buckets);
|
|
52
|
+
REQUIRE(c1.get_seed() == DEFAULT_SEED);
|
|
53
53
|
}
|
|
54
54
|
|
|
55
55
|
TEST_CASE("CM parameter suggestions", "[error parameters]") {
|
|
56
56
|
|
|
57
57
|
// Bucket suggestions
|
|
58
|
-
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_buckets(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." )
|
|
59
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.2) == 14)
|
|
60
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.1) == 28)
|
|
61
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.05) == 55)
|
|
62
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.01) == 272)
|
|
58
|
+
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_buckets(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." );
|
|
59
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.2) == 14);
|
|
60
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.1) == 28);
|
|
61
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.05) == 55);
|
|
62
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_buckets(0.01) == 272);
|
|
63
63
|
|
|
64
64
|
// Check that the sketch get_epsilon acts inversely to suggest_num_buckets
|
|
65
|
-
uint8_t n_hashes = 3
|
|
66
|
-
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 14).get_relative_error() <= 0.2)
|
|
67
|
-
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 28).get_relative_error() <= 0.1)
|
|
68
|
-
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 55).get_relative_error() <= 0.05)
|
|
69
|
-
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 272).get_relative_error() <= 0.01)
|
|
65
|
+
uint8_t n_hashes = 3;
|
|
66
|
+
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 14).get_relative_error() <= 0.2);
|
|
67
|
+
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 28).get_relative_error() <= 0.1);
|
|
68
|
+
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 55).get_relative_error() <= 0.05);
|
|
69
|
+
REQUIRE(count_min_sketch<uint64_t>(n_hashes, 272).get_relative_error() <= 0.01);
|
|
70
70
|
|
|
71
71
|
// Hash suggestions
|
|
72
|
-
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." )
|
|
73
|
-
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." )
|
|
74
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.682689492) == 2)
|
|
75
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.954499736) == 4)
|
|
76
|
-
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.997300204) == 6)
|
|
72
|
+
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(10.0), "Confidence must be between 0 and 1.0 (inclusive)." );
|
|
73
|
+
REQUIRE_THROWS(count_min_sketch<uint64_t>::suggest_num_hashes(-1.0), "Confidence must be between 0 and 1.0 (inclusive)." );
|
|
74
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.682689492) == 2); // 1 STDDEV
|
|
75
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.954499736) == 4); // 2 STDDEV
|
|
76
|
+
REQUIRE(count_min_sketch<uint64_t>::suggest_num_hashes(0.997300204) == 6); // 3 STDDEV
|
|
77
77
|
}
|
|
78
78
|
|
|
79
|
-
TEST_CASE("CM one update: uint64_t"){
|
|
80
|
-
uint8_t n_hashes = 3
|
|
81
|
-
uint32_t n_buckets = 5
|
|
82
|
-
uint64_t seed =
|
|
83
|
-
uint64_t inserted_weight = 0
|
|
84
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed)
|
|
85
|
-
std::string x = "x"
|
|
86
|
-
|
|
87
|
-
REQUIRE(c.is_empty())
|
|
88
|
-
REQUIRE(c.get_estimate("x") == 0)
|
|
89
|
-
c.update(x)
|
|
90
|
-
REQUIRE(!c.is_empty())
|
|
91
|
-
REQUIRE(c.get_estimate(x) == 1)
|
|
92
|
-
inserted_weight += 1
|
|
93
|
-
|
|
94
|
-
uint64_t w = 9
|
|
95
|
-
inserted_weight += w
|
|
96
|
-
c.update(x, w)
|
|
97
|
-
REQUIRE(c.get_estimate(x) == inserted_weight)
|
|
79
|
+
TEST_CASE("CM one update: uint64_t") {
|
|
80
|
+
uint8_t n_hashes = 3;
|
|
81
|
+
uint32_t n_buckets = 5;
|
|
82
|
+
uint64_t seed = 9223372036854775807; //1234567;
|
|
83
|
+
uint64_t inserted_weight = 0;
|
|
84
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets, seed);
|
|
85
|
+
std::string x = "x";
|
|
86
|
+
|
|
87
|
+
REQUIRE(c.is_empty());
|
|
88
|
+
REQUIRE(c.get_estimate("x") == 0); // No items in sketch so estimates should be zero
|
|
89
|
+
c.update(x);
|
|
90
|
+
REQUIRE(!c.is_empty());
|
|
91
|
+
REQUIRE(c.get_estimate(x) == 1);
|
|
92
|
+
inserted_weight += 1;
|
|
93
|
+
|
|
94
|
+
uint64_t w = 9;
|
|
95
|
+
inserted_weight += w;
|
|
96
|
+
c.update(x, w);
|
|
97
|
+
REQUIRE(c.get_estimate(x) == inserted_weight);
|
|
98
98
|
|
|
99
99
|
// Doubles are converted to uint64_t
|
|
100
|
-
double w1 = 10.0
|
|
101
|
-
inserted_weight += w1
|
|
102
|
-
c.update(x, w1)
|
|
103
|
-
REQUIRE(c.get_estimate(x) == inserted_weight)
|
|
104
|
-
REQUIRE(c.get_total_weight() == inserted_weight)
|
|
105
|
-
REQUIRE(c.get_estimate(x) <= c.get_upper_bound(x))
|
|
106
|
-
REQUIRE(c.get_estimate(x) >= c.get_lower_bound(x))
|
|
100
|
+
double w1 = 10.0;
|
|
101
|
+
inserted_weight += static_cast<uint64_t>(w1);
|
|
102
|
+
c.update(x, static_cast<uint64_t>(w1));
|
|
103
|
+
REQUIRE(c.get_estimate(x) == inserted_weight);
|
|
104
|
+
REQUIRE(c.get_total_weight() == inserted_weight);
|
|
105
|
+
REQUIRE(c.get_estimate(x) <= c.get_upper_bound(x));
|
|
106
|
+
REQUIRE(c.get_estimate(x) >= c.get_lower_bound(x));
|
|
107
107
|
}
|
|
108
108
|
|
|
109
|
-
TEST_CASE("CM frequency cancellation"){
|
|
110
|
-
count_min_sketch<int64_t> c(1, 5)
|
|
111
|
-
c.update("x")
|
|
112
|
-
c.update("y", -1)
|
|
113
|
-
REQUIRE(c.get_total_weight() == 2)
|
|
114
|
-
REQUIRE(c.get_estimate("x") == 1)
|
|
115
|
-
REQUIRE(c.get_estimate("y") == -1)
|
|
109
|
+
TEST_CASE("CM frequency cancellation") {
|
|
110
|
+
count_min_sketch<int64_t> c(1, 5);
|
|
111
|
+
c.update("x");
|
|
112
|
+
c.update("y", -1);
|
|
113
|
+
REQUIRE(c.get_total_weight() == 2);
|
|
114
|
+
REQUIRE(c.get_estimate("x") == 1);
|
|
115
|
+
REQUIRE(c.get_estimate("y") == -1);
|
|
116
116
|
}
|
|
117
117
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
std::vector<uint64_t>
|
|
122
|
-
std::vector<uint64_t> frequencies(number_of_items) ;
|
|
118
|
+
TEST_CASE("CM frequency estimates") {
|
|
119
|
+
int number_of_items = 10;
|
|
120
|
+
std::vector<uint64_t> data(number_of_items);
|
|
121
|
+
std::vector<uint64_t> frequencies(number_of_items);
|
|
123
122
|
|
|
124
123
|
// Populate data vector
|
|
125
|
-
for(int i=0; i < number_of_items; i
|
|
124
|
+
for (int i = 0; i < number_of_items; ++i) {
|
|
126
125
|
data[i] = i;
|
|
127
|
-
frequencies[i] =
|
|
126
|
+
frequencies[i] = 1ULL << (number_of_items - i);
|
|
128
127
|
}
|
|
129
128
|
|
|
130
|
-
double relative_error = 0.1
|
|
131
|
-
double confidence = 0.99
|
|
132
|
-
|
|
133
|
-
|
|
129
|
+
double relative_error = 0.1;
|
|
130
|
+
double confidence = 0.99;
|
|
131
|
+
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error);
|
|
132
|
+
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence);
|
|
134
133
|
|
|
135
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
|
136
|
-
for(int i=0
|
|
137
|
-
uint64_t value = data[i]
|
|
138
|
-
uint64_t freq = frequencies[i]
|
|
139
|
-
c.update(value, freq)
|
|
134
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
|
135
|
+
for (int i = 0; i < number_of_items; ++i) {
|
|
136
|
+
uint64_t value = data[i];
|
|
137
|
+
uint64_t freq = frequencies[i];
|
|
138
|
+
c.update(value, freq);
|
|
140
139
|
}
|
|
141
140
|
|
|
142
|
-
for(const auto i: data){
|
|
143
|
-
uint64_t est = c.get_estimate(i)
|
|
144
|
-
uint64_t upp = c.get_upper_bound(i)
|
|
145
|
-
uint64_t low = c.get_lower_bound(i)
|
|
146
|
-
REQUIRE(est <= upp)
|
|
147
|
-
REQUIRE(est >= low)
|
|
141
|
+
for (const auto i: data) {
|
|
142
|
+
uint64_t est = c.get_estimate(i);
|
|
143
|
+
uint64_t upp = c.get_upper_bound(i);
|
|
144
|
+
uint64_t low = c.get_lower_bound(i);
|
|
145
|
+
REQUIRE(est <= upp);
|
|
146
|
+
REQUIRE(est >= low);
|
|
148
147
|
}
|
|
149
148
|
}
|
|
150
149
|
|
|
151
|
-
TEST_CASE("CM merge - reject", "[reject cases]"){
|
|
152
|
-
double relative_error = 0.25
|
|
153
|
-
double confidence = 0.9
|
|
154
|
-
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error)
|
|
155
|
-
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence)
|
|
156
|
-
count_min_sketch<uint64_t> s(n_hashes, n_buckets, 9082435234709287)
|
|
157
|
-
|
|
150
|
+
TEST_CASE("CM merge - reject", "[reject cases]") {
|
|
151
|
+
double relative_error = 0.25;
|
|
152
|
+
double confidence = 0.9;
|
|
153
|
+
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error);
|
|
154
|
+
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence);
|
|
155
|
+
count_min_sketch<uint64_t> s(n_hashes, n_buckets, 9082435234709287);
|
|
158
156
|
|
|
159
157
|
// Generate sketches that we cannot merge into ie they disagree on at least one of the config entries
|
|
160
|
-
count_min_sketch<uint64_t> s1(n_hashes+1, n_buckets)
|
|
161
|
-
count_min_sketch<uint64_t> s2(n_hashes, n_buckets+1)
|
|
162
|
-
count_min_sketch<uint64_t> s3(n_hashes, n_buckets, 1)
|
|
158
|
+
count_min_sketch<uint64_t> s1(n_hashes+1, n_buckets); // incorrect number of hashes
|
|
159
|
+
count_min_sketch<uint64_t> s2(n_hashes, n_buckets + 1); // incorrect number of buckets
|
|
160
|
+
count_min_sketch<uint64_t> s3(n_hashes, n_buckets, 1); // incorrect seed
|
|
163
161
|
std::vector<count_min_sketch<uint64_t>> sketches = {s1, s2, s3};
|
|
164
162
|
|
|
165
163
|
// Fail cases
|
|
166
|
-
REQUIRE_THROWS(s.merge(s), "Cannot merge a sketch with itself." )
|
|
167
|
-
for(count_min_sketch<uint64_t> sk : sketches){
|
|
168
|
-
REQUIRE_THROWS(s.merge(sk), "Incompatible sketch config." )
|
|
164
|
+
REQUIRE_THROWS(s.merge(s), "Cannot merge a sketch with itself." );
|
|
165
|
+
for (count_min_sketch<uint64_t> sk : sketches) {
|
|
166
|
+
REQUIRE_THROWS(s.merge(sk), "Incompatible sketch config." );
|
|
169
167
|
}
|
|
170
168
|
}
|
|
171
169
|
|
|
172
|
-
TEST_CASE("CM merge - pass", "[acceptable cases]"){
|
|
173
|
-
double relative_error = 0.25
|
|
174
|
-
double confidence = 0.9
|
|
175
|
-
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error)
|
|
176
|
-
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence)
|
|
177
|
-
count_min_sketch<uint64_t> s(n_hashes, n_buckets)
|
|
178
|
-
uint8_t s_hashes = s.get_num_hashes()
|
|
179
|
-
uint32_t s_buckets = s.get_num_buckets()
|
|
180
|
-
count_min_sketch<uint64_t> t(s_hashes, s_buckets)
|
|
170
|
+
TEST_CASE("CM merge - pass", "[acceptable cases]") {
|
|
171
|
+
double relative_error = 0.25;
|
|
172
|
+
double confidence = 0.9;
|
|
173
|
+
uint32_t n_buckets = count_min_sketch<uint64_t>::suggest_num_buckets(relative_error);
|
|
174
|
+
uint8_t n_hashes = count_min_sketch<uint64_t>::suggest_num_hashes(confidence);
|
|
175
|
+
count_min_sketch<uint64_t> s(n_hashes, n_buckets);
|
|
176
|
+
uint8_t s_hashes = s.get_num_hashes();
|
|
177
|
+
uint32_t s_buckets = s.get_num_buckets();
|
|
178
|
+
count_min_sketch<uint64_t> t(s_hashes, s_buckets);
|
|
181
179
|
|
|
182
180
|
// Merge in an all-zeros sketch t. Should not change the total weight.
|
|
183
|
-
s.merge(t)
|
|
184
|
-
REQUIRE(s.get_total_weight() == 0 )
|
|
181
|
+
s.merge(t);
|
|
182
|
+
REQUIRE(s.get_total_weight() == 0 );
|
|
185
183
|
|
|
186
184
|
std::vector<uint64_t> data = {2,3,5,7};
|
|
187
|
-
for(auto d: data){
|
|
188
|
-
s.update(d)
|
|
189
|
-
t.update(d)
|
|
185
|
+
for (auto d: data) {
|
|
186
|
+
s.update(d);
|
|
187
|
+
t.update(d);
|
|
190
188
|
}
|
|
191
189
|
s.merge(t);
|
|
192
190
|
|
|
193
|
-
REQUIRE(s.get_total_weight() == 2*t.get_total_weight());
|
|
191
|
+
REQUIRE(s.get_total_weight() == 2 * t.get_total_weight());
|
|
194
192
|
|
|
195
193
|
// Estimator checks.
|
|
196
|
-
for (auto x
|
|
197
|
-
REQUIRE(s.get_estimate(x) <= s.get_upper_bound(x))
|
|
194
|
+
for (auto x: data) {
|
|
195
|
+
REQUIRE(s.get_estimate(x) <= s.get_upper_bound(x));
|
|
198
196
|
REQUIRE(s.get_estimate(x) <= 2); // True frequency x == 2 for all x.
|
|
199
197
|
}
|
|
200
198
|
}
|
|
201
199
|
|
|
202
|
-
TEST_CASE("CountMin sketch: serialize-deserialize empty", "[cm_sketch]"){
|
|
203
|
-
uint8_t n_hashes = 1
|
|
204
|
-
uint32_t n_buckets = 5
|
|
200
|
+
TEST_CASE("CountMin sketch: serialize-deserialize empty", "[cm_sketch]") {
|
|
201
|
+
uint8_t n_hashes = 1;
|
|
202
|
+
uint32_t n_buckets = 5;
|
|
205
203
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
206
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
|
204
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
|
207
205
|
c.serialize(s);
|
|
208
|
-
count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED)
|
|
209
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
|
210
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
|
211
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
|
206
|
+
count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED);
|
|
207
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
|
208
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
|
209
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
|
212
210
|
uint64_t zero = 0;
|
|
213
|
-
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero))
|
|
214
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
|
211
|
+
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
|
|
212
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
|
215
213
|
|
|
216
214
|
// Check that all entries are equal and 0
|
|
217
|
-
for(auto di: d){
|
|
218
|
-
REQUIRE(di == 0)
|
|
215
|
+
for (auto di: d) {
|
|
216
|
+
REQUIRE(di == 0);
|
|
219
217
|
}
|
|
220
218
|
std::ofstream os("count_min-empty.bin");
|
|
221
219
|
c.serialize(os);
|
|
222
220
|
}
|
|
223
221
|
|
|
224
|
-
TEST_CASE("CountMin sketch: serialize-deserialize non-empty", "[cm_sketch]"){
|
|
225
|
-
uint8_t n_hashes = 3
|
|
226
|
-
uint32_t n_buckets = 1024
|
|
222
|
+
TEST_CASE("CountMin sketch: serialize-deserialize non-empty", "[cm_sketch]") {
|
|
223
|
+
uint8_t n_hashes = 3;
|
|
224
|
+
uint32_t n_buckets = 1024;
|
|
227
225
|
std::stringstream s(std::ios::in | std::ios::out | std::ios::binary);
|
|
228
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
|
229
|
-
for(uint64_t i=0
|
|
226
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
|
227
|
+
for (uint64_t i = 0; i < 10; ++i) c.update(i, 10 * i * i);
|
|
230
228
|
c.serialize(s);
|
|
231
|
-
count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED)
|
|
232
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
|
233
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
|
234
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
|
235
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
|
236
|
-
for(uint64_t i=0
|
|
237
|
-
REQUIRE(c.get_estimate(i) == d.get_estimate(i))
|
|
229
|
+
count_min_sketch<uint64_t> d = count_min_sketch<uint64_t>::deserialize(s, DEFAULT_SEED);
|
|
230
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
|
231
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
|
232
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
|
233
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
|
234
|
+
for (uint64_t i = 0; i < 10; ++i) {
|
|
235
|
+
REQUIRE(c.get_estimate(i) == d.get_estimate(i));
|
|
238
236
|
}
|
|
239
237
|
|
|
240
|
-
auto c_it = c.begin()
|
|
241
|
-
auto d_it = d.begin()
|
|
242
|
-
while(c_it != c.end()){
|
|
243
|
-
REQUIRE(*c_it == *d_it)
|
|
244
|
-
++c_it
|
|
245
|
-
++d_it
|
|
238
|
+
auto c_it = c.begin();
|
|
239
|
+
auto d_it = d.begin();
|
|
240
|
+
while (c_it != c.end()) {
|
|
241
|
+
REQUIRE(*c_it == *d_it);
|
|
242
|
+
++c_it;
|
|
243
|
+
++d_it;
|
|
246
244
|
}
|
|
247
245
|
|
|
248
246
|
std::ofstream os("count_min-non-empty.bin");
|
|
249
247
|
c.serialize(os);
|
|
250
248
|
}
|
|
251
249
|
|
|
252
|
-
TEST_CASE("CountMin sketch: bytes serialize-deserialize empty", "[cm_sketch]"){
|
|
253
|
-
uint8_t n_hashes = 3
|
|
254
|
-
uint32_t n_buckets = 32
|
|
255
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
|
256
|
-
auto bytes = c.serialize()
|
|
250
|
+
TEST_CASE("CountMin sketch: bytes serialize-deserialize empty", "[cm_sketch]") {
|
|
251
|
+
uint8_t n_hashes = 3;
|
|
252
|
+
uint32_t n_buckets = 32;
|
|
253
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
|
254
|
+
auto bytes = c.serialize();
|
|
257
255
|
|
|
258
256
|
REQUIRE_THROWS_AS(count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1), std::invalid_argument);
|
|
259
|
-
auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED)
|
|
260
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
|
261
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
|
262
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
|
257
|
+
auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED);
|
|
258
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
|
259
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
|
260
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
|
263
261
|
uint64_t zero = 0;
|
|
264
|
-
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero))
|
|
265
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
|
262
|
+
REQUIRE(c.get_estimate(zero) == d.get_estimate(zero));
|
|
263
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
|
266
264
|
|
|
267
265
|
// Check that all entries are equal and 0
|
|
268
|
-
for(auto di: d){
|
|
269
|
-
REQUIRE(di == 0)
|
|
266
|
+
for (auto di: d) {
|
|
267
|
+
REQUIRE(di == 0);
|
|
270
268
|
}
|
|
271
269
|
}
|
|
272
270
|
|
|
273
271
|
|
|
274
|
-
TEST_CASE("CountMin sketch: bytes serialize-deserialize non-empty", "[cm_sketch]"){
|
|
275
|
-
uint8_t n_hashes = 5
|
|
276
|
-
uint32_t n_buckets = 64
|
|
277
|
-
count_min_sketch<uint64_t> c(n_hashes, n_buckets)
|
|
278
|
-
for(uint64_t i=0
|
|
272
|
+
TEST_CASE("CountMin sketch: bytes serialize-deserialize non-empty", "[cm_sketch]") {
|
|
273
|
+
uint8_t n_hashes = 5;
|
|
274
|
+
uint32_t n_buckets = 64;
|
|
275
|
+
count_min_sketch<uint64_t> c(n_hashes, n_buckets);
|
|
276
|
+
for(uint64_t i=0; i < 10; ++i) c.update(i,10*i*i);
|
|
279
277
|
|
|
280
|
-
auto bytes = c.serialize()
|
|
278
|
+
auto bytes = c.serialize();
|
|
281
279
|
REQUIRE_THROWS_AS(count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED-1), std::invalid_argument);
|
|
282
|
-
auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED)
|
|
280
|
+
auto d = count_min_sketch<uint64_t>::deserialize(bytes.data(), bytes.size(), DEFAULT_SEED);
|
|
283
281
|
|
|
284
|
-
REQUIRE(c.get_num_hashes() == d.get_num_hashes())
|
|
285
|
-
REQUIRE(c.get_num_buckets() == d.get_num_buckets())
|
|
286
|
-
REQUIRE(c.get_seed() == d.get_seed())
|
|
287
|
-
REQUIRE(c.get_total_weight() == d.get_total_weight())
|
|
282
|
+
REQUIRE(c.get_num_hashes() == d.get_num_hashes());
|
|
283
|
+
REQUIRE(c.get_num_buckets() == d.get_num_buckets());
|
|
284
|
+
REQUIRE(c.get_seed() == d.get_seed());
|
|
285
|
+
REQUIRE(c.get_total_weight() == d.get_total_weight());
|
|
288
286
|
|
|
289
287
|
// Check that all entries are equal
|
|
290
|
-
auto c_it = c.begin()
|
|
291
|
-
auto d_it = d.begin()
|
|
292
|
-
while(c_it != c.end()){
|
|
293
|
-
REQUIRE(*c_it == *d_it)
|
|
294
|
-
++c_it
|
|
295
|
-
++d_it
|
|
288
|
+
auto c_it = c.begin();
|
|
289
|
+
auto d_it = d.begin();
|
|
290
|
+
while (c_it != c.end()) {
|
|
291
|
+
REQUIRE(*c_it == *d_it);
|
|
292
|
+
++c_it;
|
|
293
|
+
++d_it;
|
|
296
294
|
}
|
|
297
295
|
|
|
298
296
|
// Check that the estimates agree
|
|
299
|
-
for(uint64_t i=0
|
|
300
|
-
REQUIRE(c.get_estimate(i) == d.get_estimate(i))
|
|
297
|
+
for (uint64_t i = 0; i < 10; ++i) {
|
|
298
|
+
REQUIRE(c.get_estimate(i) == d.get_estimate(i));
|
|
301
299
|
}
|
|
302
300
|
|
|
303
301
|
}
|
|
304
302
|
|
|
305
303
|
} /* namespace datasketches */
|
|
306
|
-
|