datasketches 0.3.2 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/NOTICE +1 -1
- data/README.md +0 -2
- data/ext/datasketches/cpc_wrapper.cpp +2 -2
- data/ext/datasketches/kll_wrapper.cpp +0 -10
- data/lib/datasketches/version.rb +1 -1
- data/lib/datasketches.rb +1 -1
- data/vendor/datasketches-cpp/CMakeLists.txt +1 -0
- data/vendor/datasketches-cpp/CODE_OF_CONDUCT.md +3 -0
- data/vendor/datasketches-cpp/CONTRIBUTING.md +50 -0
- data/vendor/datasketches-cpp/Doxyfile +2827 -0
- data/vendor/datasketches-cpp/LICENSE +0 -76
- data/vendor/datasketches-cpp/NOTICE +1 -1
- data/vendor/datasketches-cpp/README.md +1 -3
- data/vendor/datasketches-cpp/common/CMakeLists.txt +12 -11
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +11 -8
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +0 -2
- data/vendor/datasketches-cpp/common/include/kolmogorov_smirnov.hpp +9 -6
- data/vendor/datasketches-cpp/common/include/optional.hpp +148 -0
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view.hpp +95 -2
- data/vendor/datasketches-cpp/common/include/quantiles_sorted_view_impl.hpp +1 -1
- data/vendor/datasketches-cpp/common/include/serde.hpp +69 -20
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +1 -1
- data/vendor/datasketches-cpp/common/test/optional_test.cpp +85 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +14 -14
- data/vendor/datasketches-cpp/count/include/count_min.hpp +132 -78
- data/vendor/datasketches-cpp/count/include/count_min_impl.hpp +132 -152
- data/vendor/datasketches-cpp/count/test/CMakeLists.txt +11 -12
- data/vendor/datasketches-cpp/count/test/count_min_allocation_test.cpp +61 -61
- data/vendor/datasketches-cpp/count/test/count_min_test.cpp +175 -178
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +14 -20
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +7 -4
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +17 -17
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +40 -40
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +13 -10
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +35 -11
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +8 -8
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +3 -2
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +5 -5
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +20 -7
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_deserialize_from_java_test.cpp +60 -0
- data/vendor/datasketches-cpp/{python/include/py_object_lt.hpp → cpc/test/cpc_sketch_serialize_for_java.cpp} +15 -14
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +4 -29
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +4 -4
- data/vendor/datasketches-cpp/density/include/density_sketch.hpp +29 -9
- data/vendor/datasketches-cpp/density/include/density_sketch_impl.hpp +1 -1
- data/vendor/datasketches-cpp/density/test/CMakeLists.txt +0 -1
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +21 -9
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +6 -4
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_deserialize_from_java_test.cpp +95 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_serialize_for_java.cpp +83 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +3 -42
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +2 -2
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +3 -3
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +5 -3
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +4 -4
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +3 -1
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +0 -12
- data/vendor/datasketches-cpp/hll/include/hll.hpp +70 -57
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +0 -68
- data/vendor/datasketches-cpp/hll/test/hll_sketch_deserialize_from_java_test.cpp +69 -0
- data/vendor/datasketches-cpp/hll/test/hll_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +2 -2
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +71 -50
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +59 -130
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/kll/test/kll_sketch_deserialize_from_java_test.cpp +103 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +3 -38
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch.hpp +68 -51
- data/vendor/datasketches-cpp/quantiles/include/quantiles_sketch_impl.hpp +62 -132
- data/vendor/datasketches-cpp/quantiles/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_deserialize_from_java_test.cpp +84 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_serialize_for_java.cpp +52 -0
- data/vendor/datasketches-cpp/quantiles/test/quantiles_sketch_test.cpp +14 -38
- data/vendor/datasketches-cpp/req/include/req_common.hpp +7 -3
- data/vendor/datasketches-cpp/req/include/req_compactor_impl.hpp +2 -2
- data/vendor/datasketches-cpp/req/include/req_sketch.hpp +97 -23
- data/vendor/datasketches-cpp/req/include/req_sketch_impl.hpp +48 -109
- data/vendor/datasketches-cpp/req/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/req/test/req_sketch_deserialize_from_java_test.cpp +55 -0
- data/vendor/datasketches-cpp/{tuple/include/array_of_doubles_intersection_impl.hpp → req/test/req_sketch_serialize_for_java.cpp} +12 -7
- data/vendor/datasketches-cpp/req/test/req_sketch_test.cpp +3 -89
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +4 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample.hpp +210 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sample_impl.hpp +535 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch.hpp +281 -0
- data/vendor/datasketches-cpp/sampling/include/ebpps_sketch_impl.hpp +531 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +69 -26
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +3 -3
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +10 -11
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +4 -4
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +55 -8
- data/vendor/datasketches-cpp/sampling/test/ebpps_allocation_test.cpp +96 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sample_test.cpp +137 -0
- data/vendor/datasketches-cpp/sampling/test/ebpps_sketch_test.cpp +266 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_deserialize_from_java_test.cpp +81 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_serialize_for_java.cpp +54 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +0 -37
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_deserialize_from_java_test.cpp +50 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_serialize_for_java.cpp +56 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +0 -18
- data/vendor/datasketches-cpp/theta/include/bit_packing.hpp +2608 -2608
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_sampled_sets.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/bounds_on_ratios_in_theta_sketched_sets.hpp +7 -6
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +20 -5
- data/vendor/datasketches-cpp/theta/include/theta_constants.hpp +10 -4
- data/vendor/datasketches-cpp/theta/include/theta_helpers.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +13 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_base_impl.hpp +5 -5
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/theta/include/theta_jaccard_similarity_base.hpp +1 -0
- data/vendor/datasketches-cpp/theta/include/theta_set_difference_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +126 -27
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +8 -8
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +17 -10
- data/vendor/datasketches-cpp/theta/include/theta_union_base_impl.hpp +1 -1
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +3 -3
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base.hpp +5 -2
- data/vendor/datasketches-cpp/theta/include/theta_update_sketch_base_impl.hpp +11 -1
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +14 -1
- data/vendor/datasketches-cpp/theta/test/theta_sketch_deserialize_from_java_test.cpp +57 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_serialize_for_java.cpp +61 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +0 -188
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +8 -7
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +19 -144
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b.hpp → array_tuple_a_not_b.hpp} +24 -16
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_a_not_b_impl.hpp → array_tuple_a_not_b_impl.hpp} +4 -4
- data/vendor/datasketches-cpp/tuple/include/array_tuple_intersection.hpp +65 -0
- data/vendor/datasketches-cpp/{python/include/py_object_ostream.hpp → tuple/include/array_tuple_intersection_impl.hpp} +7 -24
- data/vendor/datasketches-cpp/tuple/include/array_tuple_sketch.hpp +237 -0
- data/vendor/datasketches-cpp/tuple/include/{array_of_doubles_sketch_impl.hpp → array_tuple_sketch_impl.hpp} +40 -41
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_tuple_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +11 -2
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +17 -10
- data/vendor/datasketches-cpp/tuple/include/tuple_jaccard_similarity.hpp +2 -1
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +95 -32
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +19 -11
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +16 -1
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_deserialize_from_java_test.cpp +76 -0
- data/vendor/datasketches-cpp/tuple/test/aod_sketch_serialize_for_java.cpp +62 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +5 -129
- data/vendor/datasketches-cpp/tuple/test/engagement_test.cpp +85 -89
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +3 -1
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_deserialize_from_java_test.cpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_serialize_for_java.cpp +38 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +1 -1
- data/vendor/datasketches-cpp/version.cfg.in +1 -1
- metadata +47 -93
- data/vendor/datasketches-cpp/MANIFEST.in +0 -39
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/pyproject.toml +0 -23
- data/vendor/datasketches-cpp/python/CMakeLists.txt +0 -87
- data/vendor/datasketches-cpp/python/README.md +0 -85
- data/vendor/datasketches-cpp/python/datasketches/DensityWrapper.py +0 -87
- data/vendor/datasketches-cpp/python/datasketches/KernelFunction.py +0 -35
- data/vendor/datasketches-cpp/python/datasketches/PySerDe.py +0 -110
- data/vendor/datasketches-cpp/python/datasketches/TuplePolicy.py +0 -77
- data/vendor/datasketches-cpp/python/datasketches/TupleWrapper.py +0 -205
- data/vendor/datasketches-cpp/python/datasketches/__init__.py +0 -38
- data/vendor/datasketches-cpp/python/include/kernel_function.hpp +0 -98
- data/vendor/datasketches-cpp/python/include/py_serde.hpp +0 -113
- data/vendor/datasketches-cpp/python/include/quantile_conditional.hpp +0 -104
- data/vendor/datasketches-cpp/python/include/tuple_policy.hpp +0 -136
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +0 -345
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +0 -354
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +0 -346
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +0 -463
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +0 -403
- data/vendor/datasketches-cpp/python/pybind11Path.cmd +0 -21
- data/vendor/datasketches-cpp/python/src/__init__.py +0 -18
- data/vendor/datasketches-cpp/python/src/count_wrapper.cpp +0 -101
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +0 -76
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +0 -58
- data/vendor/datasketches-cpp/python/src/density_wrapper.cpp +0 -95
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +0 -182
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +0 -126
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +0 -158
- data/vendor/datasketches-cpp/python/src/ks_wrapper.cpp +0 -68
- data/vendor/datasketches-cpp/python/src/py_serde.cpp +0 -112
- data/vendor/datasketches-cpp/python/src/quantiles_wrapper.cpp +0 -155
- data/vendor/datasketches-cpp/python/src/req_wrapper.cpp +0 -154
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +0 -166
- data/vendor/datasketches-cpp/python/src/tuple_wrapper.cpp +0 -215
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +0 -490
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +0 -173
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -16
- data/vendor/datasketches-cpp/python/tests/count_min_test.py +0 -86
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +0 -64
- data/vendor/datasketches-cpp/python/tests/density_test.py +0 -93
- data/vendor/datasketches-cpp/python/tests/fi_test.py +0 -149
- data/vendor/datasketches-cpp/python/tests/hll_test.py +0 -129
- data/vendor/datasketches-cpp/python/tests/kll_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/quantiles_test.py +0 -160
- data/vendor/datasketches-cpp/python/tests/req_test.py +0 -159
- data/vendor/datasketches-cpp/python/tests/theta_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/tuple_test.py +0 -206
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +0 -148
- data/vendor/datasketches-cpp/python/tests/vo_test.py +0 -132
- data/vendor/datasketches-cpp/req/test/req_float_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_raw_items_from_java.sk +0 -0
- data/vendor/datasketches-cpp/req/test/req_float_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +0 -67
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +0 -110
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tox.ini +0 -26
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +0 -52
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +0 -81
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +0 -43
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +0 -1
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +0 -1
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch2/catch.hpp>
|
|
21
|
+
#include <fstream>
|
|
22
|
+
#include <frequent_items_sketch.hpp>
|
|
23
|
+
|
|
24
|
+
namespace datasketches {
|
|
25
|
+
|
|
26
|
+
TEST_CASE("frequent longs sketch generate", "[serialize_for_java]") {
|
|
27
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
28
|
+
for (const unsigned n: n_arr) {
|
|
29
|
+
frequent_items_sketch<long> sketch(6);
|
|
30
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(i);
|
|
31
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
32
|
+
if (n > 10) {
|
|
33
|
+
REQUIRE(sketch.get_maximum_error() > 0);
|
|
34
|
+
} else {
|
|
35
|
+
REQUIRE(sketch.get_maximum_error() == 0);
|
|
36
|
+
}
|
|
37
|
+
REQUIRE(sketch.get_total_weight() == n);
|
|
38
|
+
std::ofstream os("frequent_long_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
|
39
|
+
sketch.serialize(os);
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
TEST_CASE("frequent strings sketch generate", "[serialize_for_java]") {
|
|
44
|
+
const unsigned n_arr[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000};
|
|
45
|
+
for (const unsigned n: n_arr) {
|
|
46
|
+
frequent_items_sketch<std::string> sketch(6);
|
|
47
|
+
for (unsigned i = 1; i <= n; ++i) sketch.update(std::to_string(i));
|
|
48
|
+
REQUIRE(sketch.is_empty() == (n == 0));
|
|
49
|
+
if (n > 10) {
|
|
50
|
+
REQUIRE(sketch.get_maximum_error() > 0);
|
|
51
|
+
} else {
|
|
52
|
+
REQUIRE(sketch.get_maximum_error() == 0);
|
|
53
|
+
}
|
|
54
|
+
REQUIRE(sketch.get_total_weight() == n);
|
|
55
|
+
std::ofstream os("frequent_string_n" + std::to_string(n) + "_cpp.sk", std::ios::binary);
|
|
56
|
+
sketch.serialize(os);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
TEST_CASE("frequent strings sketch ascii", "[serialize_for_java]") {
|
|
61
|
+
frequent_items_sketch<std::string> sketch(6);
|
|
62
|
+
sketch.update("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1);
|
|
63
|
+
sketch.update("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb", 2);
|
|
64
|
+
sketch.update("ccccccccccccccccccccccccccccc", 3);
|
|
65
|
+
sketch.update("ddddddddddddddddddddddddddddd", 4);
|
|
66
|
+
std::ofstream os("frequent_string_ascii_cpp.sk", std::ios::binary);
|
|
67
|
+
sketch.serialize(os);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
TEST_CASE("frequent strings sketch utf8", "[serialize_for_java]") {
|
|
71
|
+
frequent_items_sketch<std::string> sketch(6);
|
|
72
|
+
sketch.update("абвгд", 1);
|
|
73
|
+
sketch.update("еёжзи", 2);
|
|
74
|
+
sketch.update("йклмн", 3);
|
|
75
|
+
sketch.update("опрст", 4);
|
|
76
|
+
sketch.update("уфхцч", 5);
|
|
77
|
+
sketch.update("шщъыь", 6);
|
|
78
|
+
sketch.update("эюя", 7);
|
|
79
|
+
std::ofstream os("frequent_string_utf8_cpp.sk", std::ios::binary);
|
|
80
|
+
sketch.serialize(os);
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
} /* namespace datasketches */
|
|
@@ -70,6 +70,7 @@ TEST_CASE("frequent items: several items, no resize, no purge", "[frequent_items
|
|
|
70
70
|
REQUIRE(sketch.get_estimate("b") == 3);
|
|
71
71
|
REQUIRE(sketch.get_estimate("c") == 2);
|
|
72
72
|
REQUIRE(sketch.get_estimate("d") == 1);
|
|
73
|
+
REQUIRE(sketch.get_maximum_error() == 0);
|
|
73
74
|
}
|
|
74
75
|
|
|
75
76
|
TEST_CASE("frequent items: several items, with resize, no purge", "[frequent_items_sketch]") {
|
|
@@ -96,6 +97,7 @@ TEST_CASE("frequent items: several items, with resize, no purge", "[frequent_ite
|
|
|
96
97
|
REQUIRE(sketch.get_estimate("b") == 3);
|
|
97
98
|
REQUIRE(sketch.get_estimate("c") == 2);
|
|
98
99
|
REQUIRE(sketch.get_estimate("d") == 1);
|
|
100
|
+
REQUIRE(sketch.get_maximum_error() == 0);
|
|
99
101
|
}
|
|
100
102
|
|
|
101
103
|
TEST_CASE("frequent items: estimation mode", "[frequent_items_sketch]") {
|
|
@@ -149,6 +151,7 @@ TEST_CASE("frequent items: merge exact mode", "[frequent_items_sketch]") {
|
|
|
149
151
|
REQUIRE(sketch1.get_estimate(2) == 3);
|
|
150
152
|
REQUIRE(sketch1.get_estimate(3) == 2);
|
|
151
153
|
REQUIRE(sketch1.get_estimate(4) == 1);
|
|
154
|
+
REQUIRE(sketch1.get_maximum_error() == 0);
|
|
152
155
|
}
|
|
153
156
|
|
|
154
157
|
TEST_CASE("frequent items: merge estimation mode", "[frequent_items_sketch]") {
|
|
@@ -199,48 +202,6 @@ TEST_CASE("frequent items: merge estimation mode", "[frequent_items_sketch]") {
|
|
|
199
202
|
REQUIRE(9 <= items[1].get_estimate()); // always overestimated
|
|
200
203
|
}
|
|
201
204
|
|
|
202
|
-
TEST_CASE("frequent items: deserialize from java long", "[frequent_items_sketch]") {
|
|
203
|
-
std::ifstream is;
|
|
204
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
205
|
-
is.open(testBinaryInputPath + "longs_sketch_from_java.sk", std::ios::binary);
|
|
206
|
-
auto sketch = frequent_items_sketch<long long>::deserialize(is);
|
|
207
|
-
REQUIRE_FALSE(sketch.is_empty());
|
|
208
|
-
REQUIRE(sketch.get_total_weight() == 4);
|
|
209
|
-
REQUIRE(sketch.get_num_active_items() == 4);
|
|
210
|
-
REQUIRE(sketch.get_estimate(1) == 1);
|
|
211
|
-
REQUIRE(sketch.get_estimate(2) == 1);
|
|
212
|
-
REQUIRE(sketch.get_estimate(3) == 1);
|
|
213
|
-
REQUIRE(sketch.get_estimate(4) == 1);
|
|
214
|
-
}
|
|
215
|
-
|
|
216
|
-
TEST_CASE("frequent items: deserialize from java string", "[frequent_items_sketch]") {
|
|
217
|
-
std::ifstream is;
|
|
218
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
219
|
-
is.open(testBinaryInputPath + "items_sketch_string_from_java.sk", std::ios::binary);
|
|
220
|
-
auto sketch = frequent_items_sketch<std::string>::deserialize(is);
|
|
221
|
-
REQUIRE_FALSE(sketch.is_empty());
|
|
222
|
-
REQUIRE(sketch.get_total_weight() == 4);
|
|
223
|
-
REQUIRE(sketch.get_num_active_items() == 4);
|
|
224
|
-
REQUIRE(sketch.get_estimate("aaaaaaaaaaaaaaaaaaaaaaaaaaaaa") == 1);
|
|
225
|
-
REQUIRE(sketch.get_estimate("bbbbbbbbbbbbbbbbbbbbbbbbbbbbb") == 1);
|
|
226
|
-
REQUIRE(sketch.get_estimate("ccccccccccccccccccccccccccccc") == 1);
|
|
227
|
-
REQUIRE(sketch.get_estimate("ddddddddddddddddddddddddddddd") == 1);
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
TEST_CASE("frequent items: deserialize from java string, utf-8", "[frequent_items_sketch]") {
|
|
231
|
-
std::ifstream is;
|
|
232
|
-
is.exceptions(std::ios::failbit | std::ios::badbit);
|
|
233
|
-
is.open(testBinaryInputPath + "items_sketch_string_utf8_from_java.sk", std::ios::binary);
|
|
234
|
-
auto sketch = frequent_items_sketch<std::string>::deserialize(is);
|
|
235
|
-
REQUIRE_FALSE(sketch.is_empty());
|
|
236
|
-
REQUIRE(sketch.get_total_weight() == 10);
|
|
237
|
-
REQUIRE(sketch.get_num_active_items() == 4);
|
|
238
|
-
REQUIRE(sketch.get_estimate("абвгд") == 1);
|
|
239
|
-
REQUIRE(sketch.get_estimate("еёжзи") == 2);
|
|
240
|
-
REQUIRE(sketch.get_estimate("йклмн") == 3);
|
|
241
|
-
REQUIRE(sketch.get_estimate("опрст") == 4);
|
|
242
|
-
}
|
|
243
|
-
|
|
244
205
|
TEST_CASE("frequent items: deserialize long64 stream", "[frequent_items_sketch]") {
|
|
245
206
|
frequent_items_sketch<long long> sketch1(3);
|
|
246
207
|
sketch1.update(1, 1);
|
|
@@ -169,9 +169,9 @@ CouponList<A>* CouponList<A>::newList(std::istream& is, const A& allocator) {
|
|
|
169
169
|
}
|
|
170
170
|
|
|
171
171
|
template<typename A>
|
|
172
|
-
|
|
172
|
+
auto CouponList<A>::serialize(bool compact, unsigned header_size_bytes) const -> vector_bytes {
|
|
173
173
|
const size_t sketchSizeBytes = (compact ? getCompactSerializationBytes() : getUpdatableSerializationBytes()) + header_size_bytes;
|
|
174
|
-
|
|
174
|
+
vector_bytes byteArr(sketchSizeBytes, 0, getAllocator());
|
|
175
175
|
uint8_t* bytes = byteArr.data() + header_size_bytes;
|
|
176
176
|
|
|
177
177
|
bytes[hll_constants::PREAMBLE_INTS_BYTE] = static_cast<uint8_t>(getPreInts());
|
|
@@ -33,12 +33,14 @@ class HllSketchImplFactory;
|
|
|
33
33
|
template<typename A>
|
|
34
34
|
class CouponList : public HllSketchImpl<A> {
|
|
35
35
|
public:
|
|
36
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
|
37
|
+
|
|
36
38
|
CouponList(uint8_t lgConfigK, target_hll_type tgtHllType, hll_mode mode, const A& allocator);
|
|
37
39
|
CouponList(const CouponList& that, target_hll_type tgtHllType);
|
|
38
40
|
|
|
39
41
|
static CouponList* newList(const void* bytes, size_t len, const A& allocator);
|
|
40
42
|
static CouponList* newList(std::istream& is, const A& allocator);
|
|
41
|
-
virtual
|
|
43
|
+
virtual vector_bytes serialize(bool compact, unsigned header_size_bytes) const;
|
|
42
44
|
virtual void serialize(std::ostream& os, bool compact) const;
|
|
43
45
|
|
|
44
46
|
virtual ~CouponList() = default;
|
|
@@ -216,9 +216,9 @@ HllArray<A>* HllArray<A>::newHll(std::istream& is, const A& allocator) {
|
|
|
216
216
|
}
|
|
217
217
|
|
|
218
218
|
template<typename A>
|
|
219
|
-
|
|
219
|
+
auto HllArray<A>::serialize(bool compact, unsigned header_size_bytes) const -> vector_bytes {
|
|
220
220
|
const size_t sketchSizeBytes = (compact ? getCompactSerializationBytes() : getUpdatableSerializationBytes()) + header_size_bytes;
|
|
221
|
-
|
|
221
|
+
vector_bytes byteArr(sketchSizeBytes, 0, getAllocator());
|
|
222
222
|
uint8_t* bytes = byteArr.data() + header_size_bytes;
|
|
223
223
|
AuxHashMap<A>* auxHashMap = getAuxHashMap();
|
|
224
224
|
|
|
@@ -537,7 +537,7 @@ AuxHashMap<A>* HllArray<A>::getAuxHashMap() const {
|
|
|
537
537
|
}
|
|
538
538
|
|
|
539
539
|
template<typename A>
|
|
540
|
-
|
|
540
|
+
auto HllArray<A>::getHllArray() const -> const vector_bytes& {
|
|
541
541
|
return hllByteArr_;
|
|
542
542
|
}
|
|
543
543
|
|
|
@@ -31,13 +31,15 @@ class AuxHashMap;
|
|
|
31
31
|
template<typename A>
|
|
32
32
|
class HllArray : public HllSketchImpl<A> {
|
|
33
33
|
public:
|
|
34
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
|
35
|
+
|
|
34
36
|
HllArray(uint8_t lgConfigK, target_hll_type tgtHllType, bool startFullSize, const A& allocator);
|
|
35
37
|
explicit HllArray(const HllArray& other, target_hll_type tgtHllType);
|
|
36
38
|
|
|
37
39
|
static HllArray* newHll(const void* bytes, size_t len, const A& allocator);
|
|
38
40
|
static HllArray* newHll(std::istream& is, const A& allocator);
|
|
39
41
|
|
|
40
|
-
virtual
|
|
42
|
+
virtual vector_bytes serialize(bool compact, unsigned header_size_bytes) const;
|
|
41
43
|
virtual void serialize(std::ostream& os, bool compact) const;
|
|
42
44
|
|
|
43
45
|
virtual ~HllArray() = default;
|
|
@@ -97,7 +99,7 @@ class HllArray : public HllSketchImpl<A> {
|
|
|
97
99
|
|
|
98
100
|
virtual A getAllocator() const;
|
|
99
101
|
|
|
100
|
-
const
|
|
102
|
+
const vector_bytes& getHllArray() const;
|
|
101
103
|
|
|
102
104
|
protected:
|
|
103
105
|
void hipAndKxQIncrementalUpdate(uint8_t oldValue, uint8_t newValue);
|
|
@@ -107,7 +109,7 @@ class HllArray : public HllSketchImpl<A> {
|
|
|
107
109
|
double hipAccum_;
|
|
108
110
|
double kxq0_;
|
|
109
111
|
double kxq1_;
|
|
110
|
-
|
|
112
|
+
vector_bytes hllByteArr_; //init by sub-classes
|
|
111
113
|
uint8_t curMin_; //always zero for Hll6 and Hll8, only tracked by Hll4Array
|
|
112
114
|
uint32_t numAtCurMin_; //interpreted as num zeros when curMin == 0
|
|
113
115
|
bool oooFlag_; //Out-Of-Order Flag
|
|
@@ -94,14 +94,14 @@ hll_sketch_alloc<A>::hll_sketch_alloc(HllSketchImpl<A>* that) :
|
|
|
94
94
|
{}
|
|
95
95
|
|
|
96
96
|
template<typename A>
|
|
97
|
-
hll_sketch_alloc<A
|
|
97
|
+
hll_sketch_alloc<A>& hll_sketch_alloc<A>::operator=(const hll_sketch_alloc<A>& other) {
|
|
98
98
|
sketch_impl->get_deleter()(sketch_impl);
|
|
99
99
|
sketch_impl = other.sketch_impl->copy();
|
|
100
100
|
return *this;
|
|
101
101
|
}
|
|
102
102
|
|
|
103
103
|
template<typename A>
|
|
104
|
-
hll_sketch_alloc<A
|
|
104
|
+
hll_sketch_alloc<A>& hll_sketch_alloc<A>::operator=(hll_sketch_alloc<A>&& other) {
|
|
105
105
|
std::swap(sketch_impl, other.sketch_impl);
|
|
106
106
|
return *this;
|
|
107
107
|
}
|
|
@@ -232,12 +232,12 @@ void hll_sketch_alloc<A>::serialize_updatable(std::ostream& os) const {
|
|
|
232
232
|
}
|
|
233
233
|
|
|
234
234
|
template<typename A>
|
|
235
|
-
|
|
235
|
+
auto hll_sketch_alloc<A>::serialize_compact(unsigned header_size_bytes) const -> vector_bytes {
|
|
236
236
|
return sketch_impl->serialize(true, header_size_bytes);
|
|
237
237
|
}
|
|
238
238
|
|
|
239
239
|
template<typename A>
|
|
240
|
-
|
|
240
|
+
auto hll_sketch_alloc<A>::serialize_updatable() const -> vector_bytes {
|
|
241
241
|
return sketch_impl->serialize(false, 0);
|
|
242
242
|
}
|
|
243
243
|
|
|
@@ -30,11 +30,13 @@ namespace datasketches {
|
|
|
30
30
|
template<typename A>
|
|
31
31
|
class HllSketchImpl {
|
|
32
32
|
public:
|
|
33
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
|
34
|
+
|
|
33
35
|
HllSketchImpl(uint8_t lgConfigK, target_hll_type tgtHllType, hll_mode mode, bool startFullSize);
|
|
34
36
|
virtual ~HllSketchImpl();
|
|
35
37
|
|
|
36
38
|
virtual void serialize(std::ostream& os, bool compact) const = 0;
|
|
37
|
-
virtual
|
|
39
|
+
virtual vector_bytes serialize(bool compact, unsigned header_size_bytes) const = 0;
|
|
38
40
|
|
|
39
41
|
virtual HllSketchImpl* copy() const = 0;
|
|
40
42
|
virtual HllSketchImpl* copyAs(target_hll_type tgtHllType) const = 0;
|
|
@@ -124,8 +124,6 @@ public:
|
|
|
124
124
|
static uint32_t pair(uint32_t slotNo, uint8_t value);
|
|
125
125
|
static uint32_t getLow26(uint32_t coupon);
|
|
126
126
|
static uint8_t getValue(uint32_t coupon);
|
|
127
|
-
static double invPow2(uint8_t e);
|
|
128
|
-
static uint8_t ceilingPowerOf2(uint32_t n);
|
|
129
127
|
static uint8_t simpleIntLog2(uint32_t n); // n must be power of 2
|
|
130
128
|
static uint8_t computeLgArrInts(hll_mode mode, uint32_t count, uint8_t lgConfigK);
|
|
131
129
|
static double getRelErr(bool upperBound, bool unioned, uint8_t lgConfigK, uint8_t numStdDev);
|
|
@@ -204,16 +202,6 @@ inline uint8_t HllUtil<A>::getValue(uint32_t coupon) {
|
|
|
204
202
|
return coupon >> hll_constants::KEY_BITS_26;
|
|
205
203
|
}
|
|
206
204
|
|
|
207
|
-
template<typename A>
|
|
208
|
-
inline double HllUtil<A>::invPow2(uint8_t e) {
|
|
209
|
-
union {
|
|
210
|
-
long long longVal;
|
|
211
|
-
double doubleVal;
|
|
212
|
-
} conv;
|
|
213
|
-
conv.longVal = (1023L - e) << 52;
|
|
214
|
-
return conv.doubleVal;
|
|
215
|
-
}
|
|
216
|
-
|
|
217
205
|
template<typename A>
|
|
218
206
|
inline uint8_t HllUtil<A>::simpleIntLog2(uint32_t n) {
|
|
219
207
|
if (n == 0) {
|
|
@@ -30,40 +30,15 @@
|
|
|
30
30
|
|
|
31
31
|
namespace datasketches {
|
|
32
32
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
* trade-offs with accuracy, space and performance. These types are specified with the
|
|
42
|
-
* {@link TgtHllType} parameter.
|
|
43
|
-
*
|
|
44
|
-
* <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error
|
|
45
|
-
* distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
|
|
46
|
-
* The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>,
|
|
47
|
-
* where <i>K</i> is the number of buckets or slots for the sketch.
|
|
48
|
-
*
|
|
49
|
-
* <p>During warmup, when the sketch has only received a small number of unique items
|
|
50
|
-
* (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
|
|
51
|
-
* algorithms with significantly better accuracy.
|
|
52
|
-
*
|
|
53
|
-
* <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
|
|
54
|
-
* created by the user, the sketch will perform all of its updates and internal phase transitions
|
|
55
|
-
* in that object, which can actually reside either on-heap or off-heap based on how it is
|
|
56
|
-
* configured. In large systems that must update and merge many millions of sketches, having the
|
|
57
|
-
* sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
|
|
58
|
-
* to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
|
|
59
|
-
* delays.
|
|
60
|
-
*
|
|
61
|
-
* author Jon Malkin
|
|
62
|
-
* author Lee Rhodes
|
|
63
|
-
* author Kevin Lang
|
|
64
|
-
*/
|
|
33
|
+
// forward declarations
|
|
34
|
+
template<typename A> class hll_sketch_alloc;
|
|
35
|
+
template<typename A> class hll_union_alloc;
|
|
36
|
+
|
|
37
|
+
/// HLL sketch alias with default allocator
|
|
38
|
+
using hll_sketch = hll_sketch_alloc<std::allocator<uint8_t>>;
|
|
39
|
+
/// HLL union alias with default allocator
|
|
40
|
+
using hll_union = hll_union_alloc<std::allocator<uint8_t>>;
|
|
65
41
|
|
|
66
|
-
|
|
67
42
|
/**
|
|
68
43
|
* Specifies the target type of HLL sketch to be created. It is a target in that the actual
|
|
69
44
|
* allocation of the HLL array is deferred until a sufficient number of items have been received by
|
|
@@ -100,14 +75,41 @@ enum target_hll_type {
|
|
|
100
75
|
HLL_8 ///< 8 bits per entry (fastest, fixed size)
|
|
101
76
|
};
|
|
102
77
|
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
78
|
+
/**
|
|
79
|
+
* This is a high performance implementation of Philippe Flajolet's HLL sketch but with
|
|
80
|
+
* significantly improved error behavior. If the ONLY use case for sketching is counting
|
|
81
|
+
* uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for
|
|
82
|
+
* storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to
|
|
83
|
+
* 16 times smaller than the Theta sketch family for the same accuracy.
|
|
84
|
+
*
|
|
85
|
+
* <p>This implementation offers three different types of HLL sketch, each with different
|
|
86
|
+
* trade-offs with accuracy, space and performance. These types are specified with the
|
|
87
|
+
* {@link target_hll_type} parameter.
|
|
88
|
+
*
|
|
89
|
+
* <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error
|
|
90
|
+
* distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
|
|
91
|
+
* The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>,
|
|
92
|
+
* where <i>K</i> is the number of buckets or slots for the sketch.
|
|
93
|
+
*
|
|
94
|
+
* <p>During warmup, when the sketch has only received a small number of unique items
|
|
95
|
+
* (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
|
|
96
|
+
* algorithms with significantly better accuracy.
|
|
97
|
+
*
|
|
98
|
+
* <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
|
|
99
|
+
* created by the user, the sketch will perform all of its updates and internal phase transitions
|
|
100
|
+
* in that object, which can actually reside either on-heap or off-heap based on how it is
|
|
101
|
+
* configured. In large systems that must update and merge many millions of sketches, having the
|
|
102
|
+
* sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
|
|
103
|
+
* to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
|
|
104
|
+
* delays.
|
|
105
|
+
*
|
|
106
|
+
* author Jon Malkin
|
|
107
|
+
* author Lee Rhodes
|
|
108
|
+
* author Kevin Lang
|
|
109
|
+
*/
|
|
108
110
|
|
|
109
|
-
|
|
110
|
-
template<typename A>
|
|
111
|
+
// forward declaration
|
|
112
|
+
template<typename A> class HllSketchImpl;
|
|
111
113
|
|
|
112
114
|
template<typename A = std::allocator<uint8_t> >
|
|
113
115
|
class hll_sketch_alloc final {
|
|
@@ -119,27 +121,33 @@ class hll_sketch_alloc final {
|
|
|
119
121
|
* @param start_full_size Indicates whether to start in HLL mode,
|
|
120
122
|
* keeping memory use constant (if HLL_6 or HLL_8) at the cost of
|
|
121
123
|
* starting out using much more memory
|
|
124
|
+
* @param allocator instance of an Allocator
|
|
122
125
|
*/
|
|
123
126
|
explicit hll_sketch_alloc(uint8_t lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false, const A& allocator = A());
|
|
124
127
|
|
|
125
128
|
/**
|
|
126
129
|
* Copy constructor
|
|
130
|
+
* @param that sketch to be copied
|
|
127
131
|
*/
|
|
128
132
|
hll_sketch_alloc(const hll_sketch_alloc<A>& that);
|
|
129
133
|
|
|
130
134
|
/**
|
|
131
135
|
* Copy constructor to a new target type
|
|
136
|
+
* @param that sketch to be copied
|
|
137
|
+
* @param tgt_type target_hll_type
|
|
132
138
|
*/
|
|
133
139
|
hll_sketch_alloc(const hll_sketch_alloc<A>& that, target_hll_type tgt_type);
|
|
134
140
|
|
|
135
141
|
/**
|
|
136
142
|
* Move constructor
|
|
143
|
+
* @param that sketch to be moved
|
|
137
144
|
*/
|
|
138
145
|
hll_sketch_alloc(hll_sketch_alloc<A>&& that) noexcept;
|
|
139
146
|
|
|
140
147
|
/**
|
|
141
148
|
* Reconstructs a sketch from a serialized image on a stream.
|
|
142
149
|
* @param is An input stream with a binary image of a sketch
|
|
150
|
+
* @param allocator instance of an Allocator
|
|
143
151
|
*/
|
|
144
152
|
static hll_sketch_alloc deserialize(std::istream& is, const A& allocator = A());
|
|
145
153
|
|
|
@@ -147,17 +155,26 @@ class hll_sketch_alloc final {
|
|
|
147
155
|
* Reconstructs a sketch from a serialized image in a byte array.
|
|
148
156
|
* @param bytes An input array with a binary image of a sketch
|
|
149
157
|
* @param len Length of the input array, in bytes
|
|
158
|
+
* @param allocator instance of an Allocator
|
|
150
159
|
*/
|
|
151
160
|
static hll_sketch_alloc deserialize(const void* bytes, size_t len, const A& allocator = A());
|
|
152
161
|
|
|
153
162
|
//! Class destructor
|
|
154
163
|
virtual ~hll_sketch_alloc();
|
|
155
164
|
|
|
156
|
-
|
|
157
|
-
|
|
165
|
+
/**
|
|
166
|
+
* Copy assignment operator
|
|
167
|
+
* @param other sketch to be copied
|
|
168
|
+
* @return reference to this sketch
|
|
169
|
+
*/
|
|
170
|
+
hll_sketch_alloc& operator=(const hll_sketch_alloc<A>& other);
|
|
158
171
|
|
|
159
|
-
|
|
160
|
-
|
|
172
|
+
/**
|
|
173
|
+
* Move assignment operator
|
|
174
|
+
* @param other sketch to be moved
|
|
175
|
+
* @return reference to this sketch
|
|
176
|
+
*/
|
|
177
|
+
hll_sketch_alloc& operator=(hll_sketch_alloc<A>&& other);
|
|
161
178
|
|
|
162
179
|
/**
|
|
163
180
|
* Resets the sketch to an empty state in coupon collection mode.
|
|
@@ -165,18 +182,22 @@ class hll_sketch_alloc final {
|
|
|
165
182
|
*/
|
|
166
183
|
void reset();
|
|
167
184
|
|
|
168
|
-
|
|
185
|
+
// This is a convenience alias for users
|
|
186
|
+
// The type returned by the following serialize method
|
|
187
|
+
using vector_bytes = std::vector<uint8_t, typename std::allocator_traits<A>::template rebind_alloc<uint8_t>>;
|
|
169
188
|
|
|
170
189
|
/**
|
|
171
190
|
* Serializes the sketch to a byte array, compacting data structures
|
|
172
191
|
* where feasible to eliminate unused storage in the serialized image.
|
|
173
192
|
* @param header_size_bytes Allows for PostgreSQL integration
|
|
193
|
+
* @return serialized sketch in binary form
|
|
174
194
|
*/
|
|
175
195
|
vector_bytes serialize_compact(unsigned header_size_bytes = 0) const;
|
|
176
196
|
|
|
177
197
|
/**
|
|
178
198
|
* Serializes the sketch to a byte array, retaining all internal
|
|
179
199
|
* data structures in their current form.
|
|
200
|
+
* @return serialized sketch in binary form
|
|
180
201
|
*/
|
|
181
202
|
vector_bytes serialize_updatable() const;
|
|
182
203
|
|
|
@@ -392,8 +413,6 @@ class hll_sketch_alloc final {
|
|
|
392
413
|
bool is_out_of_order_flag() const;
|
|
393
414
|
bool is_estimation_mode() const;
|
|
394
415
|
|
|
395
|
-
typedef typename std::allocator_traits<A>::template rebind_alloc<hll_sketch_alloc> AllocHllSketch;
|
|
396
|
-
|
|
397
416
|
HllSketchImpl<A>* sketch_impl;
|
|
398
417
|
friend hll_union_alloc<A>;
|
|
399
418
|
};
|
|
@@ -413,8 +432,8 @@ class hll_sketch_alloc final {
|
|
|
413
432
|
* <p>Although the API for this union operator parallels many of the methods of the
|
|
414
433
|
* <i>HllSketch</i>, the behavior of the union operator has some fundamental differences.
|
|
415
434
|
*
|
|
416
|
-
* <p>First, the user cannot specify the #
|
|
417
|
-
* Instead, it is specified for the sketch returned with #get_result
|
|
435
|
+
* <p>First, the user cannot specify the #target_hll_type as an input parameter.
|
|
436
|
+
* Instead, it is specified for the sketch returned with #get_result.
|
|
418
437
|
*
|
|
419
438
|
* <p>Second, the internal effective value of log-base-2 of <i>k</i> for the union operation can
|
|
420
439
|
* change dynamically based on the smallest <i>lg_config_k</i> that the union operation has seen.
|
|
@@ -423,7 +442,6 @@ class hll_sketch_alloc final {
|
|
|
423
442
|
* author Lee Rhodes
|
|
424
443
|
* author Kevin Lang
|
|
425
444
|
*/
|
|
426
|
-
|
|
427
445
|
template<typename A = std::allocator<uint8_t> >
|
|
428
446
|
class hll_union_alloc {
|
|
429
447
|
public:
|
|
@@ -431,6 +449,7 @@ class hll_union_alloc {
|
|
|
431
449
|
* Construct an hll_union operator with the given maximum log2 of k.
|
|
432
450
|
* @param lg_max_k The maximum size, in log2, of k. The value must
|
|
433
451
|
* be between 7 and 21, inclusive.
|
|
452
|
+
* @param allocator instance of an Allocator
|
|
434
453
|
*/
|
|
435
454
|
explicit hll_union_alloc(uint8_t lg_max_k, const A& allocator = A());
|
|
436
455
|
|
|
@@ -495,7 +514,7 @@ class hll_union_alloc {
|
|
|
495
514
|
|
|
496
515
|
/**
|
|
497
516
|
* Returns the result of this union operator with the specified
|
|
498
|
-
* #
|
|
517
|
+
* #target_hll_type.
|
|
499
518
|
* @param tgt_type The target_hll_type enum value of the desired result (Default: HLL_4)
|
|
500
519
|
* @return The result of this union with the specified target_hll_type
|
|
501
520
|
*/
|
|
@@ -629,12 +648,6 @@ class hll_union_alloc {
|
|
|
629
648
|
hll_sketch_alloc<A> gadget_;
|
|
630
649
|
};
|
|
631
650
|
|
|
632
|
-
/// convenience alias for hll_sketch with default allocator
|
|
633
|
-
typedef hll_sketch_alloc<> hll_sketch;
|
|
634
|
-
|
|
635
|
-
/// convenience alias for hll_union with default allocator
|
|
636
|
-
typedef hll_union_alloc<> hll_union;
|
|
637
|
-
|
|
638
651
|
} // namespace datasketches
|
|
639
652
|
|
|
640
653
|
#include "hll.private.hpp"
|
|
@@ -20,7 +20,6 @@ add_executable(hll_test)
|
|
|
20
20
|
target_link_libraries(hll_test hll common_test_lib)
|
|
21
21
|
|
|
22
22
|
set_target_properties(hll_test PROPERTIES
|
|
23
|
-
CXX_STANDARD 11
|
|
24
23
|
CXX_STANDARD_REQUIRED YES
|
|
25
24
|
)
|
|
26
25
|
|
|
@@ -49,3 +48,17 @@ target_sources(hll_test
|
|
|
49
48
|
ToFromByteArrayTest.cpp
|
|
50
49
|
IsomorphicTest.cpp
|
|
51
50
|
)
|
|
51
|
+
|
|
52
|
+
if (SERDE_COMPAT)
|
|
53
|
+
target_sources(hll_test
|
|
54
|
+
PRIVATE
|
|
55
|
+
hll_sketch_deserialize_from_java_test.cpp
|
|
56
|
+
)
|
|
57
|
+
endif()
|
|
58
|
+
|
|
59
|
+
if (GENERATE)
|
|
60
|
+
target_sources(hll_test
|
|
61
|
+
PRIVATE
|
|
62
|
+
hll_sketch_serialize_for_java.cpp
|
|
63
|
+
)
|
|
64
|
+
endif()
|
|
@@ -53,74 +53,6 @@ TEST_CASE("hll to/from byte array: double serialize", "[hll_byte_array]") {
|
|
|
53
53
|
}
|
|
54
54
|
}
|
|
55
55
|
|
|
56
|
-
TEST_CASE("hll to/from byte array: deserialize from java", "[hll_byte_array]") {
|
|
57
|
-
std::string inputPath;
|
|
58
|
-
#ifdef TEST_BINARY_INPUT_PATH
|
|
59
|
-
inputPath = TEST_BINARY_INPUT_PATH;
|
|
60
|
-
#else
|
|
61
|
-
inputPath = "test/";
|
|
62
|
-
#endif
|
|
63
|
-
|
|
64
|
-
std::ifstream ifs;
|
|
65
|
-
ifs.open(inputPath + "list_from_java.sk", std::ios::binary);
|
|
66
|
-
hll_sketch sk = hll_sketch::deserialize(ifs);
|
|
67
|
-
REQUIRE(sk.is_empty() == false);
|
|
68
|
-
REQUIRE(sk.get_lg_config_k() == 8);
|
|
69
|
-
REQUIRE(sk.get_lower_bound(1) == 7.0);
|
|
70
|
-
REQUIRE(sk.get_estimate() == Approx(7.0).margin(1e-6));
|
|
71
|
-
REQUIRE(sk.get_upper_bound(1) == Approx(7.000350).margin(1e-5));
|
|
72
|
-
ifs.close();
|
|
73
|
-
|
|
74
|
-
ifs.open(inputPath + "compact_set_from_java.sk", std::ios::binary);
|
|
75
|
-
sk = hll_sketch::deserialize(ifs);
|
|
76
|
-
REQUIRE(sk.is_empty() == false);
|
|
77
|
-
REQUIRE(sk.get_lg_config_k() == 8);
|
|
78
|
-
REQUIRE(sk.get_lower_bound(1) == 24.0);
|
|
79
|
-
REQUIRE(sk.get_estimate() == Approx(24.0).margin(1e-5));
|
|
80
|
-
REQUIRE(sk.get_upper_bound(1) == Approx(24.001200).margin(1e-5));
|
|
81
|
-
ifs.close();
|
|
82
|
-
|
|
83
|
-
ifs.open(inputPath + "updatable_set_from_java.sk", std::ios::binary);
|
|
84
|
-
sk = hll_sketch::deserialize(ifs);
|
|
85
|
-
REQUIRE(sk.is_empty() == false);
|
|
86
|
-
REQUIRE(sk.get_lg_config_k() == 8);
|
|
87
|
-
REQUIRE(sk.get_lower_bound(1) == 24.0);
|
|
88
|
-
REQUIRE(sk.get_estimate() == Approx(24.0).margin(1e-5));
|
|
89
|
-
REQUIRE(sk.get_upper_bound(1) == Approx(24.001200).margin(1e-5));
|
|
90
|
-
ifs.close();
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
ifs.open(inputPath + "array6_from_java.sk", std::ios::binary);
|
|
94
|
-
sk = hll_sketch::deserialize(ifs);
|
|
95
|
-
REQUIRE(sk.is_empty() == false);
|
|
96
|
-
REQUIRE(sk.get_lg_config_k() == 8);
|
|
97
|
-
REQUIRE(sk.get_lower_bound(1) == Approx(9589.968564).margin(1e-5));
|
|
98
|
-
REQUIRE(sk.get_estimate() == Approx(10089.150211).margin(1e-5));
|
|
99
|
-
REQUIRE(sk.get_upper_bound(1) == Approx(10642.370492).margin(1e-5));
|
|
100
|
-
ifs.close();
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
ifs.open(inputPath + "compact_array4_from_java.sk", std::ios::binary);
|
|
104
|
-
sk = hll_sketch::deserialize(ifs);
|
|
105
|
-
REQUIRE(sk.is_empty() == false);
|
|
106
|
-
REQUIRE(sk.get_lg_config_k() == 8);
|
|
107
|
-
REQUIRE(sk.get_lower_bound(1) == Approx(9589.968564).margin(1e-5));
|
|
108
|
-
REQUIRE(sk.get_estimate() == Approx(10089.150211).margin(1e-5));
|
|
109
|
-
REQUIRE(sk.get_upper_bound(1) == Approx(10642.370492).margin(1e-5));
|
|
110
|
-
|
|
111
|
-
ifs.close();
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
ifs.open(inputPath + "updatable_array4_from_java.sk", std::ios::binary);
|
|
115
|
-
sk = hll_sketch::deserialize(ifs);
|
|
116
|
-
REQUIRE(sk.is_empty() == false);
|
|
117
|
-
REQUIRE(sk.get_lg_config_k() == 8);
|
|
118
|
-
REQUIRE(sk.get_lower_bound(1) == Approx(9589.968564).margin(1e-5));
|
|
119
|
-
REQUIRE(sk.get_estimate() == Approx(10089.150211).margin(1e-5));
|
|
120
|
-
REQUIRE(sk.get_upper_bound(1) == Approx(10642.370492).margin(1e-5));
|
|
121
|
-
ifs.close();
|
|
122
|
-
}
|
|
123
|
-
|
|
124
56
|
static void checkSketchEquality(hll_sketch& sk1, hll_sketch& sk2) {
|
|
125
57
|
REQUIRE(sk1.get_lg_config_k() == sk2.get_lg_config_k());
|
|
126
58
|
REQUIRE(sk1.get_lower_bound(1) == sk2.get_lower_bound(1));
|