datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch.hpp>
|
|
21
|
+
#include <ostream>
|
|
22
|
+
#include <cmath>
|
|
23
|
+
#include <string>
|
|
24
|
+
|
|
25
|
+
#include "hll.hpp"
|
|
26
|
+
|
|
27
|
+
namespace datasketches {
|
|
28
|
+
|
|
29
|
+
// Creates a sketch configured with lgK and tgtHllType and updates it with the
// integers 0, 1, ..., n-1 (nothing is added when n <= 0).
static hll_sketch buildSketch(const int n, const int lgK, const target_hll_type tgtHllType) {
  hll_sketch result(lgK, tgtHllType);
  int value = 0;
  while (value < n) {
    result.update(value);
    ++value;
  }
  return result;
}
|
|
36
|
+
|
|
37
|
+
static void crossCountingCheck(const int lgK, const int n) {
|
|
38
|
+
hll_sketch sk4 = buildSketch(n, lgK, HLL_4);
|
|
39
|
+
const double est = sk4.get_estimate();
|
|
40
|
+
const double lb = sk4.get_lower_bound(1);
|
|
41
|
+
const double ub = sk4.get_upper_bound(1);
|
|
42
|
+
|
|
43
|
+
hll_sketch sk6 = buildSketch(n, lgK, HLL_6);
|
|
44
|
+
REQUIRE(sk6.get_estimate() == est);
|
|
45
|
+
REQUIRE(sk6.get_lower_bound(1) == lb);
|
|
46
|
+
REQUIRE(sk6.get_upper_bound(1) == ub);
|
|
47
|
+
|
|
48
|
+
hll_sketch sk8 = buildSketch(n, lgK, HLL_8);
|
|
49
|
+
REQUIRE(sk8.get_estimate() == est);
|
|
50
|
+
REQUIRE(sk8.get_lower_bound(1) == lb);
|
|
51
|
+
REQUIRE(sk8.get_upper_bound(1) == ub);
|
|
52
|
+
|
|
53
|
+
// Conversions
|
|
54
|
+
hll_sketch sk4to6(sk4, HLL_6);
|
|
55
|
+
REQUIRE(sk4to6.get_estimate() == est);
|
|
56
|
+
REQUIRE(sk4to6.get_lower_bound(1) == lb);
|
|
57
|
+
REQUIRE(sk4to6.get_upper_bound(1) == ub);
|
|
58
|
+
|
|
59
|
+
hll_sketch sk4to8(sk4, HLL_8);
|
|
60
|
+
REQUIRE(sk4to8.get_estimate() == est);
|
|
61
|
+
REQUIRE(sk4to8.get_lower_bound(1) == lb);
|
|
62
|
+
REQUIRE(sk4to8.get_upper_bound(1) == ub);
|
|
63
|
+
|
|
64
|
+
hll_sketch sk6to4(sk6, HLL_4);
|
|
65
|
+
REQUIRE(sk6to4.get_estimate() == est);
|
|
66
|
+
REQUIRE(sk6to4.get_lower_bound(1) == lb);
|
|
67
|
+
REQUIRE(sk6to4.get_upper_bound(1) == ub);
|
|
68
|
+
|
|
69
|
+
hll_sketch sk6to8(sk6, HLL_8);
|
|
70
|
+
REQUIRE(sk6to8.get_estimate() == est);
|
|
71
|
+
REQUIRE(sk6to8.get_lower_bound(1) == lb);
|
|
72
|
+
REQUIRE(sk6to8.get_upper_bound(1) == ub);
|
|
73
|
+
|
|
74
|
+
hll_sketch sk8to4(sk8, HLL_4);
|
|
75
|
+
REQUIRE(sk8to4.get_estimate() == est);
|
|
76
|
+
REQUIRE(sk8to4.get_lower_bound(1) == lb);
|
|
77
|
+
REQUIRE(sk8to4.get_upper_bound(1) == ub);
|
|
78
|
+
|
|
79
|
+
hll_sketch sk8to6(sk8, HLL_6);
|
|
80
|
+
REQUIRE(sk8to6.get_estimate() == est);
|
|
81
|
+
REQUIRE(sk8to6.get_lower_bound(1) == lb);
|
|
82
|
+
REQUIRE(sk8to6.get_upper_bound(1) == ub);
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// Runs crossCountingCheck over a fixed table of {lgK, n} configurations.
TEST_CASE("cross counting: cross counting checks", "[cross_counting]") {
  const int configs[][2] = {
    {4, 100},
    {4, 10000},
    {12, 7},
    {12, 384},
    {12, 10000},
  };
  for (const auto& config : configs) {
    crossCountingCheck(config[0], config[1]);
  }
}
|
|
92
|
+
|
|
93
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "hll.hpp"
|
|
21
|
+
|
|
22
|
+
#include <exception>
|
|
23
|
+
#include <sstream>
|
|
24
|
+
#include <catch.hpp>
|
|
25
|
+
|
|
26
|
+
namespace datasketches {
|
|
27
|
+
|
|
28
|
+
static void testComposite(const int lgK, const target_hll_type tgtHllType, const int n) {
|
|
29
|
+
hll_union u(lgK);
|
|
30
|
+
hll_sketch sk(lgK, tgtHllType);
|
|
31
|
+
for (int i = 0; i < n; ++i) {
|
|
32
|
+
u.update(i);
|
|
33
|
+
sk.update(i);
|
|
34
|
+
}
|
|
35
|
+
u.update(sk); // merge
|
|
36
|
+
hll_sketch res = u.get_result(target_hll_type::HLL_8);
|
|
37
|
+
double est = res.get_composite_estimate();
|
|
38
|
+
REQUIRE(sk.get_composite_estimate() == est);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
// Runs testComposite with HLL_8 and n = 10000 for several lgK values.
TEST_CASE("hll array: check composite estimate", "[hll_array]") {
  const int lgKValues[] = {4, 5, 6, 13};
  for (const int lgK : lgKValues) {
    testComposite(lgK, target_hll_type::HLL_8, 10000);
  }
}
|
|
47
|
+
|
|
48
|
+
static void serializeDeserialize(const int lgK, target_hll_type tgtHllType, const int n) {
|
|
49
|
+
hll_sketch sk1(lgK, tgtHllType);
|
|
50
|
+
|
|
51
|
+
for (int i = 0; i < n; ++i) {
|
|
52
|
+
sk1.update(i);
|
|
53
|
+
}
|
|
54
|
+
//REQUIRE(sk1.getCurrentMode() == CurMode::HLL);
|
|
55
|
+
|
|
56
|
+
double est1 = sk1.get_estimate();
|
|
57
|
+
REQUIRE(est1 == Approx(n).margin(n * 0.03));
|
|
58
|
+
|
|
59
|
+
// serialize as compact and updatable, deserialize, compare estimates are exact
|
|
60
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
|
61
|
+
sk1.serialize_compact(ss);
|
|
62
|
+
hll_sketch sk2 = hll_sketch::deserialize(ss);
|
|
63
|
+
REQUIRE(sk1.get_estimate() == sk2.get_estimate());
|
|
64
|
+
|
|
65
|
+
ss.clear();
|
|
66
|
+
sk1.serialize_updatable(ss);
|
|
67
|
+
sk2 = hll_sketch::deserialize(ss);
|
|
68
|
+
REQUIRE(sk1.get_estimate() == sk2.get_estimate());
|
|
69
|
+
|
|
70
|
+
sk1.reset();
|
|
71
|
+
REQUIRE(sk1.get_estimate() == 0.0);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
// Runs serializeDeserialize for HLL_4, HLL_6 and HLL_8 at three configurations:
// lgK = 4 with n = 8, and lgK = 15 / lgK = 21 with n set to three quarters of
// 2^(lgK - 3) plus a fixed surplus.
TEST_CASE("hll array: check serialize deserialize", "[hll_array]") {
  const auto runAllTypes = [](const int lgK, const int n) {
    serializeDeserialize(lgK, HLL_4, n);
    serializeDeserialize(lgK, HLL_6, n);
    serializeDeserialize(lgK, HLL_8, n);
  };

  runAllTypes(4, 8);

  const int midLgK = 15;
  runAllTypes(midLgK, (((1 << (midLgK - 3)) * 3) / 4) + 100);

  const int largeLgK = 21;
  runAllTypes(largeLgK, (((1 << (largeLgK - 3)) * 3) / 4) + 1000);
}
|
|
93
|
+
|
|
94
|
+
// A sketch with lgK = 4 that has been updated with 8 integers must not report
// itself as compact.
TEST_CASE("hll array: check is compact", "[hll_array]") {
  hll_sketch sketch(4);
  int value = 0;
  while (value < 8) {
    sketch.update(value);
    ++value;
  }
  REQUIRE_FALSE(sketch.is_compact());
}
|
|
101
|
+
|
|
102
|
+
// Serializes an HLL_8 sketch to a compact byte array, then corrupts individual
// header bytes one at a time (restoring each afterwards) and checks how
// deserialization reacts: bad preamble-ints, serial-version, family and mode
// bytes must throw std::invalid_argument; a zeroed LG_ARR byte is accepted;
// truncated input must throw std::out_of_range.
TEST_CASE("hll array: check corrupt bytearray", "[hll_array]") {
  const int lgK = 8;
  hll_sketch source(lgK, HLL_8);
  for (int value = 0; value < 50; ++value) {
    source.update(value);
  }
  auto image = source.serialize_compact();
  uint8_t* raw = image.data();
  const size_t numBytes = image.size();

  // preamble ints
  raw[HllUtil<>::PREAMBLE_INTS_BYTE] = 0;
  REQUIRE_THROWS_AS(hll_sketch::deserialize(raw, numBytes), std::invalid_argument);
  REQUIRE_THROWS_AS(HllArray<>::newHll(raw, numBytes), std::invalid_argument);
  raw[HllUtil<>::PREAMBLE_INTS_BYTE] = HllUtil<>::HLL_PREINTS;

  // serial version
  raw[HllUtil<>::SER_VER_BYTE] = 0;
  REQUIRE_THROWS_AS(hll_sketch::deserialize(raw, numBytes), std::invalid_argument);
  raw[HllUtil<>::SER_VER_BYTE] = HllUtil<>::SER_VER;

  // family id
  raw[HllUtil<>::FAMILY_BYTE] = 0;
  REQUIRE_THROWS_AS(hll_sketch::deserialize(raw, numBytes), std::invalid_argument);
  raw[HllUtil<>::FAMILY_BYTE] = HllUtil<>::FAMILY_ID;

  // mode byte
  const uint8_t savedMode = raw[HllUtil<>::MODE_BYTE];
  raw[HllUtil<>::MODE_BYTE] = 0x10; // HLL_6, LIST
  REQUIRE_THROWS_AS(hll_sketch::deserialize(raw, numBytes), std::invalid_argument);
  raw[HllUtil<>::MODE_BYTE] = savedMode;

  // lg array byte: should work fine despite the corruption
  const uint8_t savedLgArr = raw[HllUtil<>::LG_ARR_BYTE];
  raw[HllUtil<>::LG_ARR_BYTE] = 0;
  hll_sketch::deserialize(raw, numBytes);
  raw[HllUtil<>::LG_ARR_BYTE] = savedLgArr;

  // truncated input
  REQUIRE_THROWS_AS(hll_sketch::deserialize(raw, numBytes - 1), std::out_of_range);
  REQUIRE_THROWS_AS(hll_sketch::deserialize(raw, 3), std::out_of_range);
}
|
|
139
|
+
|
|
140
|
+
// Stream counterpart of the corrupt byte-array test: serializes a sketch in
// compact form into a stringstream, overwrites individual header bytes one at
// a time (restoring each afterwards) and checks that deserialization throws
// std::invalid_argument for bad preamble-ints, serial-version, family and mode
// bytes, while a zeroed LG_ARR byte is accepted.
TEST_CASE("hll array: check corrupt stream", "[hll_array]") {
  const int lgK = 6;
  hll_sketch source(lgK);
  for (int value = 0; value < 50; ++value) {
    source.update(value);
  }
  std::stringstream ss;
  source.serialize_compact(ss);

  // Overwrites the single byte at the given offset via the put position.
  const auto pokeByte = [&ss](const int offset, const int value) {
    ss.seekp(offset);
    ss.put(static_cast<char>(value));
  };
  // Reads the single byte at the given offset via the get position.
  const auto peekByte = [&ss](const int offset) -> uint8_t {
    ss.seekg(offset);
    return static_cast<uint8_t>(ss.get());
  };

  // preamble ints
  pokeByte(HllUtil<>::PREAMBLE_INTS_BYTE, 0);
  ss.seekg(0);
  REQUIRE_THROWS_AS(hll_sketch::deserialize(ss), std::invalid_argument);
  REQUIRE_THROWS_AS(HllArray<>::newHll(ss), std::invalid_argument);
  pokeByte(HllUtil<>::PREAMBLE_INTS_BYTE, HllUtil<>::HLL_PREINTS);

  // serial version
  pokeByte(HllUtil<>::SER_VER_BYTE, 0);
  ss.seekg(0);
  REQUIRE_THROWS_AS(hll_sketch::deserialize(ss), std::invalid_argument);
  pokeByte(HllUtil<>::SER_VER_BYTE, HllUtil<>::SER_VER);

  // family id
  pokeByte(HllUtil<>::FAMILY_BYTE, 0);
  ss.seekg(0);
  REQUIRE_THROWS_AS(hll_sketch::deserialize(ss), std::invalid_argument);
  pokeByte(HllUtil<>::FAMILY_BYTE, HllUtil<>::FAMILY_ID);

  // mode byte
  const uint8_t savedMode = peekByte(HllUtil<>::MODE_BYTE);
  pokeByte(HllUtil<>::MODE_BYTE, 0x11); // HLL_6, SET
  ss.seekg(0);
  REQUIRE_THROWS_AS(hll_sketch::deserialize(ss), std::invalid_argument);
  pokeByte(HllUtil<>::MODE_BYTE, savedMode);

  // lg array byte: should work fine despite the corruption
  const uint8_t savedLgArr = peekByte(HllUtil<>::LG_ARR_BYTE);
  pokeByte(HllUtil<>::LG_ARR_BYTE, 0);
  ss.seekg(0);
  hll_sketch::deserialize(ss);
  pokeByte(HllUtil<>::LG_ARR_BYTE, savedLgArr);
}
|
|
190
|
+
|
|
191
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,389 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "hll.hpp"
|
|
21
|
+
|
|
22
|
+
#include <catch.hpp>
|
|
23
|
+
#include <test_allocator.hpp>
|
|
24
|
+
|
|
25
|
+
namespace datasketches {
|
|
26
|
+
|
|
27
|
+
// HLL sketch instantiated with the test allocator; the tests below zero
// test_allocator_total_bytes and require it to be zero again afterwards.
using hll_sketch_test_alloc = hll_sketch_alloc<test_allocator<void>>;
|
|
28
|
+
|
|
29
|
+
static void runCheckCopy(int lgConfigK, target_hll_type tgtHllType) {
|
|
30
|
+
hll_sketch_test_alloc sk(lgConfigK, tgtHllType);
|
|
31
|
+
|
|
32
|
+
for (int i = 0; i < 7; ++i) {
|
|
33
|
+
sk.update(i);
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
hll_sketch_test_alloc skCopy = sk;
|
|
37
|
+
REQUIRE(sk.get_estimate() == skCopy.get_estimate());
|
|
38
|
+
|
|
39
|
+
// no access to hllSketchImpl, so we'll ensure those differ by adding more
|
|
40
|
+
// data to sk and ensuring the mode and estimates differ
|
|
41
|
+
for (int i = 7; i < 24; ++i) {
|
|
42
|
+
sk.update(i);
|
|
43
|
+
}
|
|
44
|
+
REQUIRE(16.0 < (sk.get_estimate() - skCopy.get_estimate()));
|
|
45
|
+
|
|
46
|
+
skCopy = sk;
|
|
47
|
+
REQUIRE(sk.get_estimate() == skCopy.get_estimate());
|
|
48
|
+
|
|
49
|
+
int u = (sk.get_target_type() == HLL_4) ? 100000 : 25;
|
|
50
|
+
for (int i = 24; i < u; ++i) {
|
|
51
|
+
sk.update(i);
|
|
52
|
+
}
|
|
53
|
+
REQUIRE(sk.get_estimate() != skCopy.get_estimate()); // either 1 or 100k difference
|
|
54
|
+
|
|
55
|
+
skCopy = sk;
|
|
56
|
+
REQUIRE(sk.get_estimate() == skCopy.get_estimate());
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Runs the copy checks for one configuration per target HLL type and then
// requires the test allocator's byte counter to be back at zero.
TEST_CASE("hll sketch: check copies", "[hll_sketch]") {
  test_allocator_total_bytes = 0;

  struct copy_case {
    int lg_k;
    target_hll_type type;
  };
  const copy_case cases[] = {{14, HLL_4}, {8, HLL_6}, {8, HLL_8}};
  for (const copy_case& c : cases) {
    runCheckCopy(c.lg_k, c.type);
  }

  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
66
|
+
|
|
67
|
+
// Builds a sketch of type srcType and, after 7, 24 and 1000 inserted values,
// creates a dstType sketch from it; the converted sketch must report exactly
// the same estimate as the source each time.
//
// srcType  target HLL type of the sketch being filled
// dstType  target HLL type requested for the converted copy
static void copyAs(target_hll_type srcType, target_hll_type dstType) {
  const int lgK = 8;
  const int offset = 0; // added to every inserted value

  hll_sketch_test_alloc src(lgK, srcType);
  int next = 0; // count of values inserted into src so far

  // Inserts values into src until `end` values have been inserted in total.
  auto fill_to = [&](int end) {
    for (; next < end; ++next) {
      src.update(next + offset);
    }
  };

  fill_to(7);
  hll_sketch_test_alloc dst(src, dstType);
  REQUIRE(dst.get_estimate() == src.get_estimate());

  const int later_stage_ends[] = {24, 1000};
  for (const int end : later_stage_ends) {
    fill_to(end);
    dst = hll_sketch_test_alloc(src, dstType);
    REQUIRE(dst.get_estimate() == src.get_estimate());
  }
}
|
|
93
|
+
|
|
94
|
+
// Runs copyAs for every (source type, destination type) pair of the three
// target HLL types, then requires the test allocator's byte counter to be zero.
TEST_CASE("hll sketch: check copy as", "[hll_sketch]") {
  test_allocator_total_bytes = 0;

  const target_hll_type types[] = {HLL_4, HLL_6, HLL_8};
  for (const target_hll_type from : types) {
    for (const target_hll_type to : types) {
      copyAs(from, to);
    }
  }

  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
107
|
+
|
|
108
|
+
// Checks the reported compact and updatable serialization sizes of an HLL_8
// sketch with lgK = 8 as it is filled with 7, 24 and finally 25 values.
TEST_CASE("hll sketch: check misc1", "[hll_sketch]") {
  test_allocator_total_bytes = 0;
  {
    const int lg_k = 8;
    const target_hll_type type = target_hll_type::HLL_8;
    hll_sketch_test_alloc sketch(lg_k, type);
    int next = 0; // next value to insert

    // LIST
    while (next < 7) {
      sketch.update(next);
      ++next;
    }
    REQUIRE(sketch.get_compact_serialization_bytes() == 36);
    REQUIRE(sketch.get_updatable_serialization_bytes() == 40);

    // SET
    while (next < 24) {
      sketch.update(next);
      ++next;
    }
    REQUIRE(sketch.get_compact_serialization_bytes() == 108);
    REQUIRE(sketch.get_updatable_serialization_bytes() == 140);

    // HLL: the 25th value (24) is the one inserted here
    sketch.update(next);
    REQUIRE(sketch.get_updatable_serialization_bytes() == 40 + 256);

    const int hll_bytes = HllUtil<>::HLL_BYTE_ARR_START + (1 << lg_k);
    REQUIRE(sketch.get_compact_serialization_bytes() == hll_bytes);
    REQUIRE(hll_sketch::get_max_updatable_serialization_bytes(lg_k, HLL_8) == hll_bytes);
  }
  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
132
|
+
|
|
133
|
+
// HllUtil<>::checkNumStdDev must reject 0 with std::invalid_argument.
TEST_CASE("hll sketch: check num std dev", "[hll_sketch]") {
  const int rejected_num_std_dev = 0;
  REQUIRE_THROWS_AS(HllUtil<>::checkNumStdDev(rejected_num_std_dev), std::invalid_argument);
}
|
|
136
|
+
|
|
137
|
+
// Verifies the compact and updatable serialization sizes a sketch reports
// after 7 inserted values (LIST) and again after 24 inserted values (SET).
//
// Declared static, like the other helpers in this file, so the symbol stays
// local to this translation unit.
//
// lgConfigK   log2 of K passed to the sketch constructor
// tgtHllType  target HLL type passed to the sketch constructor
static void checkSerializationSizes(const int lgConfigK, target_hll_type tgtHllType) {
  hll_sketch_test_alloc sk(lgConfigK, tgtHllType);

  // LIST
  const int listCount = 7;
  for (int i = 0; i < listCount; ++i) { sk.update(i); }
  // compact size: preamble plus 4 bytes per inserted value
  int expected = HllUtil<>::LIST_INT_ARR_START + (listCount << 2);
  REQUIRE(sk.get_compact_serialization_bytes() == expected);
  // updatable size: preamble plus 4 bytes times 2^LG_INIT_LIST_SIZE
  expected = HllUtil<>::LIST_INT_ARR_START + (4 << HllUtil<>::LG_INIT_LIST_SIZE);
  REQUIRE(sk.get_updatable_serialization_bytes() == expected);

  // SET
  const int setCount = 24;
  for (int i = listCount; i < setCount; ++i) { sk.update(i); }
  // compact size: preamble plus 4 bytes per inserted value
  expected = HllUtil<>::HASH_SET_INT_ARR_START + (setCount << 2);
  REQUIRE(sk.get_compact_serialization_bytes() == expected);
  // updatable size: preamble plus 4 bytes times 2^LG_INIT_SET_SIZE
  expected = HllUtil<>::HASH_SET_INT_ARR_START + (4 << HllUtil<>::LG_INIT_SET_SIZE);
  REQUIRE(sk.get_updatable_serialization_bytes() == expected);
}
|
|
155
|
+
|
|
156
|
+
// Runs the serialization-size checks with lgK = 8 for each target HLL type,
// then requires the test allocator's byte counter to be zero.
TEST_CASE("hll sketch: check ser sizes", "[hll_sketch]") {
  test_allocator_total_bytes = 0;

  const target_hll_type types[] = {HLL_8, HLL_6, HLL_4};
  for (const target_hll_type type : types) {
    checkSerializationSizes(8, type);
  }

  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
163
|
+
|
|
164
|
+
// Calls to_string() on sketches at several fill levels. The produced text is
// only streamed into a buffer and never inspected; the test merely requires
// that the calls complete and that the test allocator's byte counter is zero
// afterwards.
TEST_CASE("hll sketch: exercise to string", "[hll_sketch]") {
  test_allocator_total_bytes = 0;
  {
    hll_sketch_test_alloc sketch(15, HLL_4);
    int next = 0; // next value to insert
    while (next < 25) {
      sketch.update(next);
      ++next;
    }

    std::ostringstream sink(std::ios::binary);
    sink << sketch.to_string(false, true, true, true);

    const int large_count = 1 << 20;
    while (next < large_count) {
      sketch.update(next);
      ++next;
    }
    sink << sketch.to_string(false, true, true, true);
    sink << sketch.to_string(false, true, true, false);

    // start over with a small HLL_8 sketch
    sketch = hll_sketch_test_alloc(8, HLL_8);
    for (next = 0; next < 25; ++next) {
      sketch.update(next);
    }
    sink << sketch.to_string(false, true, true, true);
  }
  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
181
|
+
|
|
182
|
+
// Creates and serializes then deserializes sketch.
|
|
183
|
+
// Returns true if deserialized sketch is compact.
|
|
184
|
+
static bool checkCompact(const int lgK, const int n, const target_hll_type type, bool compact) {
|
|
185
|
+
hll_sketch_test_alloc sk(lgK, type);
|
|
186
|
+
for (int i = 0; i < n; ++i) { sk.update(i); }
|
|
187
|
+
|
|
188
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
|
189
|
+
if (compact) {
|
|
190
|
+
sk.serialize_compact(ss);
|
|
191
|
+
REQUIRE(ss.tellp() == sk.get_compact_serialization_bytes());
|
|
192
|
+
} else {
|
|
193
|
+
sk.serialize_updatable(ss);
|
|
194
|
+
REQUIRE(ss.tellp() == sk.get_updatable_serialization_bytes());
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
hll_sketch_test_alloc sk2 = hll_sketch_test_alloc::deserialize(ss);
|
|
198
|
+
REQUIRE(sk2.get_estimate() == Approx(n).margin(0.01));
|
|
199
|
+
bool isCompact = sk2.is_compact();
|
|
200
|
+
|
|
201
|
+
return isCompact;
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
// A deserialized sketch must never report itself as compact, whichever
// serialized form (updatable or compact) it was read from. This holds
// unless/until non-updatable "direct" versions are created.
TEST_CASE("hll sketch: check compact flag", "[hll_sketch]") {
  test_allocator_total_bytes = 0;
  {
    const int lgK = 8;

    struct scenario {
      int n;                // number of values inserted before serializing
      target_hll_type type; // target HLL type of the sketch
    };
    const scenario scenarios[] = {
      {7, HLL_8},  // LIST
      {24, HLL_8}, // SET
      {25, HLL_8}, // HLL8
      {25, HLL_6}, // HLL6
      {25, HLL_4}, // HLL4
    };
    const bool serialize_compact_options[] = {false, true};

    for (const scenario& s : scenarios) {
      for (const bool as_compact : serialize_compact_options) {
        REQUIRE_FALSE(checkCompact(lgK, s.n, s.type, as_compact));
      }
    }
  }
  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
232
|
+
|
|
233
|
+
// Sketches can be constructed at both ends of the [MIN_LOG_K, MAX_LOG_K]
// range, while one step outside either end is rejected with
// std::invalid_argument.
TEST_CASE("hll sketch: check k limits", "[hll_sketch]") {
  test_allocator_total_bytes = 0;
  {
    // construction at the limits must simply succeed
    hll_sketch_test_alloc at_min(HllUtil<>::MIN_LOG_K, target_hll_type::HLL_8);
    hll_sketch_test_alloc at_max(HllUtil<>::MAX_LOG_K, target_hll_type::HLL_4);

    const int below_min = HllUtil<>::MIN_LOG_K - 1;
    const int above_max = HllUtil<>::MAX_LOG_K + 1;
    REQUIRE_THROWS_AS(hll_sketch_test_alloc(below_min), std::invalid_argument);
    REQUIRE_THROWS_AS(hll_sketch_test_alloc(above_max), std::invalid_argument);
  }
  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
244
|
+
|
|
245
|
+
// Feeds equivalent values to update() through different argument types and
// checks, via the estimate, that they are counted as the same item.
TEST_CASE("hll sketch: check input types", "[hll_sketch]") {
  test_allocator_total_bytes = 0;
  {
    hll_sketch_test_alloc sketch(8, target_hll_type::HLL_8);

    // the value 102 through every fixed-width integer type: one distinct item
    sketch.update(static_cast<uint8_t>(102));
    sketch.update(static_cast<uint16_t>(102));
    sketch.update(static_cast<uint32_t>(102));
    sketch.update(static_cast<uint64_t>(102));
    sketch.update(static_cast<int8_t>(102));
    sketch.update(static_cast<int16_t>(102));
    sketch.update(static_cast<int32_t>(102));
    sketch.update(static_cast<int64_t>(102));
    REQUIRE(sketch.get_estimate() == Approx(1.0).margin(0.01));

    // 255 as uint8_t and -1 as int8_t share a bit pattern; Java has no
    // unsigned types, so both need to be sign-extended the way Java would
    sketch.update(static_cast<uint8_t>(255));
    sketch.update(static_cast<int8_t>(-1));

    // the same number as float and as double
    sketch.update(static_cast<float>(-2.0));
    sketch.update(static_cast<double>(-2.0));

    // the same text as std::string and as pointer + length
    std::string text = "input string";
    sketch.update(text);
    sketch.update(text.c_str(), text.length());
    REQUIRE(sketch.get_estimate() == Approx(4.0).margin(0.01));

    // positive and negative zero, as float and as double: one distinct item
    sketch = hll_sketch_test_alloc(8, target_hll_type::HLL_6);
    sketch.update(static_cast<float>(0.0));
    sketch.update(static_cast<float>(-0.0));
    sketch.update(static_cast<double>(0.0));
    sketch.update(static_cast<double>(-0.0));
    REQUIRE(sketch.get_estimate() == Approx(1.0).margin(0.01));

    // NaNs built from different tags and precisions: one distinct item
    sketch = hll_sketch_test_alloc(8, target_hll_type::HLL_4);
    sketch.update(std::nanf("3"));
    sketch.update(std::nan("9"));
    REQUIRE(sketch.get_estimate() == Approx(1.0).margin(0.01));

    // null data and the empty string must leave the sketch empty
    sketch = hll_sketch_test_alloc(8, target_hll_type::HLL_4);
    sketch.update(nullptr, 0);
    sketch.update("");
    REQUIRE(sketch.is_empty());
  }
  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
293
|
+
|
|
294
|
+
// Deserializing a truncated compact image of a one-item sketch must fail:
// with std::out_of_range when reading from a byte buffer, and with
// std::ios_base::failure when reading from a stream that has exceptions
// enabled. The test allocator's byte counter must be zero afterwards.
TEST_CASE("hll sketch: deserialize list mode buffer overrun", "[hll_sketch]") {
  test_allocator_total_bytes = 0;
  {
    hll_sketch_test_alloc sketch(10);
    sketch.update(1);
    auto bytes = sketch.serialize_compact();

    // truncation points: 7 bytes, and one byte short of the full image
    const size_t truncated_lengths[] = {7, bytes.size() - 1};

    for (const size_t len : truncated_lengths) {
      REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), len), std::out_of_range);
    }

    // check for leaks on stream exceptions
    for (const size_t len : truncated_lengths) {
      std::stringstream ss;
      ss.exceptions(std::ios::failbit | std::ios::badbit);
      ss.str(std::string(reinterpret_cast<const char*>(bytes.data()), len));
      REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(ss), std::ios_base::failure);
    }
  }
  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
319
|
+
|
|
320
|
+
// Deserializing a truncated updatable image of a ten-item sketch must fail:
// with std::out_of_range when reading from a byte buffer, and with
// std::ios_base::failure when reading from a stream that has exceptions
// enabled. The test allocator's byte counter must be zero afterwards.
TEST_CASE("hll sketch: deserialize set mode buffer overrun", "[hll_sketch]") {
  test_allocator_total_bytes = 0;
  {
    hll_sketch_test_alloc sketch(10);
    for (int value = 0; value < 10; ++value) {
      sketch.update(value);
    }
    //std::cout << sketch.to_string();
    auto bytes = sketch.serialize_updatable();

    // truncation points: 7 bytes, and one byte short of the full image
    const size_t truncated_lengths[] = {7, bytes.size() - 1};

    for (const size_t len : truncated_lengths) {
      REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), len), std::out_of_range);
    }

    // check for leaks on stream exceptions
    for (const size_t len : truncated_lengths) {
      std::stringstream ss;
      ss.exceptions(std::ios::failbit | std::ios::badbit);
      ss.str(std::string(reinterpret_cast<const char*>(bytes.data()), len));
      REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(ss), std::ios_base::failure);
    }
  }
  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
346
|
+
|
|
347
|
+
// Deserializing a compact image of a 14444-item sketch truncated at several
// points must fail: with std::out_of_range when reading from a byte buffer,
// and with std::ios_base::failure when reading from a stream that has
// exceptions enabled. The test allocator's byte counter must be zero
// afterwards.
TEST_CASE("hll sketch: deserialize HLL mode buffer overrun", "[hll_sketch]") {
  test_allocator_total_bytes = 0;
  {
    // this sketch should have aux table
    hll_sketch_test_alloc sketch(15);
    for (int value = 0; value < 14444; ++value) {
      sketch.update(value);
    }
    //std::cout << sketch.to_string();
    auto bytes = sketch.serialize_compact();

    const size_t truncated_lengths[] = {
      7,
      15,
      16420,           // before aux table
      bytes.size() - 1 // one byte short of the full image
    };

    for (const size_t len : truncated_lengths) {
      REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(bytes.data(), len), std::out_of_range);
    }

    // check for leaks on stream exceptions
    for (const size_t len : truncated_lengths) {
      std::stringstream ss;
      ss.exceptions(std::ios::failbit | std::ios::badbit);
      ss.str(std::string(reinterpret_cast<const char*>(bytes.data()), len));
      REQUIRE_THROWS_AS(hll_sketch_test_alloc::deserialize(ss), std::ios_base::failure);
    }
  }
  REQUIRE(test_allocator_total_bytes == 0);
}
|
|
388
|
+
|
|
389
|
+
} /* namespace datasketches */
|