datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
#include <catch.hpp>
|
|
22
|
+
|
|
23
|
+
#include "CubicInterpolation.hpp"
|
|
24
|
+
|
|
25
|
+
namespace datasketches {
|
|
26
|
+
|
|
27
|
+
// The single-argument table lookup of CubicInterpolation must reject the two
// inputs below with std::invalid_argument.
// NOTE(review): presumably both lie outside the built-in interpolation table — confirm
// against CubicInterpolation-internal.hpp.
TEST_CASE("hll tables: interpolation exception", "[hll_tables]") {
  const double negative_input = -1.0;
  const double huge_input = 1e12;

  REQUIRE_THROWS_AS(CubicInterpolation<>::usingXAndYTables(negative_input), std::invalid_argument);

  REQUIRE_THROWS_AS(CubicInterpolation<>::usingXAndYTables(huge_input), std::invalid_argument);
}
|
|
32
|
+
|
|
33
|
+
// Corner case: interpolating exactly at the last x of a caller-supplied table
// must return exactly the last y of that table.
TEST_CASE("hll tables: check corner case", "[hll_tables]") {
  double xs[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0};
  double ys[] = {2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0};
  int count = 10;
  const int last = count - 1;

  // Query point is the final table abscissa.
  double query = xs[last];
  double expected = ys[last];
  double actual = CubicInterpolation<>::usingXAndYTables(xs, ys, count, query);

  // Exact comparison is intentional: the result at a table endpoint is expected to be the table value itself.
  REQUIRE(actual == expected);
}
|
|
42
|
+
|
|
43
|
+
} /* namespace datasketches */
|
|
44
|
+
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch.hpp>
|
|
21
|
+
#include <fstream>
|
|
22
|
+
|
|
23
|
+
#include "hll.hpp"
|
|
24
|
+
|
|
25
|
+
namespace datasketches {
|
|
26
|
+
|
|
27
|
+
// Input sizes for the "to from sketch" test: each entry n is passed to toFrom(), which
// updates a sketch with the n distinct integers 0..n-1.
static const int nArr[] = {1, 3, 10, 30, 100, 300, 1000, 3000, 10000, 30000};
|
|
28
|
+
|
|
29
|
+
// Serializes one HLL_8 sketch several times (to streams and to byte vectors) and checks that
// two byte-vector serializations of it are identical byte for byte. The sketch rebuilt from
// the first serialization is also checked against the source sketch.
TEST_CASE("hll to/from byte array: double serialize", "[hll_byte_array]") {
  hll_sketch sk(9, HLL_8);
  for (int i = 0; i < 1024; ++i) {
    sk.update(i);
  }

  // Stream serialization is exercised here only to show it completes without throwing.
  std::stringstream ss1;
  sk.serialize_updatable(ss1);
  auto ser1 = sk.serialize_updatable();

  std::stringstream ss;
  sk.serialize_updatable(ss);
  std::string str = ss.str();

  // The rebuilt sketch must describe the same data as the source sketch.
  hll_sketch sk2 = hll_sketch::deserialize(ser1.data(), ser1.size());
  REQUIRE(sk2.get_lg_config_k() == sk.get_lg_config_k());
  REQUIRE(sk2.get_estimate() == sk.get_estimate());

  // NOTE(review): the second image is taken from sk again, not from sk2; given the test
  // name this looks intentional (serializing twice must not change the output) — confirm.
  auto ser2 = sk.serialize_updatable();

  REQUIRE(ser1.size() == ser2.size());

  // Walk both buffers with pointers so the vector size is never narrowed to int.
  const uint8_t* b1 = ser1.data();
  const uint8_t* b2 = ser2.data();
  const uint8_t* const end1 = b1 + ser1.size();
  for (; b1 != end1; ++b1, ++b2) {
    REQUIRE(*b2 == *b1);
  }
}
|
|
55
|
+
|
|
56
|
+
// Reads sketches serialized by the Java library (list, set and array modes, compact and
// updatable forms) and checks configuration, bounds and estimate of each.
// Fixture files are looked up under TEST_BINARY_INPUT_PATH when that macro is defined,
// otherwise under "test/".
TEST_CASE("hll to/from byte array: deserialize from java", "[hll_byte_array]") {
  std::string inputPath;
#ifdef TEST_BINARY_INPUT_PATH
  inputPath = TEST_BINARY_INPUT_PATH;
#else
  inputPath = "test/";
#endif

  // Opens one fixture and deserializes it. Requiring an open stream makes a missing or
  // misplaced fixture fail with a clear message instead of an obscure deserialization error.
  auto load = [&inputPath](const std::string& fileName) -> hll_sketch {
    std::ifstream ifs(inputPath + fileName, std::ios::binary);
    REQUIRE(ifs.is_open());
    return hll_sketch::deserialize(ifs);
  };

  hll_sketch sk = load("list_from_java.sk");
  REQUIRE(sk.is_empty() == false);
  REQUIRE(sk.get_lg_config_k() == 8);
  REQUIRE(sk.get_lower_bound(1) == 7.0);
  REQUIRE(sk.get_estimate() == Approx(7.0).margin(1e-6));
  REQUIRE(sk.get_upper_bound(1) == Approx(7.000350).margin(1e-5));

  sk = load("compact_set_from_java.sk");
  REQUIRE(sk.is_empty() == false);
  REQUIRE(sk.get_lg_config_k() == 8);
  REQUIRE(sk.get_lower_bound(1) == 24.0);
  REQUIRE(sk.get_estimate() == Approx(24.0).margin(1e-5));
  REQUIRE(sk.get_upper_bound(1) == Approx(24.001200).margin(1e-5));

  sk = load("updatable_set_from_java.sk");
  REQUIRE(sk.is_empty() == false);
  REQUIRE(sk.get_lg_config_k() == 8);
  REQUIRE(sk.get_lower_bound(1) == 24.0);
  REQUIRE(sk.get_estimate() == Approx(24.0).margin(1e-5));
  REQUIRE(sk.get_upper_bound(1) == Approx(24.001200).margin(1e-5));

  sk = load("array6_from_java.sk");
  REQUIRE(sk.is_empty() == false);
  REQUIRE(sk.get_lg_config_k() == 8);
  REQUIRE(sk.get_lower_bound(1) == Approx(9589.968564).margin(1e-5));
  REQUIRE(sk.get_estimate() == Approx(10089.150211).margin(1e-5));
  REQUIRE(sk.get_upper_bound(1) == Approx(10642.370492).margin(1e-5));

  sk = load("compact_array4_from_java.sk");
  REQUIRE(sk.is_empty() == false);
  REQUIRE(sk.get_lg_config_k() == 8);
  REQUIRE(sk.get_lower_bound(1) == Approx(9589.968564).margin(1e-5));
  REQUIRE(sk.get_estimate() == Approx(10089.150211).margin(1e-5));
  REQUIRE(sk.get_upper_bound(1) == Approx(10642.370492).margin(1e-5));

  sk = load("updatable_array4_from_java.sk");
  REQUIRE(sk.is_empty() == false);
  REQUIRE(sk.get_lg_config_k() == 8);
  REQUIRE(sk.get_lower_bound(1) == Approx(9589.968564).margin(1e-5));
  REQUIRE(sk.get_estimate() == Approx(10089.150211).margin(1e-5));
  REQUIRE(sk.get_upper_bound(1) == Approx(10642.370492).margin(1e-5));
}
|
|
123
|
+
|
|
124
|
+
// Asserts that two sketches agree on configured lg k, on the lower bound,
// estimate and upper bound (bounds queried with argument 1), and on target HLL type.
// All floating-point comparisons are exact.
static void checkSketchEquality(hll_sketch& sk1, hll_sketch& sk2) {
  // configuration
  REQUIRE(sk1.get_lg_config_k() == sk2.get_lg_config_k());

  // estimate and its bounds
  REQUIRE(sk1.get_lower_bound(1) == sk2.get_lower_bound(1));
  REQUIRE(sk1.get_estimate() == sk2.get_estimate());
  REQUIRE(sk1.get_upper_bound(1) == sk2.get_upper_bound(1));

  // storage type
  REQUIRE(sk1.get_target_type() == sk2.get_target_type());
}
|
|
131
|
+
|
|
132
|
+
static void toFrom(const int lgConfigK, const target_hll_type tgtHllType, const int n) {
|
|
133
|
+
hll_sketch src(lgConfigK, tgtHllType);
|
|
134
|
+
for (int i = 0; i < n; ++i) {
|
|
135
|
+
src.update(i);
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
std::stringstream ss(std::ios::in | std::ios::out | std::ios::binary);
|
|
139
|
+
src.serialize_compact(ss);
|
|
140
|
+
hll_sketch dst = hll_sketch::deserialize(ss);
|
|
141
|
+
checkSketchEquality(src, dst);
|
|
142
|
+
|
|
143
|
+
auto bytes1 = src.serialize_compact();
|
|
144
|
+
dst = hll_sketch::deserialize(bytes1.data(), bytes1.size());
|
|
145
|
+
checkSketchEquality(src, dst);
|
|
146
|
+
|
|
147
|
+
ss.clear();
|
|
148
|
+
src.serialize_updatable(ss);
|
|
149
|
+
dst = hll_sketch::deserialize(ss);
|
|
150
|
+
checkSketchEquality(src, dst);
|
|
151
|
+
|
|
152
|
+
auto bytes2 = src.serialize_updatable();
|
|
153
|
+
dst = hll_sketch::deserialize(bytes2.data(), bytes2.size());
|
|
154
|
+
checkSketchEquality(src, dst);
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
// Runs the toFrom() round-trip check for every input size in nArr, every lg k from 4 to 13
// and each of the three target types, in the order HLL_4, HLL_6, HLL_8.
TEST_CASE("hll to/from byte array: to from sketch", "[hll_byte_array]") {
  const target_hll_type targetTypes[] = {HLL_4, HLL_6, HLL_8};

  for (const int n : nArr) {
    for (int lgK = 4; lgK <= 13; ++lgK) {
      for (const target_hll_type type : targetTypes) {
        toFrom(lgK, type, n);
      }
    }
  }
}
|
|
167
|
+
|
|
168
|
+
} /* namespace datasketches */
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
# Header-only KLL sketch library, modelled as an INTERFACE target with a namespaced alias.
add_library(kll INTERFACE)
add_library(${PROJECT_NAME}::KLL ALIAS kll)

# Unit tests are optional.
if (BUILD_TESTS)
  add_subdirectory(test)
endif()

# Consumers get the source-tree include directory at build time and
# <prefix>/include once the package is installed.
target_include_directories(kll
  INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
)

target_link_libraries(kll INTERFACE common)
target_compile_features(kll INTERFACE cxx_std_11)

# Public headers to install (paths relative to this directory).
set(kll_HEADERS
  "include/kll_sketch.hpp"
  "include/kll_sketch_impl.hpp"
  "include/kll_helper.hpp"
  "include/kll_helper_impl.hpp"
  "include/kll_quantile_calculator.hpp"
  "include/kll_quantile_calculator_impl.hpp"
)

install(TARGETS kll
  EXPORT ${PROJECT_NAME}
)

install(FILES ${kll_HEADERS}
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")

# Attach the headers to the target as interface sources.
target_sources(kll
  INTERFACE
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_helper_impl.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_sketch_impl.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/kll_quantile_calculator_impl.hpp
)
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef KLL_HELPER_HPP_
|
|
21
|
+
#define KLL_HELPER_HPP_
|
|
22
|
+
|
|
23
|
+
#include <random>
|
|
24
|
+
#include <stdexcept>
|
|
25
|
+
#include <chrono>
|
|
26
|
+
|
|
27
|
+
namespace datasketches {
|
|
28
|
+
|
|
29
|
+
// Generator of single random bits: a std::mt19937 wrapped so that each call yields a 1-bit
// uint32_t, seeded from the system clock when this variable is initialized.
// Because it is `static` at namespace scope in a header, every translation unit that
// includes this header gets its own independently seeded engine.
static std::independent_bits_engine<std::mt19937, 1, uint32_t> random_bit(std::chrono::system_clock::now().time_since_epoch().count());
|
|
30
|
+
|
|
31
|
+
#ifdef KLL_VALIDATION
|
|
32
|
+
// Declared only when KLL_VALIDATION is defined; the definition must be supplied by another
// translation unit.
// NOTE(review): presumably a deterministic stand-in for the random offset during validation
// runs — confirm against the validation test sources.
extern uint32_t kll_next_offset;
|
|
33
|
+
#endif
|
|
34
|
+
|
|
35
|
+
// Lookup table of powers of three: powers_of_three[p] == 3^p for 0 <= p <= 30 (31 entries).
|
|
36
|
+
static const uint64_t powers_of_three[] = {1, 3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441,
|
|
37
|
+
1594323, 4782969, 14348907, 43046721, 129140163, 387420489, 1162261467,
|
|
38
|
+
3486784401, 10460353203, 31381059609, 94143178827, 282429536481,
|
|
39
|
+
847288609443, 2541865828329, 7625597484987, 22876792454961, 68630377364883,
|
|
40
|
+
205891132094649};
|
|
41
|
+
|
|
42
|
+
/*
 * Collection of static helper routines used by the KLL sketch code:
 * capacity arithmetic, input validation, and the halving / merging /
 * compaction primitives that operate on the sketch's item buffer.
 */
class kll_helper {
  public:
    // parity tests on the lowest bit
    static inline bool is_even(uint32_t value);
    static inline bool is_odd(uint32_t value);

    // floor(log2(numer / denom)); 0 when denom > numer
    static inline uint8_t floor_of_log2_of_fraction(uint64_t numer, uint64_t denom);

    // 1 + floor(log2(n)), or 1 when n == 0
    static inline uint8_t ub_on_num_levels(uint64_t n);

    // sum of level_capacity() over heights 0 .. num_levels - 1
    static inline uint32_t compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_levels);

    // capacity of the level at the given height, never less than min_wid;
    // throws std::invalid_argument if height >= numLevels
    static inline uint32_t level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid);

    // k * (2/3)^depth rounded to nearest; depth <= 60 for int_cap_aux, <= 30 for int_cap_aux_aux
    static inline uint32_t int_cap_aux(uint16_t k, uint8_t depth);
    static inline uint32_t int_cap_aux_aux(uint16_t k, uint8_t depth);

    // total weight of all items, where an item at level L counts 2^L;
    // levels must hold num_levels + 1 boundaries
    static inline uint64_t sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels);

    /*
     * This version is for floating point types
     * Checks the sequential validity of the given array of values.
     * They must be unique, monotonically increasing (according to the
     * comparator C) and not NaN.
     * Throws std::invalid_argument on the first violation found.
     * Each value is checked for NaN before it is compared with its
     * predecessor, so a NaN is always reported as a NaN rather than as
     * an ordering failure.
     */
    template <typename T, typename C>
    static typename std::enable_if<std::is_floating_point<T>::value, void>::type
    validate_values(const T* values, uint32_t size) {
      for (uint32_t i = 0; i < size; i++) {
        if (std::isnan(values[i])) {
          throw std::invalid_argument("Values must not be NaN");
        }
        if ((i > 0) && !(C()(values[i - 1], values[i]))) {
          throw std::invalid_argument("Values must be unique and monotonically increasing");
        }
      }
    }

    /*
     * This version is for non-floating point types
     * Checks the sequential validity of the given array of values.
     * They must be unique and monotonically increasing (according to the
     * comparator C).
     * Throws std::invalid_argument on the first violation found.
     */
    template <typename T, typename C>
    static typename std::enable_if<!std::is_floating_point<T>::value, void>::type
    validate_values(const T* values, uint32_t size) {
      for (uint32_t i = 1; i < size; i++) {
        if (!(C()(values[i - 1], values[i]))) {
          throw std::invalid_argument("Values must be unique and monotonically increasing");
        }
      }
    }

    // keeps every other item of [start, start + length), chosen by a random
    // offset of 0 or 1, packed into the lower half of that range;
    // length must be even
    template <typename T>
    static void randomly_halve_down(T* buf, uint32_t start, uint32_t length);

    // same as randomly_halve_down, but the survivors are packed into the
    // upper half of the range; length must be even
    template <typename T>
    static void randomly_halve_up(T* buf, uint32_t start, uint32_t length);

    // this version moves objects within the same buffer
    // assumes that destination has initialized objects
    // does not destroy the originals after the move
    template <typename T, typename C>
    static void merge_sorted_arrays(T* buf, uint32_t start_a, uint32_t len_a, uint32_t start_b, uint32_t len_b, uint32_t start_c);

    // this version is to merge from two different buffers into a third buffer
    // initializes objects in the destination buffer
    // moves objects from buf_a and destroys the originals
    // copies objects from buf_b
    template <typename T, typename C>
    static void merge_sorted_arrays(const T* buf_a, uint32_t start_a, uint32_t len_a, const T* buf_b, uint32_t start_b, uint32_t len_b, T* buf_c, uint32_t start_c);

    // summary returned by general_compress
    struct compress_result {
      uint8_t final_num_levels;
      uint32_t final_capacity;
      uint32_t final_num_items;
    };

    /*
     * Here is what we do for each level:
     * If it does not need to be compacted, then simply copy it over.
     *
     * Otherwise, it does need to be compacted, so...
     *   Copy zero or one guy over.
     *   If the level above is empty, halve up.
     *   Else the level above is nonempty, so...
     *        halve down, then merge up.
     *   Adjust the boundaries of the level above.
     *
     * It can be proved that general_compress returns a sketch that satisfies the space constraints
     * no matter how much data is passed in.
     * All levels except for level zero must be sorted before calling this, and will still be
     * sorted afterwards.
     * Level zero is not required to be sorted before, and may not be sorted afterwards.
     */
    template <typename T, typename C>
    static compress_result general_compress(uint16_t k, uint8_t m, uint8_t num_levels_in, T* items,
        uint32_t* in_levels, uint32_t* out_levels, bool is_level_zero_sorted);

    // copy-constructs src[src_first .. src_last) into raw storage starting at dst[dst_first]
    template<typename T>
    static void copy_construct(const T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first);

    // move-constructs src[src_first .. src_last) into raw storage starting at dst[dst_first];
    // when destroy is true each source object is destroyed after being moved from
    template<typename T>
    static void move_construct(T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first, bool destroy);

#ifdef KLL_VALIDATION
  private:

    // replaces the random bit in validation builds
    static inline uint32_t deterministic_offset();
#endif

};
|
|
145
|
+
|
|
146
|
+
} /* namespace datasketches */
|
|
147
|
+
|
|
148
|
+
#include "kll_helper_impl.hpp"
|
|
149
|
+
|
|
150
|
+
#endif // KLL_HELPER_HPP_
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef KLL_HELPER_IMPL_HPP_
|
|
21
|
+
#define KLL_HELPER_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <algorithm>
|
|
24
|
+
|
|
25
|
+
namespace datasketches {
|
|
26
|
+
|
|
27
|
+
// Returns true when the lowest bit of value is clear.
bool kll_helper::is_even(uint32_t value) {
  const uint32_t low_bit = value & 1u;
  return low_bit == 0u;
}
|
|
30
|
+
|
|
31
|
+
// Returns true when the lowest bit of value is set.
bool kll_helper::is_odd(uint32_t value) {
  const uint32_t low_bit = value & 1u;
  return low_bit != 0u;
}
|
|
34
|
+
|
|
35
|
+
// Returns floor(log2(numer / denom)), or 0 when denom > numer.
//
// Throws std::invalid_argument if denom is zero (the ratio is undefined and
// doubling zero could never exceed numer).
//
// The loop counts how many times denom can be doubled while staying <= numer.
// The test is written as denom <= numer / 2 rather than 2 * denom <= numer so
// that denom is only doubled when the result is known to fit; this keeps the
// computation free of unsigned wrap-around for numer >= 2^63.
uint8_t kll_helper::floor_of_log2_of_fraction(uint64_t numer, uint64_t denom) {
  if (denom == 0) throw std::invalid_argument("denom must not be zero");
  if (denom > numer) return 0;
  uint8_t count = 0;
  while (denom <= (numer >> 1)) {
    denom <<= 1;
    count++;
  }
  return count;
}
|
|
44
|
+
|
|
45
|
+
uint8_t kll_helper::ub_on_num_levels(uint64_t n) {
|
|
46
|
+
if (n == 0) return 1;
|
|
47
|
+
return 1 + floor_of_log2_of_fraction(n, 1);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
uint32_t kll_helper::compute_total_capacity(uint16_t k, uint8_t m, uint8_t num_levels) {
|
|
51
|
+
uint32_t total = 0;
|
|
52
|
+
for (uint8_t h = 0; h < num_levels; h++) {
|
|
53
|
+
total += level_capacity(k, num_levels, h, m);
|
|
54
|
+
}
|
|
55
|
+
return total;
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
uint32_t kll_helper::level_capacity(uint16_t k, uint8_t numLevels, uint8_t height, uint8_t min_wid) {
|
|
59
|
+
if (height >= numLevels) throw std::invalid_argument("height >= numLevels");
|
|
60
|
+
const uint8_t depth = numLevels - height - 1;
|
|
61
|
+
return std::max((uint32_t) min_wid, int_cap_aux(k, depth));
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
uint32_t kll_helper::int_cap_aux(uint16_t k, uint8_t depth) {
|
|
65
|
+
if (depth > 60) throw std::invalid_argument("depth > 60");
|
|
66
|
+
if (depth <= 30) return int_cap_aux_aux(k, depth);
|
|
67
|
+
const uint8_t half = depth / 2;
|
|
68
|
+
const uint8_t rest = depth - half;
|
|
69
|
+
const uint32_t tmp = int_cap_aux_aux(k, half);
|
|
70
|
+
return int_cap_aux_aux(tmp, rest);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
uint32_t kll_helper::int_cap_aux_aux(uint16_t k, uint8_t depth) {
|
|
74
|
+
if (depth > 30) throw std::invalid_argument("depth > 30");
|
|
75
|
+
const uint64_t twok = k << 1; // for rounding, we pre-multiply by 2
|
|
76
|
+
const uint64_t tmp = (uint64_t) (((uint64_t) twok << depth) / powers_of_three[depth]);
|
|
77
|
+
const uint64_t result = (tmp + 1) >> 1; // then here we add 1 and divide by 2
|
|
78
|
+
if (result > k) throw std::logic_error("result > k");
|
|
79
|
+
return result;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
// Returns the weighted item count described by the level boundaries:
// level L holds levels[L + 1] - levels[L] items, each counted with weight 2^L.
// levels must therefore provide num_levels + 1 entries.
uint64_t kll_helper::sum_the_sample_weights(uint8_t num_levels, const uint32_t* levels) {
  uint64_t sum = 0;
  uint64_t level_weight = 1;
  for (uint8_t lvl = 0; lvl < num_levels; lvl++, level_weight <<= 1) {
    const uint32_t level_size = levels[lvl + 1] - levels[lvl];
    sum += level_weight * level_size;
  }
  return sum;
}
|
|
91
|
+
|
|
92
|
+
template <typename T>
|
|
93
|
+
void kll_helper::randomly_halve_down(T* buf, uint32_t start, uint32_t length) {
|
|
94
|
+
if (!is_even(length)) throw std::invalid_argument("length must be even");
|
|
95
|
+
const uint32_t half_length = length / 2;
|
|
96
|
+
#ifdef KLL_VALIDATION
|
|
97
|
+
const uint32_t offset = deterministic_offset();
|
|
98
|
+
#else
|
|
99
|
+
const uint32_t offset = random_bit();
|
|
100
|
+
#endif
|
|
101
|
+
uint32_t j = start + offset;
|
|
102
|
+
for (uint32_t i = start; i < (start + half_length); i++) {
|
|
103
|
+
if (i != j) buf[i] = std::move(buf[j]);
|
|
104
|
+
j += 2;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
template <typename T>
|
|
109
|
+
void kll_helper::randomly_halve_up(T* buf, uint32_t start, uint32_t length) {
|
|
110
|
+
if (!is_even(length)) throw std::invalid_argument("length must be even");
|
|
111
|
+
const uint32_t half_length = length / 2;
|
|
112
|
+
#ifdef KLL_VALIDATION
|
|
113
|
+
const uint32_t offset = deterministic_offset();
|
|
114
|
+
#else
|
|
115
|
+
const uint32_t offset = random_bit();
|
|
116
|
+
#endif
|
|
117
|
+
uint32_t j = (start + length) - 1 - offset;
|
|
118
|
+
for (uint32_t i = (start + length) - 1; i >= (start + half_length); i--) {
|
|
119
|
+
if (i != j) buf[i] = std::move(buf[j]);
|
|
120
|
+
j -= 2;
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// this version moves objects within the same buffer
|
|
125
|
+
// assumes that destination has initialized objects
|
|
126
|
+
// does not destroy the originals after the move
|
|
127
|
+
template <typename T, typename C>
|
|
128
|
+
void kll_helper::merge_sorted_arrays(T* buf, uint32_t start_a, uint32_t len_a, uint32_t start_b, uint32_t len_b, uint32_t start_c) {
|
|
129
|
+
const uint32_t len_c = len_a + len_b;
|
|
130
|
+
const uint32_t lim_a = start_a + len_a;
|
|
131
|
+
const uint32_t lim_b = start_b + len_b;
|
|
132
|
+
const uint32_t lim_c = start_c + len_c;
|
|
133
|
+
|
|
134
|
+
uint32_t a = start_a;
|
|
135
|
+
uint32_t b = start_b;
|
|
136
|
+
|
|
137
|
+
for (uint32_t c = start_c; c < lim_c; c++) {
|
|
138
|
+
if (a == lim_a) {
|
|
139
|
+
if (b != c) buf[c] = std::move(buf[b]);
|
|
140
|
+
b++;
|
|
141
|
+
} else if (b == lim_b) {
|
|
142
|
+
if (a != c) buf[c] = std::move(buf[a]);
|
|
143
|
+
a++;
|
|
144
|
+
} else if (C()(buf[a], buf[b])) {
|
|
145
|
+
if (a != c) buf[c] = std::move(buf[a]);
|
|
146
|
+
a++;
|
|
147
|
+
} else {
|
|
148
|
+
if (b != c) buf[c] = std::move(buf[b]);
|
|
149
|
+
b++;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
if (a != lim_a || b != lim_b) throw std::logic_error("inconsistent state");
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// this version is to merge from two different buffers into a third buffer
|
|
156
|
+
// initializes objects is the destination buffer
|
|
157
|
+
// moves objects from buf_a and destroys the originals
|
|
158
|
+
// copies objects from buf_b
|
|
159
|
+
template <typename T, typename C>
|
|
160
|
+
void kll_helper::merge_sorted_arrays(const T* buf_a, uint32_t start_a, uint32_t len_a, const T* buf_b, uint32_t start_b, uint32_t len_b, T* buf_c, uint32_t start_c) {
|
|
161
|
+
const uint32_t len_c = len_a + len_b;
|
|
162
|
+
const uint32_t lim_a = start_a + len_a;
|
|
163
|
+
const uint32_t lim_b = start_b + len_b;
|
|
164
|
+
const uint32_t lim_c = start_c + len_c;
|
|
165
|
+
|
|
166
|
+
uint32_t a = start_a;
|
|
167
|
+
uint32_t b = start_b;
|
|
168
|
+
|
|
169
|
+
for (uint32_t c = start_c; c < lim_c; c++) {
|
|
170
|
+
if (a == lim_a) {
|
|
171
|
+
new (&buf_c[c]) T(buf_b[b++]);
|
|
172
|
+
} else if (b == lim_b) {
|
|
173
|
+
new (&buf_c[c]) T(std::move(buf_a[a]));
|
|
174
|
+
buf_a[a++].~T();
|
|
175
|
+
} else if (C()(buf_a[a], buf_b[b])) {
|
|
176
|
+
new (&buf_c[c]) T(std::move(buf_a[a]));
|
|
177
|
+
buf_a[a++].~T();
|
|
178
|
+
} else {
|
|
179
|
+
new (&buf_c[c]) T(buf_b[b++]);
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
if (a != lim_a || b != lim_b) throw std::logic_error("inconsistent state");
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
/*
|
|
186
|
+
* Here is what we do for each level:
|
|
187
|
+
* If it does not need to be compacted, then simply copy it over.
|
|
188
|
+
*
|
|
189
|
+
* Otherwise, it does need to be compacted, so...
|
|
190
|
+
* Copy zero or one guy over.
|
|
191
|
+
* If the level above is empty, halve up.
|
|
192
|
+
* Else the level above is nonempty, so...
|
|
193
|
+
* halve down, then merge up.
|
|
194
|
+
* Adjust the boundaries of the level above.
|
|
195
|
+
*
|
|
196
|
+
* It can be proved that general_compress returns a sketch that satisfies the space constraints
|
|
197
|
+
* no matter how much data is passed in.
|
|
198
|
+
* All levels except for level zero must be sorted before calling this, and will still be
|
|
199
|
+
* sorted afterwards.
|
|
200
|
+
* Level zero is not required to be sorted before, and may not be sorted afterwards.
|
|
201
|
+
*/
|
|
202
|
+
template <typename T, typename C>
|
|
203
|
+
kll_helper::compress_result kll_helper::general_compress(uint16_t k, uint8_t m, uint8_t num_levels_in, T* items,
|
|
204
|
+
uint32_t* in_levels, uint32_t* out_levels, bool is_level_zero_sorted)
|
|
205
|
+
{
|
|
206
|
+
if (num_levels_in == 0) throw std::invalid_argument("num_levels_in == 0"); // things are too weird if zero levels are allowed
|
|
207
|
+
const uint32_t starting_item_count = in_levels[num_levels_in] - in_levels[0];
|
|
208
|
+
uint8_t current_num_levels = num_levels_in;
|
|
209
|
+
uint32_t current_item_count = starting_item_count; // decreases with each compaction
|
|
210
|
+
uint32_t target_item_count = compute_total_capacity(k, m, current_num_levels); // increases if we add levels
|
|
211
|
+
bool done_yet = false;
|
|
212
|
+
out_levels[0] = 0;
|
|
213
|
+
uint8_t current_level = 0;
|
|
214
|
+
while (!done_yet) {
|
|
215
|
+
|
|
216
|
+
// If we are at the current top level, add an empty level above it for convenience,
|
|
217
|
+
// but do not increment num_levels until later
|
|
218
|
+
if (current_level == (current_num_levels - 1)) {
|
|
219
|
+
in_levels[current_level + 2] = in_levels[current_level + 1];
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
const auto raw_beg = in_levels[current_level];
|
|
223
|
+
const auto raw_lim = in_levels[current_level + 1];
|
|
224
|
+
const auto raw_pop = raw_lim - raw_beg;
|
|
225
|
+
|
|
226
|
+
if ((current_item_count < target_item_count) || (raw_pop < level_capacity(k, current_num_levels, current_level, m))) {
|
|
227
|
+
// move level over as is
|
|
228
|
+
// make sure we are not moving data upwards
|
|
229
|
+
if (raw_beg < out_levels[current_level]) throw std::logic_error("wrong move");
|
|
230
|
+
std::move(&items[raw_beg], &items[raw_lim], &items[out_levels[current_level]]);
|
|
231
|
+
out_levels[current_level + 1] = out_levels[current_level] + raw_pop;
|
|
232
|
+
} else {
|
|
233
|
+
// The sketch is too full AND this level is too full, so we compact it
|
|
234
|
+
// Note: this can add a level and thus change the sketches capacities
|
|
235
|
+
|
|
236
|
+
const auto pop_above = in_levels[current_level + 2] - raw_lim;
|
|
237
|
+
const bool odd_pop = is_odd(raw_pop);
|
|
238
|
+
const auto adj_beg = odd_pop ? 1 + raw_beg : raw_beg;
|
|
239
|
+
const auto adj_pop = odd_pop ? raw_pop - 1 : raw_pop;
|
|
240
|
+
const auto half_adj_pop = adj_pop / 2;
|
|
241
|
+
|
|
242
|
+
if (odd_pop) { // move one guy over
|
|
243
|
+
items[out_levels[current_level]] = std::move(items[raw_beg]);
|
|
244
|
+
out_levels[current_level + 1] = out_levels[current_level] + 1;
|
|
245
|
+
} else { // even number of items
|
|
246
|
+
out_levels[current_level + 1] = out_levels[current_level];
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
// level zero might not be sorted, so we must sort it if we wish to compact it
|
|
250
|
+
if ((current_level == 0) && !is_level_zero_sorted) {
|
|
251
|
+
std::sort(&items[adj_beg], &items[adj_beg + adj_pop], C());
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
if (pop_above == 0) { // Level above is empty, so halve up
|
|
255
|
+
randomly_halve_up(items, adj_beg, adj_pop);
|
|
256
|
+
} else { // Level above is nonempty, so halve down, then merge up
|
|
257
|
+
randomly_halve_down(items, adj_beg, adj_pop);
|
|
258
|
+
merge_sorted_arrays<T, C>(items, adj_beg, half_adj_pop, raw_lim, pop_above, adj_beg + half_adj_pop);
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
// track the fact that we just eliminated some data
|
|
262
|
+
current_item_count -= half_adj_pop;
|
|
263
|
+
|
|
264
|
+
// adjust the boundaries of the level above
|
|
265
|
+
in_levels[current_level + 1] = in_levels[current_level + 1] - half_adj_pop;
|
|
266
|
+
|
|
267
|
+
// increment num_levels if we just compacted the old top level
|
|
268
|
+
// this creates some more capacity (the size of the new bottom level)
|
|
269
|
+
if (current_level == (current_num_levels - 1)) {
|
|
270
|
+
current_num_levels++;
|
|
271
|
+
target_item_count += level_capacity(k, current_num_levels, 0, m);
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
} // end of code for compacting a level
|
|
275
|
+
|
|
276
|
+
// determine whether we have processed all levels yet (including any new levels that we created)
|
|
277
|
+
|
|
278
|
+
if (current_level == (current_num_levels - 1)) done_yet = true;
|
|
279
|
+
current_level++;
|
|
280
|
+
} // end of loop over levels
|
|
281
|
+
|
|
282
|
+
if ((out_levels[current_num_levels] - out_levels[0]) != current_item_count) throw std::logic_error("inconsistent state");
|
|
283
|
+
|
|
284
|
+
for (uint32_t i = current_item_count; i < starting_item_count; i++) items[i].~T();
|
|
285
|
+
|
|
286
|
+
compress_result result;
|
|
287
|
+
result.final_num_levels = current_num_levels;
|
|
288
|
+
result.final_capacity = target_item_count;
|
|
289
|
+
result.final_num_items = current_item_count;
|
|
290
|
+
return result;
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
template<typename T>
|
|
294
|
+
void kll_helper::copy_construct(const T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first) {
|
|
295
|
+
while (src_first != src_last) {
|
|
296
|
+
new (&dst[dst_first++]) T(src[src_first++]);
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
|
|
300
|
+
template<typename T>
|
|
301
|
+
void kll_helper::move_construct(T* src, size_t src_first, size_t src_last, T* dst, size_t dst_first, bool destroy) {
|
|
302
|
+
while (src_first != src_last) {
|
|
303
|
+
new (&dst[dst_first++]) T(std::move(src[src_first]));
|
|
304
|
+
if (destroy) src[src_first].~T();
|
|
305
|
+
src_first++;
|
|
306
|
+
}
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
#ifdef KLL_VALIDATION
// Validation-build stand-in for the random bit: returns the current value of
// the global kll_next_offset and replaces it with (1 - value), so successive
// calls alternate between 0 and 1 provided the global starts at 0 or 1.
uint32_t kll_helper::deterministic_offset() {
  const uint32_t current = kll_next_offset;
  kll_next_offset = 1 - current;
  return current;
}
#endif
|
|
316
|
+
|
|
317
|
+
} /* namespace datasketches */
|
|
318
|
+
|
|
319
|
+
#endif // KLL_HELPER_IMPL_HPP_
|