datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
@@ -0,0 +1,274 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
// author Kevin Lang, Oath Research
|
21
|
+
|
22
|
+
#ifndef ICON_ESTIMATOR_HPP_
|
23
|
+
#define ICON_ESTIMATOR_HPP_
|
24
|
+
|
25
|
+
#include <cmath>
|
26
|
+
#include <cstdint>
|
27
|
+
#include <stdexcept>
|
28
|
+
|
29
|
+
namespace datasketches {
|
30
|
+
|
31
|
+
// The ICON estimator for FM85 sketches is defined in Kevin Lang's arXiv paper on the CPC sketch (arXiv:1708.06839).
|
32
|
+
|
33
|
+
// The current file provides exact and approximate implementations of this estimator.
|
34
|
+
|
35
|
+
// The exact version works for any value of K, but is quite slow.
|
36
|
+
|
37
|
+
// The much faster approximate version works for K values that are powers of two
|
38
|
+
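// ranging from 2^4 to 2^26 (ICON_MIN_LOG_K to ICON_MAX_LOG_K); coefficient rows for larger K are compiled in only when LARGER_K_VALUES is defined.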
|
39
|
+
|
40
|
+
// At a high level, this approximation can be described as using an
|
41
|
+
// exponential approximation when C > K * (5.6 or 5.7), while smaller
|
42
|
+
// values of C are handled by a degree-19 polynomial approximation of
|
43
|
+
// a pre-conditioned version of the true ICON mapping from C to N_hat.
|
44
|
+
|
45
|
+
// This file also provides a validation procedure that compares its approximate
|
46
|
+
// and exact implementations of the FM85 ICON estimator.
|
47
|
+
|
48
|
+
// Dimensions of the polynomial coefficient table used by the approximate ICON estimator.
// The table holds one row of coefficients for each log2(K) value from ICON_MIN_LOG_K
// through ICON_MAX_LOG_K inclusive.
// NOTE(review): the table also contains rows for log K >= 27 guarded by
// #ifdef LARGER_K_VALUES; ICON_MAX_LOG_K presumably has to be raised to match
// if that macro is ever defined -- confirm before enabling it.
static const int ICON_MIN_LOG_K = 4; // smallest log2(K) that has a coefficient row
|
49
|
+
static const int ICON_MAX_LOG_K = 26; // largest log2(K) that has a coefficient row by default
|
50
|
+
static const int ICON_POLYNOMIAL_DEGREE = 19; // degree of the approximating polynomial
|
51
|
+
static const int ICON_POLYNOMIAL_NUM_COEFFICIENTS = 1 + ICON_POLYNOMIAL_DEGREE; // a degree-d polynomial has d + 1 coefficients
|
52
|
+
static const int ICON_TABLE_SIZE = ICON_POLYNOMIAL_NUM_COEFFICIENTS * (1 + (ICON_MAX_LOG_K - ICON_MIN_LOG_K)); // coefficients per row times number of rows
|
53
|
+
|
54
|
+
static const double ICON_POLYNOMIAL_COEFFICIENTS[ICON_TABLE_SIZE] = {
|
55
|
+
|
56
|
+
// log K = 4
|
57
|
+
0.9895027971889700513, 0.3319496644645180128, 0.1242818722715769986, -0.03324149686026930256, -0.2985637298081619817,
|
58
|
+
1.366555923595830002, -4.705499366260569971, 11.61506432505530029, -21.11254986175579873, 28.89421695078809904,
|
59
|
+
-30.1383659011730991, 24.11946778830730054, -14.83391445199539938, 6.983088767267210173, -2.48964120264876998,
|
60
|
+
0.6593243603602499947, -0.125493534558034997, 0.01620971672896159843, -0.001271267679036929953, 4.567178653294529745e-05,
|
61
|
+
|
62
|
+
// log K = 5
|
63
|
+
0.9947713741300230339, 0.3326559581620939787, 0.1250050661634889981, -0.04130073804472530336, -0.2584095537451129854,
|
64
|
+
1.218050389433120051, -4.319106696095399656, 10.87175052045090062, -20.0184979022142997, 27.63210188163320069,
|
65
|
+
-28.97950009664030091, 23.26740804691930009, -14.33375703270860058, 6.751281271241110105, -2.406363094133439962,
|
66
|
+
0.6367414734718820357, -0.1210468076141379967, 0.01561196698118279963, -0.001222335432128580056, 4.383502970318410206e-05,
|
67
|
+
|
68
|
+
// log K = 6
|
69
|
+
0.9973904854982870161, 0.3330148852217920119, 0.125251536589509993, -0.04434075124043219962, -0.2436238890691720116,
|
70
|
+
1.163293254754570016, -4.177758779777369647, 10.60301981340099964, -19.6274507428828997, 27.18420839597660077,
|
71
|
+
-28.56827214174580121, 22.96268674086600114, -14.15234202220280046, 6.665700662642549901, -2.375043356720739851,
|
72
|
+
0.6280993991240929608, -0.119319019358031006, 0.01537674055733759954, -0.001202881695730769916, 4.309894633186929849e-05,
|
73
|
+
|
74
|
+
// log K = 7
|
75
|
+
0.9986963310058679655, 0.3331956705633329907, 0.125337696770523005, -0.04546817338088020299, -0.2386752211125199863,
|
76
|
+
1.145927328111949972, -4.135694445582720036, 10.52805060502839929, -19.52408322548339825, 27.06921653903929936,
|
77
|
+
-28.46207532143190022, 22.88083524357429965, -14.10057147392659971, 6.63958754983273991, -2.364865219283200037,
|
78
|
+
0.6251341806425250169, -0.1186991327450530043, 0.0152892726403408008, -0.001195439764873199896, 4.281098416794090072e-05,
|
79
|
+
|
80
|
+
// log K = 8
|
81
|
+
0.999348600452531044, 0.3332480372393080148, 0.126666900963325002, -0.06495714694254159371, -0.08376282050638980681,
|
82
|
+
0.3760158094643630267, -1.568204791601850001, 4.483117719555970382, -9.119180124379150598, 13.65799293358900002,
|
83
|
+
-15.3100211234349004, 12.97546344654869976, -8.351661538536939489, 4.075022612435580172, -1.49387015887069996,
|
84
|
+
0.4040976870253379927, -0.07813232681879349328, 0.01020545649538820085, -0.0008063279210812720381, 2.909334976414100078e-05,
|
85
|
+
|
86
|
+
// log K = 9
|
87
|
+
0.9996743787297059924, 0.3332925779481850093, 0.1267124599259649986, -0.06550452970936600228, -0.08191738117533520214,
|
88
|
+
0.3773034458363569987, -1.604679509609959975, 4.636761898691969641, -9.487348609558699408, 14.25164235443030059,
|
89
|
+
-15.99674955529870068, 13.56353219046370029, -8.730194904342459594, 4.259010067932120336, -1.56106689792022002,
|
90
|
+
0.4222540912786589828, -0.08165296504921559784, 0.01066878484925220041, -0.0008433887618256910015, 3.045339724886519912e-05,
|
91
|
+
|
92
|
+
// log K = 10
|
93
|
+
0.999837191783945034, 0.3333142252339619804, 0.1267759538087240012, -0.06631005632753710077, -0.07692759158286699428,
|
94
|
+
0.3568943956395980166, -1.546598721379510044, 4.51595019978557044, -9.298431968763770428, 14.02586858080080034,
|
95
|
+
-15.78858959520439953, 13.41484931677589998, -8.647958125130809748, 4.22398017468472009, -1.549708891200570093,
|
96
|
+
0.419507410264540026, -0.08117411611046250475, 0.01061202286184199928, -0.000839300527596772007, 3.03185874520205985e-05,
|
97
|
+
|
98
|
+
// log K = 11
|
99
|
+
0.9999186020796150265, 0.3333249054574359826, 0.126791713589799987, -0.06662487271699729652, -0.07335552427910230211,
|
100
|
+
0.3316370184815959909, -1.434143797561290068, 4.180260309967409604, -8.593906870708760692, 12.95088874800289958,
|
101
|
+
-14.56876092520539956, 12.37074367531410068, -7.969152075707960137, 3.888774396648960074, -1.424923326506990051,
|
102
|
+
0.385084561785229984, -0.07435541911616409816, 0.009695363567476529554, -0.0007644375960047160388, 2.75156194717188011e-05,
|
103
|
+
|
104
|
+
// log K = 12
|
105
|
+
0.9999592955649559967, 0.3333310560725140093, 0.1267379744020450116, -0.06524495415766619344, -0.08854031542298740343,
|
106
|
+
0.4244320628874230228, -1.794077789033230008, 5.133875262768450298, -10.40149374917120007, 15.47808115629240078,
|
107
|
+
-17.2272296137545986, 14.5002173676463002, -9.274819801602760094, 4.500782540026570189, -1.642359389030050076,
|
108
|
+
0.442596113445525019, -0.0853226219238850947, 0.01111969379054169975, -0.0008771614088006969611, 3.161668519459719752e-05,
|
109
|
+
|
110
|
+
// log K = 13
|
111
|
+
0.9999796468102559732, 0.3333336602394039727, 0.126728089053198989, -0.06503798598282370391, -0.09050261023823169548,
|
112
|
+
0.4350609244189960201, -1.831274835815670077, 5.223387516985289913, -10.55574395269979959, 15.67359470222429962,
|
113
|
+
-17.41263416341029924, 14.63297400889229927, -9.346752431221359458, 4.530124905188380069, -1.651245566462089975,
|
114
|
+
0.444542549250713015, -0.08561720963336499901, 0.01114805146185449992, -0.0008786251203363140043, 3.16416341644572998e-05,
|
115
|
+
|
116
|
+
// log K = 14
|
117
|
+
0.9999898187060970445, 0.3333362579300819806, 0.1266984078369459976, -0.06464561179765909715, -0.09343280886228019777,
|
118
|
+
0.4490702549264070087, -1.878087608052450008, 5.338004322057390283, -10.76690603590630069, 15.97069195083200022,
|
119
|
+
-17.73440379943459888, 14.90212518309260048, -9.520506013770420495, 4.616238931978830173, -1.68364817877918993,
|
120
|
+
0.4536194960681350086, -0.087448605434800597, 0.01139929991331390009, -0.0008995891451622229631, 3.244407259782900338e-05,
|
121
|
+
|
122
|
+
// log K = 15
|
123
|
+
0.9999949072549390028, 0.3333376334705290267, 0.126665364358402005, -0.06411790034705669439, -0.09776009134670660128,
|
124
|
+
0.4704691112248470253, -1.948021675295769972, 5.497760972696490001, -11.03165645315390009, 16.29703330781000048,
|
125
|
+
-18.03851029448010124, 15.11836776139680083, -9.638205179917429533, 4.665122328753120051, -1.698980686525759953,
|
126
|
+
0.4571799506245269873, -0.08804011353783609828, 0.01146553155965330043, -0.0009040455800659569869, 3.257931866957050274e-05,
|
127
|
+
|
128
|
+
// log K = 16
|
129
|
+
0.9999974544793589493, 0.3333381337614599871, 0.1266524862971120102, -0.06391676499117690535, -0.09929616211306059592,
|
130
|
+
0.4771390820378790254, -1.965762451227349938, 5.526802350376460282, -11.05703067024660058, 16.29535848023060041,
|
131
|
+
-18.00114005075790047, 15.06214012231560062, -9.58874727382628933, 4.63537541652793017, -1.686222848555620102,
|
132
|
+
0.4532602373715179933, -0.08719448925964939923, 0.01134365425717459921, -0.0008934965241274289835, 3.216436244471380105e-05,
|
133
|
+
|
134
|
+
// log K = 17
|
135
|
+
0.9999987278278800185, 0.3333383411464330148, 0.126642761751724009, -0.06371042959073920653, -0.1013564516034080043,
|
136
|
+
0.4891311195679299839, -2.010971712051409899, 5.644390807952309963, -11.27697253921500042, 16.59957157207080058,
|
137
|
+
-18.31808338317799922, 15.31363518393730061, -9.741451446816620674, 4.706207545519429658, -1.711102469010010063,
|
138
|
+
0.4597587341089349744, -0.08841670767182820134, 0.01149999225097850068, -0.0009056651366963050422, 3.259910736274500059e-05,
|
139
|
+
|
140
|
+
// log K = 18
|
141
|
+
0.9999993637727100371, 0.3333385511608860097, 0.1266341580529160016, -0.06353272828164230335, -0.103139962850642003,
|
142
|
+
0.4996216017206500104, -2.05099128585287982, 5.749874086531799655, -11.47727638570349917, 16.88141587810320132,
|
143
|
+
-18.61744656177490143, 15.55634230427719977, -9.892350736128680211, 4.778033520984200422, -1.737045483861280104,
|
144
|
+
0.4667410882683730167, -0.08977256212421590165, 0.01167940146667079994, -0.0009201381242396030127, 3.313600701586759867e-05,
|
145
|
+
|
146
|
+
// log K = 19
|
147
|
+
0.9999996805376010212, 0.3333372324328989778, 0.1267104737214659882, -0.06504749929326139601, -0.0882341962464350954,
|
148
|
+
0.4131871162041140244, -1.725190703567099915, 4.900817515593920426, -9.883452720776510603, 14.6657081190816001,
|
149
|
+
-16.29398295135089825, 13.69805011761319946, -8.753475239465899449, 4.244072374564439976, -1.547202527706629915,
|
150
|
+
0.4164770109614310267, -0.08017596922092029565, 0.01043146101701039954, -0.00082124200571200305, 2.953319493719429935e-05,
|
151
|
+
|
152
|
+
// log K = 20
|
153
|
+
0.9999998390037539986, 0.3333365859956040067, 0.1267460211029839967, -0.06569456024647769843, -0.0823070353477164951,
|
154
|
+
0.3810826463303410017, -1.611983580241109992, 4.624520077758210057, -9.397308335633589138, 14.03184981378050011,
|
155
|
+
-15.6703191315401007, 13.22992718704790072, -8.484216393184780713, 4.125607133488029987, -1.507690650697159906,
|
156
|
+
0.4066678517577320129, -0.07842110121777939868, 0.01021780862225150042, -0.0008054065857047439754, 2.899431830426989844e-05,
|
157
|
+
|
158
|
+
// log K = 21
|
159
|
+
0.9999999207001479817, 0.3333384953015239849, 0.1266331480396669928, -0.06345750166298599892, -0.1042341210992499961,
|
160
|
+
0.5077112908497130039, -2.087398133609810191, 5.858842546192500222, -11.70620319777190055, 17.23103975433669888,
|
161
|
+
-19.01462552846669851, 15.89674059836560005, -10.11395134034419918, 4.88760796465891989, -1.777886770904629987,
|
162
|
+
0.4780200178339499839, -0.09200895321782050218, 0.01198029553244219989, -0.0009447283875782100165, 3.405716775824710232e-05,
|
163
|
+
|
164
|
+
// log K = 22
|
165
|
+
0.9999999606908690497, 0.3333383929524300071, 0.1266456445096819927, -0.06373504294081690225, -0.1012834291081849969,
|
166
|
+
0.4893810690172959998, -2.01391428223606983, 5.656430437473649597, -11.3067201537791, 16.64980594135310099,
|
167
|
+
-18.3792355790383013, 15.36879753115040081, -9.778831246425049528, 4.725308061988969577, -1.718423596500280093,
|
168
|
+
0.4618308177809870019, -0.08883675060799739454, 0.01155766944804260087, -0.0009104695617243750358, 3.278237729674439666e-05,
|
169
|
+
|
170
|
+
// log K = 23
|
171
|
+
0.9999999794683379628, 0.3333386441751680085, 0.1266463995182049995, -0.06376031920455070556, -0.1010799540803130059,
|
172
|
+
0.488540137426137, -2.012048323537570127, 5.654949475342659682, -11.31023240892979942, 16.66334675284959843,
|
173
|
+
-18.40241452866079896, 15.39443572867130072, -9.798844412838670692, 4.736683907539640082, -1.723168363744929987,
|
174
|
+
0.463270349018644001, -0.08914619066708899531, 0.01160235936257320022, -0.0009143600818183229709, 3.293669304679140117e-05,
|
175
|
+
|
176
|
+
// log K = 24
|
177
|
+
0.9999999911469820146, 0.3333376076934529975, 0.1266944349940530012, -0.06470524278387919381, -0.09189342220283110152,
|
178
|
+
0.4359182372694809793, -1.815980282951169977, 5.149474056470340066, -10.37086570678100017, 15.36962686758569951,
|
179
|
+
-17.05756384717849983, 14.32755177515199918, -9.149944050025640152, 4.434601894497260055, -1.616478926806520056,
|
180
|
+
0.4351979157055039793, -0.08381768225272340223, 0.01091321820476520016, -0.0008600264403629039739, 3.09667800347144002e-05,
|
181
|
+
|
182
|
+
// log K = 25
|
183
|
+
0.9999999968592140354, 0.3333379164881000167, 0.1266782495827009913, -0.06434163088961859789, -0.09575258124988890451,
|
184
|
+
0.4597843575354370049, -1.911374431241559924, 5.411856661251520428, -10.88850084646090011, 16.12298941380269923,
|
185
|
+
-17.88172178487259956, 15.01301780636859995, -9.585542896142529301, 4.645811872761620442, -1.693952293156189892,
|
186
|
+
0.4563143308861309921, -0.08795976148455289523, 0.01146560428011200033, -0.0009048442931930629528, 3.26358391497329992e-05,
|
187
|
+
|
188
|
+
// log K = 26
|
189
|
+
0.9999999970700530483, 0.333338329556315982, 0.126644753076394001, -0.06372365346512399997, -0.1012760856945769949,
|
190
|
+
0.4886852278576360176, -2.009005418394389952, 5.638119224137019714, -11.26276715335160006, 16.57640024218650154,
|
191
|
+
-18.29035093605569884, 15.28892246224570073, -9.724916375991760731, 4.6978877652334603, -1.707974125916829955,
|
192
|
+
0.4588937864564729963, -0.08824617586088029375, 0.01147732114826570046, -0.00090384524860747295, 3.253252703695579795e-05,
|
193
|
+
|
194
|
+
#ifdef LARGER_K_VALUES
|
195
|
+
// log K = 27
|
196
|
+
1.000000000639100106, 0.3333378987508219815, 0.126670943746902992, -0.06418811974745139426, -0.0972951198506895043,
|
197
|
+
0.4687977077401049852, -1.945290489888900076, 5.499494964974400268, -11.05078190574979935, 16.3446428009706004,
|
198
|
+
-18.10936908931320133, 15.19089294103859977, -9.691829972777059155, 4.694320543263319934, -1.710719212277360013,
|
199
|
+
0.4606257962161550146, -0.08875858006645380438, 0.01156634964444109952, -0.0009125838337464230437, 3.290907977404550287e-05,
|
200
|
+
|
201
|
+
// log K = 28
|
202
|
+
0.9999999993590269476, 0.3333385660745579737, 0.1266394134278630013, -0.0636305053404186971, -0.1022354305220320031,
|
203
|
+
0.4945787360853979853, -2.032468917547570086, 5.702461924065530319, -11.38943406618639997, 16.76052144140630062,
|
204
|
+
-18.49169753114890113, 15.4564578116809006, -9.831507534599410292, 4.749667961030789698, -1.72701519749717991,
|
205
|
+
0.4640997252013580043, -0.08927103511252110213, 0.01161455495023329919, -0.000915030036039231982, 3.295110296010450275e-05,
|
206
|
+
|
207
|
+
// log K = 29
|
208
|
+
0.9999999998441060356, 0.3333383341194189886, 0.1266687338487519909, -0.06416245828383730643, -0.09764561286937140094,
|
209
|
+
0.4715274747139350242, -1.958172229464169911, 5.539587632966780362, -11.13784217611559946, 16.48149277721759987,
|
210
|
+
-18.26888916646990069, 15.33085193018819936, -9.78493991484172021, 4.741302923579859829, -1.728568959451310061,
|
211
|
+
0.4656457646521020011, -0.08977142058582450457, 0.01170492245846839995, -0.0009240931538567209464, 3.334703207098030245e-05,
|
212
|
+
|
213
|
+
// log K = 30
|
214
|
+
0.9999999992599339915, 0.3333384538468979752, 0.1266452025739940035, -0.06374775920488300052, -0.1009917742909720029,
|
215
|
+
0.4867931642504759737, -2.000981224888669807, 5.614968747087539569, -11.21527907219130071, 16.50500949673639894,
|
216
|
+
-18.21007853829650003, 15.22056128176249956, -9.680565515478869898, 4.675983737170599674, -1.69980511941418011,
|
217
|
+
0.4566332138743600111, -0.08779650251621799739, 0.01141656381272189956, -0.0008988545845624889468, 3.234448025291899689e-05,
|
218
|
+
|
219
|
+
// log K = 31
|
220
|
+
0.9999999973204000137, 0.333337762450663988, 0.1266965469104399944, -0.06475154253624139378, -0.09133098208494490333,
|
221
|
+
0.4320356889637699815, -1.799236887220760028, 5.100971076171499696, -10.27175516606700079, 15.22198757843720074,
|
222
|
+
-16.89368636262300072, 14.19016571851859965, -9.062390133299189188, 4.39220025249522994, -1.600994848692480099,
|
223
|
+
0.4310075283759189912, -0.08300339267288289746, 0.01080584419810979961, -0.0008514267355136160122, 3.065110087496039805e-05,
|
224
|
+
|
225
|
+
// log K = 32
|
226
|
+
0.9999999987706390536, 0.3333387038350890119, 0.1266354589419070031, -0.06355195838981600454, -0.102952771506954005,
|
227
|
+
0.4983589546197609854, -2.045281215270029929, 5.732181222451769642, -11.43849817800069957, 16.81961198331340057,
|
228
|
+
-18.54433120118400069, 15.49126422718470053, -9.84846998787154071, 4.755615082534379923, -1.728430514092559989,
|
229
|
+
0.4642927653670489985, -0.08927380119154580684, 0.01161055316485629964, -0.0009143724787632470305, 3.291492066818770055e-05,
|
230
|
+
|
231
|
+
#endif
|
232
|
+
};
|
233
|
+
|
234
|
+
// Evaluates a polynomial in x by Horner's rule.
// The num coefficients are read from coefficients[start .. start + num - 1],
// lowest power first (coefficients[start] is the constant term).
static double evaluate_polynomial(const double* coefficients, int start, int num, double x) {
  int idx = start + num - 1;
  double acc = coefficients[idx]; // begin with the highest-order coefficient
  while (--idx >= start) {
    acc *= x;
    acc += coefficients[idx];
  }
  return acc;
}
|
243
|
+
|
244
|
+
// Closed-form approximation: 0.7940236163830469 * k * 2^(c / k).
// compute_icon_estimate() uses it when c is large relative to k.
static double icon_exponential_approximation(double k, double c) {
  const double scaled_k = 0.7940236163830469 * k;
  const double growth = pow(2.0, c / k);
  return (scaled_k * growth);
}
|
247
|
+
|
248
|
+
static double compute_icon_estimate(uint8_t lg_k, uint64_t c) {
|
249
|
+
if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
|
250
|
+
if (c < 2) return ((c == 0) ? 0.0 : 1.0);
|
251
|
+
const size_t k = 1 << lg_k;
|
252
|
+
const double double_k = k;
|
253
|
+
const double double_c = c;
|
254
|
+
// Differing thresholds ensure that the approximated estimator is monotonically increasing.
|
255
|
+
const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
|
256
|
+
if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
|
257
|
+
const double factor = evaluate_polynomial(
|
258
|
+
ICON_POLYNOMIAL_COEFFICIENTS,
|
259
|
+
ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K),
|
260
|
+
ICON_POLYNOMIAL_NUM_COEFFICIENTS,
|
261
|
+
// The somewhat arbitrary constant 2.0 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS
|
262
|
+
double_c / (2.0 * double_k)
|
263
|
+
);
|
264
|
+
const double ratio = double_c / double_k;
|
265
|
+
// The somewhat arbitrary constant 66.774757 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS
|
266
|
+
const double term = 1.0 + (ratio * ratio * ratio / 66.774757);
|
267
|
+
const double result = double_c * factor * term;
|
268
|
+
if (result >= double_c) return result;
|
269
|
+
else return double_c;
|
270
|
+
}
|
271
|
+
|
272
|
+
} /* namespace datasketches */
|
273
|
+
|
274
|
+
#endif
|
@@ -0,0 +1,81 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef KXP_BYTE_LOOKUP_HPP_
|
21
|
+
#define KXP_BYTE_LOOKUP_HPP_
|
22
|
+
|
23
|
+
namespace datasketches {
|
24
|
+
|
25
|
+
// the table was created by the following procedure:
|
26
|
+
|
27
|
+
//void fill_kxp_byte_table() {
|
28
|
+
// for (int byte = 0; byte < 256; byte++) {
|
29
|
+
// double sum = 0.0;
|
30
|
+
// for (int col = 0; col < 8; col++) {
|
31
|
+
// const uint8_t bit = (byte >> col) & 1;
|
32
|
+
// if (bit == 0) { // note the inverted logic
|
33
|
+
// sum += INVERSE_POWERS_OF_2[col + 1]; // note the "+1"
|
34
|
+
// }
|
35
|
+
// }
|
36
|
+
// kxp_byte_table[byte] = sum;
|
37
|
+
//
|
38
|
+
// printf("%.17g", kxp_byte_table[byte]);
|
39
|
+
// if (byte != 255) printf(", ");
|
40
|
+
// if ((byte + 1) % 8 == 0) printf("\n");
|
41
|
+
// }
|
42
|
+
//}
|
43
|
+
|
44
|
+
// Entry [b] is the sum of 2^-(col + 1) over every bit position col (0..7) that is ZERO in b.
// Each row below holds eight consecutive byte values.
static const double KXP_BYTE_TABLE[256] = {
  0.99609375, 0.49609375, 0.74609375, 0.24609375, 0.87109375, 0.37109375, 0.62109375, 0.12109375, //   0..7
  0.93359375, 0.43359375, 0.68359375, 0.18359375, 0.80859375, 0.30859375, 0.55859375, 0.05859375, //   8..15
  0.96484375, 0.46484375, 0.71484375, 0.21484375, 0.83984375, 0.33984375, 0.58984375, 0.08984375, //  16..23
  0.90234375, 0.40234375, 0.65234375, 0.15234375, 0.77734375, 0.27734375, 0.52734375, 0.02734375, //  24..31
  0.98046875, 0.48046875, 0.73046875, 0.23046875, 0.85546875, 0.35546875, 0.60546875, 0.10546875, //  32..39
  0.91796875, 0.41796875, 0.66796875, 0.16796875, 0.79296875, 0.29296875, 0.54296875, 0.04296875, //  40..47
  0.94921875, 0.44921875, 0.69921875, 0.19921875, 0.82421875, 0.32421875, 0.57421875, 0.07421875, //  48..55
  0.88671875, 0.38671875, 0.63671875, 0.13671875, 0.76171875, 0.26171875, 0.51171875, 0.01171875, //  56..63
  0.98828125, 0.48828125, 0.73828125, 0.23828125, 0.86328125, 0.36328125, 0.61328125, 0.11328125, //  64..71
  0.92578125, 0.42578125, 0.67578125, 0.17578125, 0.80078125, 0.30078125, 0.55078125, 0.05078125, //  72..79
  0.95703125, 0.45703125, 0.70703125, 0.20703125, 0.83203125, 0.33203125, 0.58203125, 0.08203125, //  80..87
  0.89453125, 0.39453125, 0.64453125, 0.14453125, 0.76953125, 0.26953125, 0.51953125, 0.01953125, //  88..95
  0.97265625, 0.47265625, 0.72265625, 0.22265625, 0.84765625, 0.34765625, 0.59765625, 0.09765625, //  96..103
  0.91015625, 0.41015625, 0.66015625, 0.16015625, 0.78515625, 0.28515625, 0.53515625, 0.03515625, // 104..111
  0.94140625, 0.44140625, 0.69140625, 0.19140625, 0.81640625, 0.31640625, 0.56640625, 0.06640625, // 112..119
  0.87890625, 0.37890625, 0.62890625, 0.12890625, 0.75390625, 0.25390625, 0.50390625, 0.00390625, // 120..127
  0.9921875, 0.4921875, 0.7421875, 0.2421875, 0.8671875, 0.3671875, 0.6171875, 0.1171875,         // 128..135
  0.9296875, 0.4296875, 0.6796875, 0.1796875, 0.8046875, 0.3046875, 0.5546875, 0.0546875,         // 136..143
  0.9609375, 0.4609375, 0.7109375, 0.2109375, 0.8359375, 0.3359375, 0.5859375, 0.0859375,         // 144..151
  0.8984375, 0.3984375, 0.6484375, 0.1484375, 0.7734375, 0.2734375, 0.5234375, 0.0234375,         // 152..159
  0.9765625, 0.4765625, 0.7265625, 0.2265625, 0.8515625, 0.3515625, 0.6015625, 0.1015625,         // 160..167
  0.9140625, 0.4140625, 0.6640625, 0.1640625, 0.7890625, 0.2890625, 0.5390625, 0.0390625,         // 168..175
  0.9453125, 0.4453125, 0.6953125, 0.1953125, 0.8203125, 0.3203125, 0.5703125, 0.0703125,         // 176..183
  0.8828125, 0.3828125, 0.6328125, 0.1328125, 0.7578125, 0.2578125, 0.5078125, 0.0078125,         // 184..191
  0.984375, 0.484375, 0.734375, 0.234375, 0.859375, 0.359375, 0.609375, 0.109375,                 // 192..199
  0.921875, 0.421875, 0.671875, 0.171875, 0.796875, 0.296875, 0.546875, 0.046875,                 // 200..207
  0.953125, 0.453125, 0.703125, 0.203125, 0.828125, 0.328125, 0.578125, 0.078125,                 // 208..215
  0.890625, 0.390625, 0.640625, 0.140625, 0.765625, 0.265625, 0.515625, 0.015625,                 // 216..223
  0.96875, 0.46875, 0.71875, 0.21875, 0.84375, 0.34375, 0.59375, 0.09375,                         // 224..231
  0.90625, 0.40625, 0.65625, 0.15625, 0.78125, 0.28125, 0.53125, 0.03125,                         // 232..239
  0.9375, 0.4375, 0.6875, 0.1875, 0.8125, 0.3125, 0.5625, 0.0625,                                 // 240..247
  0.875, 0.375, 0.625, 0.125, 0.75, 0.25, 0.5, 0.0                                                // 248..255
};
|
78
|
+
|
79
|
+
} /* namespace datasketches */
|
80
|
+
|
81
|
+
#endif
|
@@ -0,0 +1,84 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
// author Kevin Lang, Oath Research
|
21
|
+
|
22
|
+
#ifndef U32_TABLE_HPP_
|
23
|
+
#define U32_TABLE_HPP_
|
24
|
+
|
25
|
+
// This is a highly specialized hash table that was designed
|
26
|
+
// to be a part of the library's CPC sketch implementation
|
27
|
+
|
28
|
+
#include "cpc_common.hpp"
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
// Load-factor bounds of u32_table, expressed as integer fractions.
// The table grows when num_items / size exceeds UPSIZE_NUMER / UPSIZE_DENOM (3/4) ...
static const uint64_t U32_TABLE_UPSIZE_NUMER = 3ULL;
static const uint64_t U32_TABLE_UPSIZE_DENOM = 4ULL;

// ... and shrinks when num_items / size drops below DOWNSIZE_NUMER / DOWNSIZE_DENOM (1/4).
static const uint64_t U32_TABLE_DOWNSIZE_NUMER = 1ULL;
static const uint64_t U32_TABLE_DOWNSIZE_DENOM = 4ULL;
|
37
|
+
|
38
|
+
template<typename A>
|
39
|
+
class u32_table {
|
40
|
+
public:
|
41
|
+
|
42
|
+
u32_table();
|
43
|
+
u32_table(uint8_t lg_size, uint8_t num_valid_bits);
|
44
|
+
|
45
|
+
inline size_t get_num_items() const;
|
46
|
+
inline const uint32_t* get_slots() const;
|
47
|
+
inline uint8_t get_lg_size() const;
|
48
|
+
inline void clear();
|
49
|
+
|
50
|
+
// returns true iff the item was new and was therefore added to the table
|
51
|
+
inline bool maybe_insert(uint32_t item);
|
52
|
+
// returns true iff the item was present and was therefore removed from the table
|
53
|
+
inline bool maybe_delete(uint32_t item);
|
54
|
+
|
55
|
+
static u32_table make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k);
|
56
|
+
|
57
|
+
vector_u32<A> unwrapping_get_items() const;
|
58
|
+
|
59
|
+
static void merge(
|
60
|
+
const uint32_t* arr_a, size_t start_a, size_t length_a, // input
|
61
|
+
const uint32_t* arr_b, size_t start_b, size_t length_b, // input
|
62
|
+
uint32_t* arr_c, size_t start_c // output
|
63
|
+
);
|
64
|
+
|
65
|
+
static void introspective_insertion_sort(uint32_t* a, size_t l, size_t r);
|
66
|
+
static void knuth_shell_sort3(uint32_t* a, size_t l, size_t r);
|
67
|
+
|
68
|
+
private:
|
69
|
+
|
70
|
+
uint8_t lg_size; // log2 of number of slots
|
71
|
+
uint8_t num_valid_bits;
|
72
|
+
size_t num_items;
|
73
|
+
vector_u32<A> slots;
|
74
|
+
|
75
|
+
inline size_t lookup(uint32_t item) const;
|
76
|
+
inline void must_insert(uint32_t item);
|
77
|
+
inline void rebuild(uint8_t new_lg_size);
|
78
|
+
};
|
79
|
+
|
80
|
+
} /* namespace datasketches */
|
81
|
+
|
82
|
+
#include "u32_table_impl.hpp"
|
83
|
+
|
84
|
+
#endif
|
@@ -0,0 +1,266 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
// author Kevin Lang, Oath Research
|
21
|
+
|
22
|
+
#ifndef U32_TABLE_IMPL_HPP_
|
23
|
+
#define U32_TABLE_IMPL_HPP_
|
24
|
+
|
25
|
+
#include <stdexcept>
|
26
|
+
#include <algorithm>
|
27
|
+
#include <climits>
|
28
|
+
|
29
|
+
namespace datasketches {
|
30
|
+
|
31
|
+
template<typename A>
|
32
|
+
u32_table<A>::u32_table():
|
33
|
+
lg_size(0),
|
34
|
+
num_valid_bits(0),
|
35
|
+
num_items(0),
|
36
|
+
slots()
|
37
|
+
{}
|
38
|
+
|
39
|
+
template<typename A>
|
40
|
+
u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits):
|
41
|
+
lg_size(lg_size),
|
42
|
+
num_valid_bits(num_valid_bits),
|
43
|
+
num_items(0),
|
44
|
+
slots(1 << lg_size, UINT32_MAX)
|
45
|
+
{
|
46
|
+
if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
|
47
|
+
if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
|
48
|
+
}
|
49
|
+
|
50
|
+
template<typename A>
|
51
|
+
size_t u32_table<A>::get_num_items() const {
|
52
|
+
return num_items;
|
53
|
+
}
|
54
|
+
|
55
|
+
template<typename A>
|
56
|
+
const uint32_t* u32_table<A>::get_slots() const {
|
57
|
+
return slots.data();
|
58
|
+
}
|
59
|
+
|
60
|
+
template<typename A>
|
61
|
+
uint8_t u32_table<A>::get_lg_size() const {
|
62
|
+
return lg_size;
|
63
|
+
}
|
64
|
+
|
65
|
+
template<typename A>
|
66
|
+
void u32_table<A>::clear() {
|
67
|
+
std::fill(slots.begin(), slots.end(), UINT32_MAX);
|
68
|
+
num_items = 0;
|
69
|
+
}
|
70
|
+
|
71
|
+
template<typename A>
|
72
|
+
bool u32_table<A>::maybe_insert(uint32_t item) {
|
73
|
+
const size_t index = lookup(item);
|
74
|
+
if (slots[index] == item) return false;
|
75
|
+
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
76
|
+
slots[index] = item;
|
77
|
+
num_items++;
|
78
|
+
if (U32_TABLE_UPSIZE_DENOM * num_items > U32_TABLE_UPSIZE_NUMER * (1 << lg_size)) {
|
79
|
+
rebuild(lg_size + 1);
|
80
|
+
}
|
81
|
+
return true;
|
82
|
+
}
|
83
|
+
|
84
|
+
template<typename A>
|
85
|
+
bool u32_table<A>::maybe_delete(uint32_t item) {
|
86
|
+
const size_t index = lookup(item);
|
87
|
+
if (slots[index] == UINT32_MAX) return false;
|
88
|
+
if (slots[index] != item) throw std::logic_error("item does not exist");
|
89
|
+
if (num_items == 0) throw std::logic_error("delete error");
|
90
|
+
// delete the item
|
91
|
+
slots[index] = UINT32_MAX;
|
92
|
+
num_items--;
|
93
|
+
|
94
|
+
// re-insert all items between the freed slot and the next empty slot
|
95
|
+
const size_t mask = (1 << lg_size) - 1;
|
96
|
+
size_t probe = (index + 1) & mask;
|
97
|
+
uint32_t fetched = slots[probe];
|
98
|
+
while (fetched != UINT32_MAX) {
|
99
|
+
slots[probe] = UINT32_MAX;
|
100
|
+
must_insert(fetched);
|
101
|
+
probe = (probe + 1) & mask;
|
102
|
+
fetched = slots[probe];
|
103
|
+
}
|
104
|
+
// shrink if necessary
|
105
|
+
if (U32_TABLE_DOWNSIZE_DENOM * num_items < U32_TABLE_DOWNSIZE_NUMER * (1 << lg_size) && lg_size > 2) {
|
106
|
+
rebuild(lg_size - 1);
|
107
|
+
}
|
108
|
+
return true;
|
109
|
+
}
|
110
|
+
|
111
|
+
// this one is specifically tailored to be a part of fm85 decompression scheme
|
112
|
+
template<typename A>
|
113
|
+
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k) {
|
114
|
+
uint8_t lg_num_slots = 2;
|
115
|
+
while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
|
116
|
+
u32_table<A> table(lg_num_slots, 6 + lg_k);
|
117
|
+
// Note: there is a possible "snowplow effect" here because the caller is passing in a sorted pairs array
|
118
|
+
// However, we are starting out with the correct final table size, so the problem might not occur
|
119
|
+
for (size_t i = 0; i < num_pairs; i++) {
|
120
|
+
table.must_insert(pairs[i]);
|
121
|
+
}
|
122
|
+
table.num_items = num_pairs;
|
123
|
+
return table;
|
124
|
+
}
|
125
|
+
|
126
|
+
template<typename A>
|
127
|
+
size_t u32_table<A>::lookup(uint32_t item) const {
|
128
|
+
const size_t size = 1 << lg_size;
|
129
|
+
const size_t mask = size - 1;
|
130
|
+
const uint8_t shift = num_valid_bits - lg_size;
|
131
|
+
size_t probe = item >> shift;
|
132
|
+
if (probe > mask) throw std::logic_error("probe out of range");
|
133
|
+
while (slots[probe] != item && slots[probe] != UINT32_MAX) {
|
134
|
+
probe = (probe + 1) & mask;
|
135
|
+
}
|
136
|
+
return probe;
|
137
|
+
}
|
138
|
+
|
139
|
+
// counts and resizing must be handled by the caller
|
140
|
+
template<typename A>
|
141
|
+
void u32_table<A>::must_insert(uint32_t item) {
|
142
|
+
const size_t index = lookup(item);
|
143
|
+
if (slots[index] == item) throw std::logic_error("item exists");
|
144
|
+
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
145
|
+
slots[index] = item;
|
146
|
+
}
|
147
|
+
|
148
|
+
template<typename A>
|
149
|
+
void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
150
|
+
if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
|
151
|
+
const size_t old_size = 1 << lg_size;
|
152
|
+
const size_t new_size = 1 << new_lg_size;
|
153
|
+
if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
|
154
|
+
vector_u32<A> old_slots = std::move(slots);
|
155
|
+
slots = vector_u32<A>(new_size, UINT32_MAX);
|
156
|
+
lg_size = new_lg_size;
|
157
|
+
for (size_t i = 0; i < old_size; i++) {
|
158
|
+
if (old_slots[i] != UINT32_MAX) {
|
159
|
+
must_insert(old_slots[i]);
|
160
|
+
}
|
161
|
+
}
|
162
|
+
}
|
163
|
+
|
164
|
+
// While extracting the items from a linear probing hashtable,
|
165
|
+
// this will usually undo the wrap-around provided that the table
|
166
|
+
// isn't too full. Experiments suggest that for sufficiently large tables
|
167
|
+
// the load factor would have to be over 90 percent before this would fail frequently,
|
168
|
+
// and even then the subsequent sort would fix things up.
|
169
|
+
// The result is nearly sorted, so make sure to use an efficient sort for that case
|
170
|
+
template<typename A>
|
171
|
+
vector_u32<A> u32_table<A>::unwrapping_get_items() const {
|
172
|
+
if (num_items == 0) return vector_u32<A>();
|
173
|
+
const size_t table_size = 1 << lg_size;
|
174
|
+
vector_u32<A> result(num_items);
|
175
|
+
size_t i = 0;
|
176
|
+
size_t l = 0;
|
177
|
+
size_t r = num_items - 1;
|
178
|
+
|
179
|
+
// special rules for the region before the first empty slot
|
180
|
+
uint32_t hi_bit = 1 << (num_valid_bits - 1);
|
181
|
+
while (i < table_size && slots[i] != UINT32_MAX) {
|
182
|
+
const uint32_t item = slots[i++];
|
183
|
+
if (item & hi_bit) { result[r--] = item; } // this item was probably wrapped, so move to end
|
184
|
+
else { result[l++] = item; }
|
185
|
+
}
|
186
|
+
|
187
|
+
// the rest of the table is processed normally
|
188
|
+
while (i < table_size) {
|
189
|
+
const uint32_t item = slots[i++];
|
190
|
+
if (item != UINT32_MAX) result[l++] = item;
|
191
|
+
}
|
192
|
+
if (l != r + 1) throw std::logic_error("unwrapping error");
|
193
|
+
return result;
|
194
|
+
}
|
195
|
+
|
196
|
+
// This merge is safe to use in carefully designed overlapping scenarios.
|
197
|
+
template<typename A>
|
198
|
+
void u32_table<A>::merge(
|
199
|
+
const uint32_t* arr_a, size_t start_a, size_t length_a, // input
|
200
|
+
const uint32_t* arr_b, size_t start_b, size_t length_b, // input
|
201
|
+
uint32_t* arr_c, size_t start_c // output
|
202
|
+
) {
|
203
|
+
const size_t length_c = length_a + length_b;
|
204
|
+
const size_t lim_a = start_a + length_a;
|
205
|
+
const size_t lim_b = start_b + length_b;
|
206
|
+
const size_t lim_c = start_c + length_c;
|
207
|
+
size_t a = start_a;
|
208
|
+
size_t b = start_b;
|
209
|
+
size_t c = start_c;
|
210
|
+
for ( ; c < lim_c ; c++) {
|
211
|
+
if (b >= lim_b) { arr_c[c] = arr_a[a++]; }
|
212
|
+
else if (a >= lim_a) { arr_c[c] = arr_b[b++]; }
|
213
|
+
else if (arr_a[a] < arr_b[b]) { arr_c[c] = arr_a[a++]; }
|
214
|
+
else { arr_c[c] = arr_b[b++]; }
|
215
|
+
}
|
216
|
+
if (a != lim_a || b != lim_b) throw std::logic_error("merging error");
|
217
|
+
}
|
218
|
+
|
219
|
+
// In applications where the input array is already nearly sorted,
|
220
|
+
// insertion sort runs in linear time with a very small constant.
|
221
|
+
// This introspective version of insertion sort protects against
|
222
|
+
// the quadratic cost of sorting bad input arrays.
|
223
|
+
// It keeps track of how much work has been done, and if that exceeds a
|
224
|
+
// constant times the array length, it switches to a different sorting algorithm.
|
225
|
+
|
226
|
+
template<typename A>
|
227
|
+
void u32_table<A>::introspective_insertion_sort(uint32_t* a, size_t l, size_t r) { // r points past the rightmost element
|
228
|
+
const size_t length = r - l;
|
229
|
+
const size_t cost_limit = 8 * length;
|
230
|
+
size_t cost = 0;
|
231
|
+
for (size_t i = l + 1; i < r; i++) {
|
232
|
+
size_t j = i;
|
233
|
+
uint32_t v = a[i];
|
234
|
+
while (j >= l + 1 && v < a[j - 1]) {
|
235
|
+
a[j] = a[j - 1];
|
236
|
+
j--;
|
237
|
+
}
|
238
|
+
a[j] = v;
|
239
|
+
cost += i - j; // distance moved is a measure of work
|
240
|
+
if (cost > cost_limit) {
|
241
|
+
knuth_shell_sort3(a, l, r);
|
242
|
+
return;
|
243
|
+
}
|
244
|
+
}
|
245
|
+
}
|
246
|
+
|
247
|
+
template<typename A>
|
248
|
+
void u32_table<A>::knuth_shell_sort3(uint32_t* a, size_t l, size_t r) {
|
249
|
+
size_t h;
|
250
|
+
for (h = 1; h < (r - l) / 9; h = 3 * h + 1);
|
251
|
+
for ( ; h > 0; h /= 3) {
|
252
|
+
for (size_t i = l + h; i < r; i++) {
|
253
|
+
size_t j = i;
|
254
|
+
const uint32_t v = a[i];
|
255
|
+
while (j >= l + h && v < a[j - h]) {
|
256
|
+
a[j] = a[j - h];
|
257
|
+
j -= h;
|
258
|
+
}
|
259
|
+
a[j] = v;
|
260
|
+
}
|
261
|
+
}
|
262
|
+
}
|
263
|
+
|
264
|
+
} /* namespace datasketches */
|
265
|
+
|
266
|
+
#endif
|