datasketches 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
@@ -0,0 +1,274 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
// author Kevin Lang, Oath Research
|
21
|
+
|
22
|
+
#ifndef ICON_ESTIMATOR_HPP_
|
23
|
+
#define ICON_ESTIMATOR_HPP_
|
24
|
+
|
25
|
+
#include <cmath>
|
26
|
+
#include <cstdint>
|
27
|
+
#include <stdexcept>
|
28
|
+
|
29
|
+
namespace datasketches {
|
30
|
+
|
31
|
+
// The ICON estimator for FM85 sketches is defined by the arXiv paper.
|
32
|
+
|
33
|
+
// The current file provides exact and approximate implementations of this estimator.
|
34
|
+
|
35
|
+
// The exact version works for any value of K, but is quite slow.
|
36
|
+
|
37
|
+
// The much faster approximate version works for K values that are powers of two
|
38
|
+
// ranging from 2^4 to 2^26 (see ICON_MAX_LOG_K); table rows for larger K are compiled only under LARGER_K_VALUES.
|
39
|
+
|
40
|
+
// At a high level, this approximation can be described as using an
|
41
|
+
// exponential approximation when C > K * (5.6 or 5.7), while smaller
|
42
|
+
// values of C are handled by a degree-19 polynomial approximation of
|
43
|
+
// a pre-conditioned version of the true ICON mapping from C to N_hat.
|
44
|
+
|
45
|
+
// This file also provides a validation procedure that compares its approximate
|
46
|
+
// and exact implementations of the FM85 ICON estimator.
|
47
|
+
|
48
|
+
// Smallest log2(K) that has a row in ICON_POLYNOMIAL_COEFFICIENTS (first row is "log K = 4").
static const int ICON_MIN_LOG_K = 4;
|
49
|
+
// Largest log2(K) whose row is compiled unconditionally (last row before the LARGER_K_VALUES guard).
// NOTE(review): rows for log K >= 27 sit behind #ifdef LARGER_K_VALUES, yet the table size below is
// derived from this constant; defining that macro without raising this value would supply more
// initializers than ICON_TABLE_SIZE allows - confirm before enabling it.
static const int ICON_MAX_LOG_K = 26;
|
50
|
+
// Degree of the polynomial approximation used for the smaller values of C (see the file header comment).
static const int ICON_POLYNOMIAL_DEGREE = 19;
|
51
|
+
// Coefficients stored per log K row: one per power from 0 through ICON_POLYNOMIAL_DEGREE, i.e. 20.
static const int ICON_POLYNOMIAL_NUM_COEFFICIENTS = 1 + ICON_POLYNOMIAL_DEGREE;
|
52
|
+
// Total number of doubles in the coefficient table: 20 coefficients for each of the
// (ICON_MAX_LOG_K - ICON_MIN_LOG_K + 1) = 23 supported log K values, i.e. 460 entries.
static const int ICON_TABLE_SIZE = ICON_POLYNOMIAL_NUM_COEFFICIENTS * (1 + (ICON_MAX_LOG_K - ICON_MIN_LOG_K));
|
53
|
+
|
54
|
+
static const double ICON_POLYNOMIAL_COEFFICIENTS[ICON_TABLE_SIZE] = {
|
55
|
+
|
56
|
+
// log K = 4
|
57
|
+
0.9895027971889700513, 0.3319496644645180128, 0.1242818722715769986, -0.03324149686026930256, -0.2985637298081619817,
|
58
|
+
1.366555923595830002, -4.705499366260569971, 11.61506432505530029, -21.11254986175579873, 28.89421695078809904,
|
59
|
+
-30.1383659011730991, 24.11946778830730054, -14.83391445199539938, 6.983088767267210173, -2.48964120264876998,
|
60
|
+
0.6593243603602499947, -0.125493534558034997, 0.01620971672896159843, -0.001271267679036929953, 4.567178653294529745e-05,
|
61
|
+
|
62
|
+
// log K = 5
|
63
|
+
0.9947713741300230339, 0.3326559581620939787, 0.1250050661634889981, -0.04130073804472530336, -0.2584095537451129854,
|
64
|
+
1.218050389433120051, -4.319106696095399656, 10.87175052045090062, -20.0184979022142997, 27.63210188163320069,
|
65
|
+
-28.97950009664030091, 23.26740804691930009, -14.33375703270860058, 6.751281271241110105, -2.406363094133439962,
|
66
|
+
0.6367414734718820357, -0.1210468076141379967, 0.01561196698118279963, -0.001222335432128580056, 4.383502970318410206e-05,
|
67
|
+
|
68
|
+
// log K = 6
|
69
|
+
0.9973904854982870161, 0.3330148852217920119, 0.125251536589509993, -0.04434075124043219962, -0.2436238890691720116,
|
70
|
+
1.163293254754570016, -4.177758779777369647, 10.60301981340099964, -19.6274507428828997, 27.18420839597660077,
|
71
|
+
-28.56827214174580121, 22.96268674086600114, -14.15234202220280046, 6.665700662642549901, -2.375043356720739851,
|
72
|
+
0.6280993991240929608, -0.119319019358031006, 0.01537674055733759954, -0.001202881695730769916, 4.309894633186929849e-05,
|
73
|
+
|
74
|
+
// log K = 7
|
75
|
+
0.9986963310058679655, 0.3331956705633329907, 0.125337696770523005, -0.04546817338088020299, -0.2386752211125199863,
|
76
|
+
1.145927328111949972, -4.135694445582720036, 10.52805060502839929, -19.52408322548339825, 27.06921653903929936,
|
77
|
+
-28.46207532143190022, 22.88083524357429965, -14.10057147392659971, 6.63958754983273991, -2.364865219283200037,
|
78
|
+
0.6251341806425250169, -0.1186991327450530043, 0.0152892726403408008, -0.001195439764873199896, 4.281098416794090072e-05,
|
79
|
+
|
80
|
+
// log K = 8
|
81
|
+
0.999348600452531044, 0.3332480372393080148, 0.126666900963325002, -0.06495714694254159371, -0.08376282050638980681,
|
82
|
+
0.3760158094643630267, -1.568204791601850001, 4.483117719555970382, -9.119180124379150598, 13.65799293358900002,
|
83
|
+
-15.3100211234349004, 12.97546344654869976, -8.351661538536939489, 4.075022612435580172, -1.49387015887069996,
|
84
|
+
0.4040976870253379927, -0.07813232681879349328, 0.01020545649538820085, -0.0008063279210812720381, 2.909334976414100078e-05,
|
85
|
+
|
86
|
+
// log K = 9
|
87
|
+
0.9996743787297059924, 0.3332925779481850093, 0.1267124599259649986, -0.06550452970936600228, -0.08191738117533520214,
|
88
|
+
0.3773034458363569987, -1.604679509609959975, 4.636761898691969641, -9.487348609558699408, 14.25164235443030059,
|
89
|
+
-15.99674955529870068, 13.56353219046370029, -8.730194904342459594, 4.259010067932120336, -1.56106689792022002,
|
90
|
+
0.4222540912786589828, -0.08165296504921559784, 0.01066878484925220041, -0.0008433887618256910015, 3.045339724886519912e-05,
|
91
|
+
|
92
|
+
// log K = 10
|
93
|
+
0.999837191783945034, 0.3333142252339619804, 0.1267759538087240012, -0.06631005632753710077, -0.07692759158286699428,
|
94
|
+
0.3568943956395980166, -1.546598721379510044, 4.51595019978557044, -9.298431968763770428, 14.02586858080080034,
|
95
|
+
-15.78858959520439953, 13.41484931677589998, -8.647958125130809748, 4.22398017468472009, -1.549708891200570093,
|
96
|
+
0.419507410264540026, -0.08117411611046250475, 0.01061202286184199928, -0.000839300527596772007, 3.03185874520205985e-05,
|
97
|
+
|
98
|
+
// log K = 11
|
99
|
+
0.9999186020796150265, 0.3333249054574359826, 0.126791713589799987, -0.06662487271699729652, -0.07335552427910230211,
|
100
|
+
0.3316370184815959909, -1.434143797561290068, 4.180260309967409604, -8.593906870708760692, 12.95088874800289958,
|
101
|
+
-14.56876092520539956, 12.37074367531410068, -7.969152075707960137, 3.888774396648960074, -1.424923326506990051,
|
102
|
+
0.385084561785229984, -0.07435541911616409816, 0.009695363567476529554, -0.0007644375960047160388, 2.75156194717188011e-05,
|
103
|
+
|
104
|
+
// log K = 12
|
105
|
+
0.9999592955649559967, 0.3333310560725140093, 0.1267379744020450116, -0.06524495415766619344, -0.08854031542298740343,
|
106
|
+
0.4244320628874230228, -1.794077789033230008, 5.133875262768450298, -10.40149374917120007, 15.47808115629240078,
|
107
|
+
-17.2272296137545986, 14.5002173676463002, -9.274819801602760094, 4.500782540026570189, -1.642359389030050076,
|
108
|
+
0.442596113445525019, -0.0853226219238850947, 0.01111969379054169975, -0.0008771614088006969611, 3.161668519459719752e-05,
|
109
|
+
|
110
|
+
// log K = 13
|
111
|
+
0.9999796468102559732, 0.3333336602394039727, 0.126728089053198989, -0.06503798598282370391, -0.09050261023823169548,
|
112
|
+
0.4350609244189960201, -1.831274835815670077, 5.223387516985289913, -10.55574395269979959, 15.67359470222429962,
|
113
|
+
-17.41263416341029924, 14.63297400889229927, -9.346752431221359458, 4.530124905188380069, -1.651245566462089975,
|
114
|
+
0.444542549250713015, -0.08561720963336499901, 0.01114805146185449992, -0.0008786251203363140043, 3.16416341644572998e-05,
|
115
|
+
|
116
|
+
// log K = 14
|
117
|
+
0.9999898187060970445, 0.3333362579300819806, 0.1266984078369459976, -0.06464561179765909715, -0.09343280886228019777,
|
118
|
+
0.4490702549264070087, -1.878087608052450008, 5.338004322057390283, -10.76690603590630069, 15.97069195083200022,
|
119
|
+
-17.73440379943459888, 14.90212518309260048, -9.520506013770420495, 4.616238931978830173, -1.68364817877918993,
|
120
|
+
0.4536194960681350086, -0.087448605434800597, 0.01139929991331390009, -0.0008995891451622229631, 3.244407259782900338e-05,
|
121
|
+
|
122
|
+
// log K = 15
|
123
|
+
0.9999949072549390028, 0.3333376334705290267, 0.126665364358402005, -0.06411790034705669439, -0.09776009134670660128,
|
124
|
+
0.4704691112248470253, -1.948021675295769972, 5.497760972696490001, -11.03165645315390009, 16.29703330781000048,
|
125
|
+
-18.03851029448010124, 15.11836776139680083, -9.638205179917429533, 4.665122328753120051, -1.698980686525759953,
|
126
|
+
0.4571799506245269873, -0.08804011353783609828, 0.01146553155965330043, -0.0009040455800659569869, 3.257931866957050274e-05,
|
127
|
+
|
128
|
+
// log K = 16
|
129
|
+
0.9999974544793589493, 0.3333381337614599871, 0.1266524862971120102, -0.06391676499117690535, -0.09929616211306059592,
|
130
|
+
0.4771390820378790254, -1.965762451227349938, 5.526802350376460282, -11.05703067024660058, 16.29535848023060041,
|
131
|
+
-18.00114005075790047, 15.06214012231560062, -9.58874727382628933, 4.63537541652793017, -1.686222848555620102,
|
132
|
+
0.4532602373715179933, -0.08719448925964939923, 0.01134365425717459921, -0.0008934965241274289835, 3.216436244471380105e-05,
|
133
|
+
|
134
|
+
// log K = 17
|
135
|
+
0.9999987278278800185, 0.3333383411464330148, 0.126642761751724009, -0.06371042959073920653, -0.1013564516034080043,
|
136
|
+
0.4891311195679299839, -2.010971712051409899, 5.644390807952309963, -11.27697253921500042, 16.59957157207080058,
|
137
|
+
-18.31808338317799922, 15.31363518393730061, -9.741451446816620674, 4.706207545519429658, -1.711102469010010063,
|
138
|
+
0.4597587341089349744, -0.08841670767182820134, 0.01149999225097850068, -0.0009056651366963050422, 3.259910736274500059e-05,
|
139
|
+
|
140
|
+
// log K = 18
|
141
|
+
0.9999993637727100371, 0.3333385511608860097, 0.1266341580529160016, -0.06353272828164230335, -0.103139962850642003,
|
142
|
+
0.4996216017206500104, -2.05099128585287982, 5.749874086531799655, -11.47727638570349917, 16.88141587810320132,
|
143
|
+
-18.61744656177490143, 15.55634230427719977, -9.892350736128680211, 4.778033520984200422, -1.737045483861280104,
|
144
|
+
0.4667410882683730167, -0.08977256212421590165, 0.01167940146667079994, -0.0009201381242396030127, 3.313600701586759867e-05,
|
145
|
+
|
146
|
+
// log K = 19
|
147
|
+
0.9999996805376010212, 0.3333372324328989778, 0.1267104737214659882, -0.06504749929326139601, -0.0882341962464350954,
|
148
|
+
0.4131871162041140244, -1.725190703567099915, 4.900817515593920426, -9.883452720776510603, 14.6657081190816001,
|
149
|
+
-16.29398295135089825, 13.69805011761319946, -8.753475239465899449, 4.244072374564439976, -1.547202527706629915,
|
150
|
+
0.4164770109614310267, -0.08017596922092029565, 0.01043146101701039954, -0.00082124200571200305, 2.953319493719429935e-05,
|
151
|
+
|
152
|
+
// log K = 20
|
153
|
+
0.9999998390037539986, 0.3333365859956040067, 0.1267460211029839967, -0.06569456024647769843, -0.0823070353477164951,
|
154
|
+
0.3810826463303410017, -1.611983580241109992, 4.624520077758210057, -9.397308335633589138, 14.03184981378050011,
|
155
|
+
-15.6703191315401007, 13.22992718704790072, -8.484216393184780713, 4.125607133488029987, -1.507690650697159906,
|
156
|
+
0.4066678517577320129, -0.07842110121777939868, 0.01021780862225150042, -0.0008054065857047439754, 2.899431830426989844e-05,
|
157
|
+
|
158
|
+
// log K = 21
|
159
|
+
0.9999999207001479817, 0.3333384953015239849, 0.1266331480396669928, -0.06345750166298599892, -0.1042341210992499961,
|
160
|
+
0.5077112908497130039, -2.087398133609810191, 5.858842546192500222, -11.70620319777190055, 17.23103975433669888,
|
161
|
+
-19.01462552846669851, 15.89674059836560005, -10.11395134034419918, 4.88760796465891989, -1.777886770904629987,
|
162
|
+
0.4780200178339499839, -0.09200895321782050218, 0.01198029553244219989, -0.0009447283875782100165, 3.405716775824710232e-05,
|
163
|
+
|
164
|
+
// log K = 22
|
165
|
+
0.9999999606908690497, 0.3333383929524300071, 0.1266456445096819927, -0.06373504294081690225, -0.1012834291081849969,
|
166
|
+
0.4893810690172959998, -2.01391428223606983, 5.656430437473649597, -11.3067201537791, 16.64980594135310099,
|
167
|
+
-18.3792355790383013, 15.36879753115040081, -9.778831246425049528, 4.725308061988969577, -1.718423596500280093,
|
168
|
+
0.4618308177809870019, -0.08883675060799739454, 0.01155766944804260087, -0.0009104695617243750358, 3.278237729674439666e-05,
|
169
|
+
|
170
|
+
// log K = 23
|
171
|
+
0.9999999794683379628, 0.3333386441751680085, 0.1266463995182049995, -0.06376031920455070556, -0.1010799540803130059,
|
172
|
+
0.488540137426137, -2.012048323537570127, 5.654949475342659682, -11.31023240892979942, 16.66334675284959843,
|
173
|
+
-18.40241452866079896, 15.39443572867130072, -9.798844412838670692, 4.736683907539640082, -1.723168363744929987,
|
174
|
+
0.463270349018644001, -0.08914619066708899531, 0.01160235936257320022, -0.0009143600818183229709, 3.293669304679140117e-05,
|
175
|
+
|
176
|
+
// log K = 24
|
177
|
+
0.9999999911469820146, 0.3333376076934529975, 0.1266944349940530012, -0.06470524278387919381, -0.09189342220283110152,
|
178
|
+
0.4359182372694809793, -1.815980282951169977, 5.149474056470340066, -10.37086570678100017, 15.36962686758569951,
|
179
|
+
-17.05756384717849983, 14.32755177515199918, -9.149944050025640152, 4.434601894497260055, -1.616478926806520056,
|
180
|
+
0.4351979157055039793, -0.08381768225272340223, 0.01091321820476520016, -0.0008600264403629039739, 3.09667800347144002e-05,
|
181
|
+
|
182
|
+
// log K = 25
|
183
|
+
0.9999999968592140354, 0.3333379164881000167, 0.1266782495827009913, -0.06434163088961859789, -0.09575258124988890451,
|
184
|
+
0.4597843575354370049, -1.911374431241559924, 5.411856661251520428, -10.88850084646090011, 16.12298941380269923,
|
185
|
+
-17.88172178487259956, 15.01301780636859995, -9.585542896142529301, 4.645811872761620442, -1.693952293156189892,
|
186
|
+
0.4563143308861309921, -0.08795976148455289523, 0.01146560428011200033, -0.0009048442931930629528, 3.26358391497329992e-05,
|
187
|
+
|
188
|
+
// log K = 26
|
189
|
+
0.9999999970700530483, 0.333338329556315982, 0.126644753076394001, -0.06372365346512399997, -0.1012760856945769949,
|
190
|
+
0.4886852278576360176, -2.009005418394389952, 5.638119224137019714, -11.26276715335160006, 16.57640024218650154,
|
191
|
+
-18.29035093605569884, 15.28892246224570073, -9.724916375991760731, 4.6978877652334603, -1.707974125916829955,
|
192
|
+
0.4588937864564729963, -0.08824617586088029375, 0.01147732114826570046, -0.00090384524860747295, 3.253252703695579795e-05,
|
193
|
+
|
194
|
+
#ifdef LARGER_K_VALUES
|
195
|
+
// log K = 27
|
196
|
+
1.000000000639100106, 0.3333378987508219815, 0.126670943746902992, -0.06418811974745139426, -0.0972951198506895043,
|
197
|
+
0.4687977077401049852, -1.945290489888900076, 5.499494964974400268, -11.05078190574979935, 16.3446428009706004,
|
198
|
+
-18.10936908931320133, 15.19089294103859977, -9.691829972777059155, 4.694320543263319934, -1.710719212277360013,
|
199
|
+
0.4606257962161550146, -0.08875858006645380438, 0.01156634964444109952, -0.0009125838337464230437, 3.290907977404550287e-05,
|
200
|
+
|
201
|
+
// log K = 28
|
202
|
+
0.9999999993590269476, 0.3333385660745579737, 0.1266394134278630013, -0.0636305053404186971, -0.1022354305220320031,
|
203
|
+
0.4945787360853979853, -2.032468917547570086, 5.702461924065530319, -11.38943406618639997, 16.76052144140630062,
|
204
|
+
-18.49169753114890113, 15.4564578116809006, -9.831507534599410292, 4.749667961030789698, -1.72701519749717991,
|
205
|
+
0.4640997252013580043, -0.08927103511252110213, 0.01161455495023329919, -0.000915030036039231982, 3.295110296010450275e-05,
|
206
|
+
|
207
|
+
// log K = 29
|
208
|
+
0.9999999998441060356, 0.3333383341194189886, 0.1266687338487519909, -0.06416245828383730643, -0.09764561286937140094,
|
209
|
+
0.4715274747139350242, -1.958172229464169911, 5.539587632966780362, -11.13784217611559946, 16.48149277721759987,
|
210
|
+
-18.26888916646990069, 15.33085193018819936, -9.78493991484172021, 4.741302923579859829, -1.728568959451310061,
|
211
|
+
0.4656457646521020011, -0.08977142058582450457, 0.01170492245846839995, -0.0009240931538567209464, 3.334703207098030245e-05,
|
212
|
+
|
213
|
+
// log K = 30
|
214
|
+
0.9999999992599339915, 0.3333384538468979752, 0.1266452025739940035, -0.06374775920488300052, -0.1009917742909720029,
|
215
|
+
0.4867931642504759737, -2.000981224888669807, 5.614968747087539569, -11.21527907219130071, 16.50500949673639894,
|
216
|
+
-18.21007853829650003, 15.22056128176249956, -9.680565515478869898, 4.675983737170599674, -1.69980511941418011,
|
217
|
+
0.4566332138743600111, -0.08779650251621799739, 0.01141656381272189956, -0.0008988545845624889468, 3.234448025291899689e-05,
|
218
|
+
|
219
|
+
// log K = 31
|
220
|
+
0.9999999973204000137, 0.333337762450663988, 0.1266965469104399944, -0.06475154253624139378, -0.09133098208494490333,
|
221
|
+
0.4320356889637699815, -1.799236887220760028, 5.100971076171499696, -10.27175516606700079, 15.22198757843720074,
|
222
|
+
-16.89368636262300072, 14.19016571851859965, -9.062390133299189188, 4.39220025249522994, -1.600994848692480099,
|
223
|
+
0.4310075283759189912, -0.08300339267288289746, 0.01080584419810979961, -0.0008514267355136160122, 3.065110087496039805e-05,
|
224
|
+
|
225
|
+
// log K = 32
|
226
|
+
0.9999999987706390536, 0.3333387038350890119, 0.1266354589419070031, -0.06355195838981600454, -0.102952771506954005,
|
227
|
+
0.4983589546197609854, -2.045281215270029929, 5.732181222451769642, -11.43849817800069957, 16.81961198331340057,
|
228
|
+
-18.54433120118400069, 15.49126422718470053, -9.84846998787154071, 4.755615082534379923, -1.728430514092559989,
|
229
|
+
0.4642927653670489985, -0.08927380119154580684, 0.01161055316485629964, -0.0009143724787632470305, 3.291492066818770055e-05,
|
230
|
+
|
231
|
+
#endif
|
232
|
+
};
|
233
|
+
|
234
|
+
// Evaluates a polynomial at x using Horner's rule.
//   coefficients - array that holds the polynomial's coefficients
//   start        - index of the lowest-order coefficient
//   num          - number of coefficients (terms) in the polynomial
//   x            - point at which to evaluate
// The coefficients used are coefficients[start] .. coefficients[start + num - 1],
// ordered from the constant term up to the highest power of x.
// An empty range (num <= 0) evaluates to 0.0 rather than reading the element
// that precedes the range.
static double evaluate_polynomial(const double* coefficients, int start, int num, double x) {
  if (num <= 0) return 0.0;
  const int last = start + num - 1;
  double total = coefficients[last];
  // walk from the second-highest term down to the constant term
  for (int j = last - 1; j >= start; j--) {
    total *= x;
    total += coefficients[j];
  }
  return total;
}
|
243
|
+
|
244
|
+
// Closed-form approximation: 0.7940236163830469 * k * 2^(c / k).
static double icon_exponential_approximation(double k, double c) {
  const double scaled_k = 0.7940236163830469 * k;
  const double growth = pow(2.0, c / k);
  return scaled_k * growth;
}
|
247
|
+
|
248
|
+
static double compute_icon_estimate(uint8_t lg_k, uint64_t c) {
|
249
|
+
if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
|
250
|
+
if (c < 2) return ((c == 0) ? 0.0 : 1.0);
|
251
|
+
const size_t k = 1 << lg_k;
|
252
|
+
const double double_k = k;
|
253
|
+
const double double_c = c;
|
254
|
+
// Differing thresholds ensure that the approximated estimator is monotonically increasing.
|
255
|
+
const double threshold_factor = ((lg_k < 14) ? 5.7 : 5.6);
|
256
|
+
if (double_c > (threshold_factor * double_k)) return icon_exponential_approximation(double_k, double_c);
|
257
|
+
const double factor = evaluate_polynomial(
|
258
|
+
ICON_POLYNOMIAL_COEFFICIENTS,
|
259
|
+
ICON_POLYNOMIAL_NUM_COEFFICIENTS * (lg_k - ICON_MIN_LOG_K),
|
260
|
+
ICON_POLYNOMIAL_NUM_COEFFICIENTS,
|
261
|
+
// The somewhat arbitrary constant 2.0 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS
|
262
|
+
double_c / (2.0 * double_k)
|
263
|
+
);
|
264
|
+
const double ratio = double_c / double_k;
|
265
|
+
// The somewhat arbitrary constant 66.774757 is baked into the table ICON_POLYNOMIAL_COEFFICIENTS
|
266
|
+
const double term = 1.0 + (ratio * ratio * ratio / 66.774757);
|
267
|
+
const double result = double_c * factor * term;
|
268
|
+
if (result >= double_c) return result;
|
269
|
+
else return double_c;
|
270
|
+
}
|
271
|
+
|
272
|
+
} /* namespace datasketches */
|
273
|
+
|
274
|
+
#endif
|
@@ -0,0 +1,81 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef KXP_BYTE_LOOKUP_HPP_
|
21
|
+
#define KXP_BYTE_LOOKUP_HPP_
|
22
|
+
|
23
|
+
namespace datasketches {
|
24
|
+
|
25
|
+
// the table was created by the following procedure:
|
26
|
+
|
27
|
+
//void fill_kxp_byte_table() {
|
28
|
+
// for (int byte = 0; byte < 256; byte++) {
|
29
|
+
// double sum = 0.0;
|
30
|
+
// for (int col = 0; col < 8; col++) {
|
31
|
+
// const uint8_t bit = (byte >> col) & 1;
|
32
|
+
// if (bit == 0) { // note the inverted logic
|
33
|
+
// sum += INVERSE_POWERS_OF_2[col + 1]; // note the "+1"
|
34
|
+
// }
|
35
|
+
// }
|
36
|
+
// kxp_byte_table[byte] = sum;
|
37
|
+
//
|
38
|
+
// printf("%.17g", kxp_byte_table[byte]);
|
39
|
+
// if (byte != 255) printf(", ");
|
40
|
+
// if ((byte + 1) % 8 == 0) printf("\n");
|
41
|
+
// }
|
42
|
+
//}
|
43
|
+
|
44
|
+
// Lookup table indexed by a byte value (0..255).
// Entry [byte] is the sum of 2^-(col + 1) over every bit position col (0..7)
// at which the byte has a ZERO bit, as produced by the generator procedure documented above.
// Every value is a multiple of 1/256 and therefore exactly representable as a double.
static const double KXP_BYTE_TABLE[256] = {
  // bytes 0..63
  0.99609375, 0.49609375, 0.74609375, 0.24609375, 0.87109375, 0.37109375, 0.62109375, 0.12109375,
  0.93359375, 0.43359375, 0.68359375, 0.18359375, 0.80859375, 0.30859375, 0.55859375, 0.05859375,
  0.96484375, 0.46484375, 0.71484375, 0.21484375, 0.83984375, 0.33984375, 0.58984375, 0.08984375,
  0.90234375, 0.40234375, 0.65234375, 0.15234375, 0.77734375, 0.27734375, 0.52734375, 0.02734375,
  0.98046875, 0.48046875, 0.73046875, 0.23046875, 0.85546875, 0.35546875, 0.60546875, 0.10546875,
  0.91796875, 0.41796875, 0.66796875, 0.16796875, 0.79296875, 0.29296875, 0.54296875, 0.04296875,
  0.94921875, 0.44921875, 0.69921875, 0.19921875, 0.82421875, 0.32421875, 0.57421875, 0.07421875,
  0.88671875, 0.38671875, 0.63671875, 0.13671875, 0.76171875, 0.26171875, 0.51171875, 0.01171875,
  // bytes 64..127
  0.98828125, 0.48828125, 0.73828125, 0.23828125, 0.86328125, 0.36328125, 0.61328125, 0.11328125,
  0.92578125, 0.42578125, 0.67578125, 0.17578125, 0.80078125, 0.30078125, 0.55078125, 0.05078125,
  0.95703125, 0.45703125, 0.70703125, 0.20703125, 0.83203125, 0.33203125, 0.58203125, 0.08203125,
  0.89453125, 0.39453125, 0.64453125, 0.14453125, 0.76953125, 0.26953125, 0.51953125, 0.01953125,
  0.97265625, 0.47265625, 0.72265625, 0.22265625, 0.84765625, 0.34765625, 0.59765625, 0.09765625,
  0.91015625, 0.41015625, 0.66015625, 0.16015625, 0.78515625, 0.28515625, 0.53515625, 0.03515625,
  0.94140625, 0.44140625, 0.69140625, 0.19140625, 0.81640625, 0.31640625, 0.56640625, 0.06640625,
  0.87890625, 0.37890625, 0.62890625, 0.12890625, 0.75390625, 0.25390625, 0.50390625, 0.00390625,
  // bytes 128..191
  0.9921875, 0.4921875, 0.7421875, 0.2421875, 0.8671875, 0.3671875, 0.6171875, 0.1171875,
  0.9296875, 0.4296875, 0.6796875, 0.1796875, 0.8046875, 0.3046875, 0.5546875, 0.0546875,
  0.9609375, 0.4609375, 0.7109375, 0.2109375, 0.8359375, 0.3359375, 0.5859375, 0.0859375,
  0.8984375, 0.3984375, 0.6484375, 0.1484375, 0.7734375, 0.2734375, 0.5234375, 0.0234375,
  0.9765625, 0.4765625, 0.7265625, 0.2265625, 0.8515625, 0.3515625, 0.6015625, 0.1015625,
  0.9140625, 0.4140625, 0.6640625, 0.1640625, 0.7890625, 0.2890625, 0.5390625, 0.0390625,
  0.9453125, 0.4453125, 0.6953125, 0.1953125, 0.8203125, 0.3203125, 0.5703125, 0.0703125,
  0.8828125, 0.3828125, 0.6328125, 0.1328125, 0.7578125, 0.2578125, 0.5078125, 0.0078125,
  // bytes 192..255
  0.984375, 0.484375, 0.734375, 0.234375, 0.859375, 0.359375, 0.609375, 0.109375,
  0.921875, 0.421875, 0.671875, 0.171875, 0.796875, 0.296875, 0.546875, 0.046875,
  0.953125, 0.453125, 0.703125, 0.203125, 0.828125, 0.328125, 0.578125, 0.078125,
  0.890625, 0.390625, 0.640625, 0.140625, 0.765625, 0.265625, 0.515625, 0.015625,
  0.96875, 0.46875, 0.71875, 0.21875, 0.84375, 0.34375, 0.59375, 0.09375,
  0.90625, 0.40625, 0.65625, 0.15625, 0.78125, 0.28125, 0.53125, 0.03125,
  0.9375, 0.4375, 0.6875, 0.1875, 0.8125, 0.3125, 0.5625, 0.0625,
  0.875, 0.375, 0.625, 0.125, 0.75, 0.25, 0.5, 0.0
};
|
78
|
+
|
79
|
+
} /* namespace datasketches */
|
80
|
+
|
81
|
+
#endif
|
@@ -0,0 +1,84 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
// author Kevin Lang, Oath Research
|
21
|
+
|
22
|
+
#ifndef U32_TABLE_HPP_
|
23
|
+
#define U32_TABLE_HPP_
|
24
|
+
|
25
|
+
// This is a highly specialized hash table that was designed
|
26
|
+
// to be a part of the library's CPC sketch implementation
|
27
|
+
|
28
|
+
#include "cpc_common.hpp"
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
// Load-factor thresholds for u32_table, each expressed as a fraction NUMER / DENOM
// of the number of slots.

// grow when the number of items exceeds 3/4 of the slots
static const uint64_t U32_TABLE_UPSIZE_NUMER = 3ULL;
static const uint64_t U32_TABLE_UPSIZE_DENOM = 4ULL;

// shrink when the number of items drops below 1/4 of the slots
static const uint64_t U32_TABLE_DOWNSIZE_NUMER = 1ULL;
static const uint64_t U32_TABLE_DOWNSIZE_DENOM = 4ULL;
|
37
|
+
|
38
|
+
template<typename A>
|
39
|
+
class u32_table {
|
40
|
+
public:
|
41
|
+
|
42
|
+
u32_table();
|
43
|
+
u32_table(uint8_t lg_size, uint8_t num_valid_bits);
|
44
|
+
|
45
|
+
inline size_t get_num_items() const;
|
46
|
+
inline const uint32_t* get_slots() const;
|
47
|
+
inline uint8_t get_lg_size() const;
|
48
|
+
inline void clear();
|
49
|
+
|
50
|
+
// returns true iff the item was new and was therefore added to the table
|
51
|
+
inline bool maybe_insert(uint32_t item);
|
52
|
+
// returns true iff the item was present and was therefore removed from the table
|
53
|
+
inline bool maybe_delete(uint32_t item);
|
54
|
+
|
55
|
+
static u32_table make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k);
|
56
|
+
|
57
|
+
vector_u32<A> unwrapping_get_items() const;
|
58
|
+
|
59
|
+
static void merge(
|
60
|
+
const uint32_t* arr_a, size_t start_a, size_t length_a, // input
|
61
|
+
const uint32_t* arr_b, size_t start_b, size_t length_b, // input
|
62
|
+
uint32_t* arr_c, size_t start_c // output
|
63
|
+
);
|
64
|
+
|
65
|
+
static void introspective_insertion_sort(uint32_t* a, size_t l, size_t r);
|
66
|
+
static void knuth_shell_sort3(uint32_t* a, size_t l, size_t r);
|
67
|
+
|
68
|
+
private:
|
69
|
+
|
70
|
+
uint8_t lg_size; // log2 of number of slots
|
71
|
+
uint8_t num_valid_bits;
|
72
|
+
size_t num_items;
|
73
|
+
vector_u32<A> slots;
|
74
|
+
|
75
|
+
inline size_t lookup(uint32_t item) const;
|
76
|
+
inline void must_insert(uint32_t item);
|
77
|
+
inline void rebuild(uint8_t new_lg_size);
|
78
|
+
};
|
79
|
+
|
80
|
+
} /* namespace datasketches */
|
81
|
+
|
82
|
+
#include "u32_table_impl.hpp"
|
83
|
+
|
84
|
+
#endif
|
@@ -0,0 +1,266 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
// author Kevin Lang, Oath Research
|
21
|
+
|
22
|
+
#ifndef U32_TABLE_IMPL_HPP_
|
23
|
+
#define U32_TABLE_IMPL_HPP_
|
24
|
+
|
25
|
+
#include <stdexcept>
|
26
|
+
#include <algorithm>
|
27
|
+
#include <climits>
|
28
|
+
|
29
|
+
namespace datasketches {
|
30
|
+
|
31
|
+
template<typename A>
|
32
|
+
u32_table<A>::u32_table():
|
33
|
+
lg_size(0),
|
34
|
+
num_valid_bits(0),
|
35
|
+
num_items(0),
|
36
|
+
slots()
|
37
|
+
{}
|
38
|
+
|
39
|
+
template<typename A>
|
40
|
+
u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits):
|
41
|
+
lg_size(lg_size),
|
42
|
+
num_valid_bits(num_valid_bits),
|
43
|
+
num_items(0),
|
44
|
+
slots(1 << lg_size, UINT32_MAX)
|
45
|
+
{
|
46
|
+
if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
|
47
|
+
if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
|
48
|
+
}
|
49
|
+
|
50
|
+
template<typename A>
|
51
|
+
size_t u32_table<A>::get_num_items() const {
|
52
|
+
return num_items;
|
53
|
+
}
|
54
|
+
|
55
|
+
template<typename A>
|
56
|
+
const uint32_t* u32_table<A>::get_slots() const {
|
57
|
+
return slots.data();
|
58
|
+
}
|
59
|
+
|
60
|
+
template<typename A>
|
61
|
+
uint8_t u32_table<A>::get_lg_size() const {
|
62
|
+
return lg_size;
|
63
|
+
}
|
64
|
+
|
65
|
+
template<typename A>
|
66
|
+
void u32_table<A>::clear() {
|
67
|
+
std::fill(slots.begin(), slots.end(), UINT32_MAX);
|
68
|
+
num_items = 0;
|
69
|
+
}
|
70
|
+
|
71
|
+
template<typename A>
|
72
|
+
bool u32_table<A>::maybe_insert(uint32_t item) {
|
73
|
+
const size_t index = lookup(item);
|
74
|
+
if (slots[index] == item) return false;
|
75
|
+
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
76
|
+
slots[index] = item;
|
77
|
+
num_items++;
|
78
|
+
if (U32_TABLE_UPSIZE_DENOM * num_items > U32_TABLE_UPSIZE_NUMER * (1 << lg_size)) {
|
79
|
+
rebuild(lg_size + 1);
|
80
|
+
}
|
81
|
+
return true;
|
82
|
+
}
|
83
|
+
|
84
|
+
template<typename A>
|
85
|
+
bool u32_table<A>::maybe_delete(uint32_t item) {
|
86
|
+
const size_t index = lookup(item);
|
87
|
+
if (slots[index] == UINT32_MAX) return false;
|
88
|
+
if (slots[index] != item) throw std::logic_error("item does not exist");
|
89
|
+
if (num_items == 0) throw std::logic_error("delete error");
|
90
|
+
// delete the item
|
91
|
+
slots[index] = UINT32_MAX;
|
92
|
+
num_items--;
|
93
|
+
|
94
|
+
// re-insert all items between the freed slot and the next empty slot
|
95
|
+
const size_t mask = (1 << lg_size) - 1;
|
96
|
+
size_t probe = (index + 1) & mask;
|
97
|
+
uint32_t fetched = slots[probe];
|
98
|
+
while (fetched != UINT32_MAX) {
|
99
|
+
slots[probe] = UINT32_MAX;
|
100
|
+
must_insert(fetched);
|
101
|
+
probe = (probe + 1) & mask;
|
102
|
+
fetched = slots[probe];
|
103
|
+
}
|
104
|
+
// shrink if necessary
|
105
|
+
if (U32_TABLE_DOWNSIZE_DENOM * num_items < U32_TABLE_DOWNSIZE_NUMER * (1 << lg_size) && lg_size > 2) {
|
106
|
+
rebuild(lg_size - 1);
|
107
|
+
}
|
108
|
+
return true;
|
109
|
+
}
|
110
|
+
|
111
|
+
// this one is specifically tailored to be a part of fm85 decompression scheme
|
112
|
+
template<typename A>
|
113
|
+
u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k) {
|
114
|
+
uint8_t lg_num_slots = 2;
|
115
|
+
while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
|
116
|
+
u32_table<A> table(lg_num_slots, 6 + lg_k);
|
117
|
+
// Note: there is a possible "snowplow effect" here because the caller is passing in a sorted pairs array
|
118
|
+
// However, we are starting out with the correct final table size, so the problem might not occur
|
119
|
+
for (size_t i = 0; i < num_pairs; i++) {
|
120
|
+
table.must_insert(pairs[i]);
|
121
|
+
}
|
122
|
+
table.num_items = num_pairs;
|
123
|
+
return table;
|
124
|
+
}
|
125
|
+
|
126
|
+
template<typename A>
|
127
|
+
size_t u32_table<A>::lookup(uint32_t item) const {
|
128
|
+
const size_t size = 1 << lg_size;
|
129
|
+
const size_t mask = size - 1;
|
130
|
+
const uint8_t shift = num_valid_bits - lg_size;
|
131
|
+
size_t probe = item >> shift;
|
132
|
+
if (probe > mask) throw std::logic_error("probe out of range");
|
133
|
+
while (slots[probe] != item && slots[probe] != UINT32_MAX) {
|
134
|
+
probe = (probe + 1) & mask;
|
135
|
+
}
|
136
|
+
return probe;
|
137
|
+
}
|
138
|
+
|
139
|
+
// counts and resizing must be handled by the caller
|
140
|
+
template<typename A>
|
141
|
+
void u32_table<A>::must_insert(uint32_t item) {
|
142
|
+
const size_t index = lookup(item);
|
143
|
+
if (slots[index] == item) throw std::logic_error("item exists");
|
144
|
+
if (slots[index] != UINT32_MAX) throw std::logic_error("could not insert");
|
145
|
+
slots[index] = item;
|
146
|
+
}
|
147
|
+
|
148
|
+
template<typename A>
|
149
|
+
void u32_table<A>::rebuild(uint8_t new_lg_size) {
|
150
|
+
if (new_lg_size < 2) throw std::logic_error("lg_size must be >= 2");
|
151
|
+
const size_t old_size = 1 << lg_size;
|
152
|
+
const size_t new_size = 1 << new_lg_size;
|
153
|
+
if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
|
154
|
+
vector_u32<A> old_slots = std::move(slots);
|
155
|
+
slots = vector_u32<A>(new_size, UINT32_MAX);
|
156
|
+
lg_size = new_lg_size;
|
157
|
+
for (size_t i = 0; i < old_size; i++) {
|
158
|
+
if (old_slots[i] != UINT32_MAX) {
|
159
|
+
must_insert(old_slots[i]);
|
160
|
+
}
|
161
|
+
}
|
162
|
+
}
|
163
|
+
|
164
|
+
// While extracting the items from a linear probing hashtable,
|
165
|
+
// this will usually undo the wrap-around provided that the table
|
166
|
+
// isn't too full. Experiments suggest that for sufficiently large tables
|
167
|
+
// the load factor would have to be over 90 percent before this would fail frequently,
|
168
|
+
// and even then the subsequent sort would fix things up.
|
169
|
+
// The result is nearly sorted, so make sure to use an efficient sort for that case
|
170
|
+
template<typename A>
|
171
|
+
vector_u32<A> u32_table<A>::unwrapping_get_items() const {
|
172
|
+
if (num_items == 0) return vector_u32<A>();
|
173
|
+
const size_t table_size = 1 << lg_size;
|
174
|
+
vector_u32<A> result(num_items);
|
175
|
+
size_t i = 0;
|
176
|
+
size_t l = 0;
|
177
|
+
size_t r = num_items - 1;
|
178
|
+
|
179
|
+
// special rules for the region before the first empty slot
|
180
|
+
uint32_t hi_bit = 1 << (num_valid_bits - 1);
|
181
|
+
while (i < table_size && slots[i] != UINT32_MAX) {
|
182
|
+
const uint32_t item = slots[i++];
|
183
|
+
if (item & hi_bit) { result[r--] = item; } // this item was probably wrapped, so move to end
|
184
|
+
else { result[l++] = item; }
|
185
|
+
}
|
186
|
+
|
187
|
+
// the rest of the table is processed normally
|
188
|
+
while (i < table_size) {
|
189
|
+
const uint32_t item = slots[i++];
|
190
|
+
if (item != UINT32_MAX) result[l++] = item;
|
191
|
+
}
|
192
|
+
if (l != r + 1) throw std::logic_error("unwrapping error");
|
193
|
+
return result;
|
194
|
+
}
|
195
|
+
|
196
|
+
// This merge is safe to use in carefully designed overlapping scenarios.
|
197
|
+
template<typename A>
|
198
|
+
void u32_table<A>::merge(
|
199
|
+
const uint32_t* arr_a, size_t start_a, size_t length_a, // input
|
200
|
+
const uint32_t* arr_b, size_t start_b, size_t length_b, // input
|
201
|
+
uint32_t* arr_c, size_t start_c // output
|
202
|
+
) {
|
203
|
+
const size_t length_c = length_a + length_b;
|
204
|
+
const size_t lim_a = start_a + length_a;
|
205
|
+
const size_t lim_b = start_b + length_b;
|
206
|
+
const size_t lim_c = start_c + length_c;
|
207
|
+
size_t a = start_a;
|
208
|
+
size_t b = start_b;
|
209
|
+
size_t c = start_c;
|
210
|
+
for ( ; c < lim_c ; c++) {
|
211
|
+
if (b >= lim_b) { arr_c[c] = arr_a[a++]; }
|
212
|
+
else if (a >= lim_a) { arr_c[c] = arr_b[b++]; }
|
213
|
+
else if (arr_a[a] < arr_b[b]) { arr_c[c] = arr_a[a++]; }
|
214
|
+
else { arr_c[c] = arr_b[b++]; }
|
215
|
+
}
|
216
|
+
if (a != lim_a || b != lim_b) throw std::logic_error("merging error");
|
217
|
+
}
|
218
|
+
|
219
|
+
// In applications where the input array is already nearly sorted,
|
220
|
+
// insertion sort runs in linear time with a very small constant.
|
221
|
+
// This introspective version of insertion sort protects against
|
222
|
+
// the quadratic cost of sorting bad input arrays.
|
223
|
+
// It keeps track of how much work has been done, and if that exceeds a
|
224
|
+
// constant times the array length, it switches to a different sorting algorithm.
|
225
|
+
|
226
|
+
template<typename A>
|
227
|
+
void u32_table<A>::introspective_insertion_sort(uint32_t* a, size_t l, size_t r) { // r points past the rightmost element
|
228
|
+
const size_t length = r - l;
|
229
|
+
const size_t cost_limit = 8 * length;
|
230
|
+
size_t cost = 0;
|
231
|
+
for (size_t i = l + 1; i < r; i++) {
|
232
|
+
size_t j = i;
|
233
|
+
uint32_t v = a[i];
|
234
|
+
while (j >= l + 1 && v < a[j - 1]) {
|
235
|
+
a[j] = a[j - 1];
|
236
|
+
j--;
|
237
|
+
}
|
238
|
+
a[j] = v;
|
239
|
+
cost += i - j; // distance moved is a measure of work
|
240
|
+
if (cost > cost_limit) {
|
241
|
+
knuth_shell_sort3(a, l, r);
|
242
|
+
return;
|
243
|
+
}
|
244
|
+
}
|
245
|
+
}
|
246
|
+
|
247
|
+
template<typename A>
|
248
|
+
void u32_table<A>::knuth_shell_sort3(uint32_t* a, size_t l, size_t r) {
|
249
|
+
size_t h;
|
250
|
+
for (h = 1; h < (r - l) / 9; h = 3 * h + 1);
|
251
|
+
for ( ; h > 0; h /= 3) {
|
252
|
+
for (size_t i = l + h; i < r; i++) {
|
253
|
+
size_t j = i;
|
254
|
+
const uint32_t v = a[i];
|
255
|
+
while (j >= l + h && v < a[j - h]) {
|
256
|
+
a[j] = a[j - h];
|
257
|
+
j -= h;
|
258
|
+
}
|
259
|
+
a[j] = v;
|
260
|
+
}
|
261
|
+
}
|
262
|
+
}
|
263
|
+
|
264
|
+
} /* namespace datasketches */
|
265
|
+
|
266
|
+
#endif
|