datasketches 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
@@ -0,0 +1,1131 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef KLL_SKETCH_IMPL_HPP_
|
21
|
+
#define KLL_SKETCH_IMPL_HPP_
|
22
|
+
|
23
|
+
#include <iostream>
|
24
|
+
#include <iomanip>
|
25
|
+
#include <sstream>
|
26
|
+
|
27
|
+
#include "memory_operations.hpp"
|
28
|
+
#include "kll_helper.hpp"
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
template<typename T, typename C, typename S, typename A>
|
33
|
+
kll_sketch<T, C, S, A>::kll_sketch(uint16_t k):
|
34
|
+
k_(k),
|
35
|
+
m_(DEFAULT_M),
|
36
|
+
min_k_(k),
|
37
|
+
n_(0),
|
38
|
+
num_levels_(1),
|
39
|
+
levels_(2),
|
40
|
+
items_(nullptr),
|
41
|
+
items_size_(k_),
|
42
|
+
min_value_(nullptr),
|
43
|
+
max_value_(nullptr),
|
44
|
+
is_level_zero_sorted_(false)
|
45
|
+
{
|
46
|
+
if (k < MIN_K || k > MAX_K) {
|
47
|
+
throw std::invalid_argument("K must be >= " + std::to_string(MIN_K) + " and <= " + std::to_string(MAX_K) + ": " + std::to_string(k));
|
48
|
+
}
|
49
|
+
levels_[0] = levels_[1] = k;
|
50
|
+
items_ = A().allocate(items_size_);
|
51
|
+
}
|
52
|
+
|
53
|
+
template<typename T, typename C, typename S, typename A>
|
54
|
+
kll_sketch<T, C, S, A>::kll_sketch(const kll_sketch& other):
|
55
|
+
k_(other.k_),
|
56
|
+
m_(other.m_),
|
57
|
+
min_k_(other.min_k_),
|
58
|
+
n_(other.n_),
|
59
|
+
num_levels_(other.num_levels_),
|
60
|
+
levels_(other.levels_),
|
61
|
+
items_(nullptr),
|
62
|
+
items_size_(other.items_size_),
|
63
|
+
min_value_(nullptr),
|
64
|
+
max_value_(nullptr),
|
65
|
+
is_level_zero_sorted_(other.is_level_zero_sorted_)
|
66
|
+
{
|
67
|
+
items_ = A().allocate(items_size_);
|
68
|
+
std::copy(&other.items_[levels_[0]], &other.items_[levels_[num_levels_]], &items_[levels_[0]]);
|
69
|
+
if (other.min_value_ != nullptr) min_value_ = new (A().allocate(1)) T(*other.min_value_);
|
70
|
+
if (other.max_value_ != nullptr) max_value_ = new (A().allocate(1)) T(*other.max_value_);
|
71
|
+
}
|
72
|
+
|
73
|
+
template<typename T, typename C, typename S, typename A>
|
74
|
+
kll_sketch<T, C, S, A>::kll_sketch(kll_sketch&& other) noexcept:
|
75
|
+
k_(other.k_),
|
76
|
+
m_(other.m_),
|
77
|
+
min_k_(other.min_k_),
|
78
|
+
n_(other.n_),
|
79
|
+
num_levels_(other.num_levels_),
|
80
|
+
levels_(std::move(other.levels_)),
|
81
|
+
items_(other.items_),
|
82
|
+
items_size_(other.items_size_),
|
83
|
+
min_value_(other.min_value_),
|
84
|
+
max_value_(other.max_value_),
|
85
|
+
is_level_zero_sorted_(other.is_level_zero_sorted_)
|
86
|
+
{
|
87
|
+
other.items_ = nullptr;
|
88
|
+
other.min_value_ = nullptr;
|
89
|
+
other.max_value_ = nullptr;
|
90
|
+
}
|
91
|
+
|
92
|
+
template<typename T, typename C, typename S, typename A>
|
93
|
+
kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(const kll_sketch& other) {
|
94
|
+
kll_sketch copy(other);
|
95
|
+
std::swap(k_, copy.k_);
|
96
|
+
std::swap(m_, copy.m_);
|
97
|
+
std::swap(min_k_, copy.min_k_);
|
98
|
+
std::swap(n_, copy.n_);
|
99
|
+
std::swap(num_levels_, copy.num_levels_);
|
100
|
+
std::swap(levels_, copy.levels_);
|
101
|
+
std::swap(items_, copy.items_);
|
102
|
+
std::swap(items_size_, copy.items_size_);
|
103
|
+
std::swap(min_value_, copy.min_value_);
|
104
|
+
std::swap(max_value_, copy.max_value_);
|
105
|
+
std::swap(is_level_zero_sorted_, copy.is_level_zero_sorted_);
|
106
|
+
return *this;
|
107
|
+
}
|
108
|
+
|
109
|
+
template<typename T, typename C, typename S, typename A>
|
110
|
+
kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(kll_sketch&& other) {
|
111
|
+
std::swap(k_, other.k_);
|
112
|
+
std::swap(m_, other.m_);
|
113
|
+
std::swap(min_k_, other.min_k_);
|
114
|
+
std::swap(n_, other.n_);
|
115
|
+
std::swap(num_levels_, other.num_levels_);
|
116
|
+
std::swap(levels_, other.levels_);
|
117
|
+
std::swap(items_, other.items_);
|
118
|
+
std::swap(items_size_, other.items_size_);
|
119
|
+
std::swap(min_value_, other.min_value_);
|
120
|
+
std::swap(max_value_, other.max_value_);
|
121
|
+
std::swap(is_level_zero_sorted_, other.is_level_zero_sorted_);
|
122
|
+
return *this;
|
123
|
+
}
|
124
|
+
|
125
|
+
template<typename T, typename C, typename S, typename A>
|
126
|
+
kll_sketch<T, C, S, A>::~kll_sketch() {
|
127
|
+
if (items_ != nullptr) {
|
128
|
+
const uint32_t begin = levels_[0];
|
129
|
+
const uint32_t end = levels_[num_levels_];
|
130
|
+
for (uint32_t i = begin; i < end; i++) items_[i].~T();
|
131
|
+
A().deallocate(items_, items_size_);
|
132
|
+
}
|
133
|
+
if (min_value_ != nullptr) {
|
134
|
+
min_value_->~T();
|
135
|
+
A().deallocate(min_value_, 1);
|
136
|
+
}
|
137
|
+
if (max_value_ != nullptr) {
|
138
|
+
max_value_->~T();
|
139
|
+
A().deallocate(max_value_, 1);
|
140
|
+
}
|
141
|
+
}
|
142
|
+
|
143
|
+
template<typename T, typename C, typename S, typename A>
|
144
|
+
void kll_sketch<T, C, S, A>::update(const T& value) {
|
145
|
+
if (!check_update_value(value)) { return; }
|
146
|
+
update_min_max(value);
|
147
|
+
const uint32_t index = internal_update();
|
148
|
+
new (&items_[index]) T(value);
|
149
|
+
}
|
150
|
+
|
151
|
+
template<typename T, typename C, typename S, typename A>
|
152
|
+
void kll_sketch<T, C, S, A>::update(T&& value) {
|
153
|
+
if (!check_update_value(value)) { return; }
|
154
|
+
update_min_max(value);
|
155
|
+
const uint32_t index = internal_update();
|
156
|
+
new (&items_[index]) T(std::move(value));
|
157
|
+
}
|
158
|
+
|
159
|
+
template<typename T, typename C, typename S, typename A>
|
160
|
+
void kll_sketch<T, C, S, A>::update_min_max(const T& value) {
|
161
|
+
if (is_empty()) {
|
162
|
+
min_value_ = new (A().allocate(1)) T(value);
|
163
|
+
max_value_ = new (A().allocate(1)) T(value);
|
164
|
+
} else {
|
165
|
+
if (C()(value, *min_value_)) *min_value_ = value;
|
166
|
+
if (C()(*max_value_, value)) *max_value_ = value;
|
167
|
+
}
|
168
|
+
}
|
169
|
+
|
170
|
+
template<typename T, typename C, typename S, typename A>
|
171
|
+
uint32_t kll_sketch<T, C, S, A>::internal_update() {
|
172
|
+
if (levels_[0] == 0) compress_while_updating();
|
173
|
+
n_++;
|
174
|
+
is_level_zero_sorted_ = false;
|
175
|
+
return --levels_[0];
|
176
|
+
}
|
177
|
+
|
178
|
+
template<typename T, typename C, typename S, typename A>
|
179
|
+
void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
|
180
|
+
if (other.is_empty()) return;
|
181
|
+
if (m_ != other.m_) {
|
182
|
+
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
|
183
|
+
}
|
184
|
+
if (is_empty()) {
|
185
|
+
min_value_ = new (A().allocate(1)) T(*other.min_value_);
|
186
|
+
max_value_ = new (A().allocate(1)) T(*other.max_value_);
|
187
|
+
} else {
|
188
|
+
if (C()(*other.min_value_, *min_value_)) *min_value_ = *other.min_value_;
|
189
|
+
if (C()(*max_value_, *other.max_value_)) *max_value_ = *other.max_value_;
|
190
|
+
}
|
191
|
+
const uint64_t final_n = n_ + other.n_;
|
192
|
+
for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
|
193
|
+
const uint32_t index = internal_update();
|
194
|
+
new (&items_[index]) T(other.items_[i]);
|
195
|
+
}
|
196
|
+
if (other.num_levels_ >= 2) merge_higher_levels(other, final_n);
|
197
|
+
n_ = final_n;
|
198
|
+
if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
|
199
|
+
assert_correct_total_weight();
|
200
|
+
}
|
201
|
+
|
202
|
+
template<typename T, typename C, typename S, typename A>
|
203
|
+
void kll_sketch<T, C, S, A>::merge(kll_sketch&& other) {
|
204
|
+
if (other.is_empty()) return;
|
205
|
+
if (m_ != other.m_) {
|
206
|
+
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
|
207
|
+
}
|
208
|
+
if (is_empty()) {
|
209
|
+
min_value_ = new (A().allocate(1)) T(std::move(*other.min_value_));
|
210
|
+
max_value_ = new (A().allocate(1)) T(std::move(*other.max_value_));
|
211
|
+
} else {
|
212
|
+
if (C()(*other.min_value_, *min_value_)) *min_value_ = std::move(*other.min_value_);
|
213
|
+
if (C()(*max_value_, *other.max_value_)) *max_value_ = std::move(*other.max_value_);
|
214
|
+
}
|
215
|
+
const uint64_t final_n = n_ + other.n_;
|
216
|
+
for (uint32_t i = other.levels_[0]; i < other.levels_[1]; i++) {
|
217
|
+
const uint32_t index = internal_update();
|
218
|
+
new (&items_[index]) T(std::move(other.items_[i]));
|
219
|
+
}
|
220
|
+
if (other.num_levels_ >= 2) merge_higher_levels(std::forward<kll_sketch>(other), final_n);
|
221
|
+
n_ = final_n;
|
222
|
+
if (other.is_estimation_mode()) min_k_ = std::min(min_k_, other.min_k_);
|
223
|
+
assert_correct_total_weight();
|
224
|
+
}
|
225
|
+
|
226
|
+
template<typename T, typename C, typename S, typename A>
|
227
|
+
bool kll_sketch<T, C, S, A>::is_empty() const {
|
228
|
+
return n_ == 0;
|
229
|
+
}
|
230
|
+
|
231
|
+
template<typename T, typename C, typename S, typename A>
|
232
|
+
uint64_t kll_sketch<T, C, S, A>::get_n() const {
|
233
|
+
return n_;
|
234
|
+
}
|
235
|
+
|
236
|
+
template<typename T, typename C, typename S, typename A>
|
237
|
+
uint32_t kll_sketch<T, C, S, A>::get_num_retained() const {
|
238
|
+
return levels_[num_levels_] - levels_[0];
|
239
|
+
}
|
240
|
+
|
241
|
+
template<typename T, typename C, typename S, typename A>
|
242
|
+
bool kll_sketch<T, C, S, A>::is_estimation_mode() const {
|
243
|
+
return num_levels_ > 1;
|
244
|
+
}
|
245
|
+
|
246
|
+
template<typename T, typename C, typename S, typename A>
|
247
|
+
T kll_sketch<T, C, S, A>::get_min_value() const {
|
248
|
+
if (is_empty()) return get_invalid_value();
|
249
|
+
return *min_value_;
|
250
|
+
}
|
251
|
+
|
252
|
+
template<typename T, typename C, typename S, typename A>
|
253
|
+
T kll_sketch<T, C, S, A>::get_max_value() const {
|
254
|
+
if (is_empty()) return get_invalid_value();
|
255
|
+
return *max_value_;
|
256
|
+
}
|
257
|
+
|
258
|
+
template<typename T, typename C, typename S, typename A>
|
259
|
+
T kll_sketch<T, C, S, A>::get_quantile(double fraction) const {
|
260
|
+
if (is_empty()) return get_invalid_value();
|
261
|
+
if (fraction == 0.0) return *min_value_;
|
262
|
+
if (fraction == 1.0) return *max_value_;
|
263
|
+
if ((fraction < 0.0) || (fraction > 1.0)) {
|
264
|
+
throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
|
265
|
+
}
|
266
|
+
// has side effect of sorting level zero if needed
|
267
|
+
auto quantile_calculator(const_cast<kll_sketch*>(this)->get_quantile_calculator());
|
268
|
+
return quantile_calculator->get_quantile(fraction);
|
269
|
+
}
|
270
|
+
|
271
|
+
template<typename T, typename C, typename S, typename A>
|
272
|
+
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions, uint32_t size) const {
|
273
|
+
std::vector<T, A> quantiles;
|
274
|
+
quantiles.reserve(size);
|
275
|
+
if (is_empty()) return quantiles;
|
276
|
+
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
|
277
|
+
quantiles.reserve(size);
|
278
|
+
for (uint32_t i = 0; i < size; i++) {
|
279
|
+
const double fraction = fractions[i];
|
280
|
+
if ((fraction < 0.0) || (fraction > 1.0)) {
|
281
|
+
throw std::invalid_argument("Fraction cannot be less than zero or greater than 1.0");
|
282
|
+
}
|
283
|
+
if (fraction == 0.0) quantiles.push_back(*min_value_);
|
284
|
+
else if (fraction == 1.0) quantiles.push_back(*max_value_);
|
285
|
+
else {
|
286
|
+
if (!quantile_calculator) {
|
287
|
+
// has side effect of sorting level zero if needed
|
288
|
+
quantile_calculator = const_cast<kll_sketch*>(this)->get_quantile_calculator();
|
289
|
+
}
|
290
|
+
quantiles.push_back(quantile_calculator->get_quantile(fraction));
|
291
|
+
}
|
292
|
+
}
|
293
|
+
return quantiles;
|
294
|
+
}
|
295
|
+
|
296
|
+
template<typename T, typename C, typename S, typename A>
|
297
|
+
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(size_t num) const {
|
298
|
+
if (is_empty()) return std::vector<T, A>();
|
299
|
+
if (num == 0) {
|
300
|
+
throw std::invalid_argument("num must be > 0");
|
301
|
+
}
|
302
|
+
std::vector<double> fractions(num);
|
303
|
+
fractions[0] = 0.0;
|
304
|
+
for (size_t i = 1; i < num; i++) {
|
305
|
+
fractions[i] = static_cast<double>(i) / (num - 1);
|
306
|
+
}
|
307
|
+
if (num > 1) {
|
308
|
+
fractions[num - 1] = 1.0;
|
309
|
+
}
|
310
|
+
return get_quantiles(fractions.data(), num);
|
311
|
+
}
|
312
|
+
|
313
|
+
template<typename T, typename C, typename S, typename A>
|
314
|
+
double kll_sketch<T, C, S, A>::get_rank(const T& value) const {
|
315
|
+
if (is_empty()) return std::numeric_limits<double>::quiet_NaN();
|
316
|
+
uint8_t level = 0;
|
317
|
+
uint64_t weight = 1;
|
318
|
+
uint64_t total = 0;
|
319
|
+
while (level < num_levels_) {
|
320
|
+
const auto from_index(levels_[level]);
|
321
|
+
const auto to_index(levels_[level + 1]); // exclusive
|
322
|
+
for (uint32_t i = from_index; i < to_index; i++) {
|
323
|
+
if (C()(items_[i], value)) {
|
324
|
+
total += weight;
|
325
|
+
} else if ((level > 0) || is_level_zero_sorted_) {
|
326
|
+
break; // levels above 0 are sorted, no point comparing further
|
327
|
+
}
|
328
|
+
}
|
329
|
+
level++;
|
330
|
+
weight *= 2;
|
331
|
+
}
|
332
|
+
return (double) total / n_;
|
333
|
+
}
|
334
|
+
|
335
|
+
template<typename T, typename C, typename S, typename A>
|
336
|
+
vector_d<A> kll_sketch<T, C, S, A>::get_PMF(const T* split_points, uint32_t size) const {
|
337
|
+
return get_PMF_or_CDF(split_points, size, false);
|
338
|
+
}
|
339
|
+
|
340
|
+
template<typename T, typename C, typename S, typename A>
|
341
|
+
vector_d<A> kll_sketch<T, C, S, A>::get_CDF(const T* split_points, uint32_t size) const {
|
342
|
+
return get_PMF_or_CDF(split_points, size, true);
|
343
|
+
}
|
344
|
+
|
345
|
+
template<typename T, typename C, typename S, typename A>
|
346
|
+
double kll_sketch<T, C, S, A>::get_normalized_rank_error(bool pmf) const {
|
347
|
+
return get_normalized_rank_error(min_k_, pmf);
|
348
|
+
}
|
349
|
+
|
350
|
+
// implementation for fixed-size arithmetic types (integral and floating point)
|
351
|
+
template<typename T, typename C, typename S, typename A>
|
352
|
+
template<typename TT, typename std::enable_if<std::is_arithmetic<TT>::value, int>::type>
|
353
|
+
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
354
|
+
if (is_empty()) { return EMPTY_SIZE_BYTES; }
|
355
|
+
if (num_levels_ == 1 && get_num_retained() == 1) {
|
356
|
+
return DATA_START_SINGLE_ITEM + sizeof(TT);
|
357
|
+
}
|
358
|
+
// the last integer in the levels_ array is not serialized because it can be derived
|
359
|
+
return DATA_START + num_levels_ * sizeof(uint32_t) + (get_num_retained() + 2) * sizeof(TT);
|
360
|
+
}
|
361
|
+
|
362
|
+
// implementation for all other types
|
363
|
+
template<typename T, typename C, typename S, typename A>
|
364
|
+
template<typename TT, typename std::enable_if<!std::is_arithmetic<TT>::value, int>::type>
|
365
|
+
size_t kll_sketch<T, C, S, A>::get_serialized_size_bytes() const {
|
366
|
+
if (is_empty()) { return EMPTY_SIZE_BYTES; }
|
367
|
+
if (num_levels_ == 1 && get_num_retained() == 1) {
|
368
|
+
return DATA_START_SINGLE_ITEM + S().size_of_item(items_[levels_[0]]);
|
369
|
+
}
|
370
|
+
// the last integer in the levels_ array is not serialized because it can be derived
|
371
|
+
size_t size = DATA_START + num_levels_ * sizeof(uint32_t);
|
372
|
+
size += S().size_of_item(*min_value_);
|
373
|
+
size += S().size_of_item(*max_value_);
|
374
|
+
for (auto& it: *this) size += S().size_of_item(it.first);
|
375
|
+
return size;
|
376
|
+
}
|
377
|
+
|
378
|
+
template<typename T, typename C, typename S, typename A>
|
379
|
+
void kll_sketch<T, C, S, A>::serialize(std::ostream& os) const {
|
380
|
+
const bool is_single_item = n_ == 1;
|
381
|
+
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
|
382
|
+
os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
|
383
|
+
const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
|
384
|
+
os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
|
385
|
+
const uint8_t family(FAMILY);
|
386
|
+
os.write(reinterpret_cast<const char*>(&family), sizeof(family));
|
387
|
+
const uint8_t flags_byte(
|
388
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
389
|
+
| (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
|
390
|
+
| (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
|
391
|
+
);
|
392
|
+
os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
|
393
|
+
os.write((char*)&k_, sizeof(k_));
|
394
|
+
os.write((char*)&m_, sizeof(m_));
|
395
|
+
const uint8_t unused = 0;
|
396
|
+
os.write(reinterpret_cast<const char*>(&unused), sizeof(unused));
|
397
|
+
if (is_empty()) return;
|
398
|
+
if (!is_single_item) {
|
399
|
+
os.write((char*)&n_, sizeof(n_));
|
400
|
+
os.write((char*)&min_k_, sizeof(min_k_));
|
401
|
+
os.write((char*)&num_levels_, sizeof(num_levels_));
|
402
|
+
os.write((char*)&unused, sizeof(unused));
|
403
|
+
os.write((char*)levels_.data(), sizeof(levels_[0]) * num_levels_);
|
404
|
+
S().serialize(os, min_value_, 1);
|
405
|
+
S().serialize(os, max_value_, 1);
|
406
|
+
}
|
407
|
+
S().serialize(os, &items_[levels_[0]], get_num_retained());
|
408
|
+
}
|
409
|
+
|
410
|
+
template<typename T, typename C, typename S, typename A>
|
411
|
+
vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const {
|
412
|
+
const bool is_single_item = n_ == 1;
|
413
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
414
|
+
vector_u8<A> bytes(size);
|
415
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
416
|
+
const uint8_t* end_ptr = ptr + size;
|
417
|
+
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
|
418
|
+
ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
|
419
|
+
const uint8_t serial_version(is_single_item ? SERIAL_VERSION_2 : SERIAL_VERSION_1);
|
420
|
+
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
|
421
|
+
const uint8_t family(FAMILY);
|
422
|
+
ptr += copy_to_mem(&family, ptr, sizeof(family));
|
423
|
+
const uint8_t flags_byte(
|
424
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
425
|
+
| (is_level_zero_sorted_ ? 1 << flags::IS_LEVEL_ZERO_SORTED : 0)
|
426
|
+
| (is_single_item ? 1 << flags::IS_SINGLE_ITEM : 0)
|
427
|
+
);
|
428
|
+
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
|
429
|
+
ptr += copy_to_mem(&k_, ptr, sizeof(k_));
|
430
|
+
ptr += copy_to_mem(&m_, ptr, sizeof(m_));
|
431
|
+
const uint8_t unused = 0;
|
432
|
+
ptr += copy_to_mem(&unused, ptr, sizeof(unused));
|
433
|
+
if (!is_empty()) {
|
434
|
+
if (!is_single_item) {
|
435
|
+
ptr += copy_to_mem(&n_, ptr, sizeof(n_));
|
436
|
+
ptr += copy_to_mem(&min_k_, ptr, sizeof(min_k_));
|
437
|
+
ptr += copy_to_mem(&num_levels_, ptr, sizeof(num_levels_));
|
438
|
+
ptr += copy_to_mem(&unused, ptr, sizeof(unused));
|
439
|
+
ptr += copy_to_mem(levels_.data(), ptr, sizeof(levels_[0]) * num_levels_);
|
440
|
+
ptr += S().serialize(ptr, end_ptr - ptr, min_value_, 1);
|
441
|
+
ptr += S().serialize(ptr, end_ptr - ptr, max_value_, 1);
|
442
|
+
}
|
443
|
+
const size_t bytes_remaining = end_ptr - ptr;
|
444
|
+
ptr += S().serialize(ptr, bytes_remaining, &items_[levels_[0]], get_num_retained());
|
445
|
+
}
|
446
|
+
const size_t delta = ptr - bytes.data();
|
447
|
+
if (delta != size) throw std::logic_error("serialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
|
448
|
+
return bytes;
|
449
|
+
}
|
450
|
+
|
451
|
+
template<typename T, typename C, typename S, typename A>
|
452
|
+
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is) {
|
453
|
+
uint8_t preamble_ints;
|
454
|
+
is.read((char*)&preamble_ints, sizeof(preamble_ints));
|
455
|
+
uint8_t serial_version;
|
456
|
+
is.read((char*)&serial_version, sizeof(serial_version));
|
457
|
+
uint8_t family_id;
|
458
|
+
is.read((char*)&family_id, sizeof(family_id));
|
459
|
+
uint8_t flags_byte;
|
460
|
+
is.read((char*)&flags_byte, sizeof(flags_byte));
|
461
|
+
uint16_t k;
|
462
|
+
is.read((char*)&k, sizeof(k));
|
463
|
+
uint8_t m;
|
464
|
+
is.read((char*)&m, sizeof(m));
|
465
|
+
uint8_t unused;
|
466
|
+
is.read((char*)&unused, sizeof(unused));
|
467
|
+
|
468
|
+
check_m(m);
|
469
|
+
check_preamble_ints(preamble_ints, flags_byte);
|
470
|
+
check_serial_version(serial_version);
|
471
|
+
check_family_id(family_id);
|
472
|
+
|
473
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
474
|
+
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
|
475
|
+
if (is_empty) return kll_sketch(k);
|
476
|
+
|
477
|
+
uint64_t n;
|
478
|
+
uint16_t min_k;
|
479
|
+
uint8_t num_levels;
|
480
|
+
const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM)); // used in serial version 2
|
481
|
+
if (is_single_item) {
|
482
|
+
n = 1;
|
483
|
+
min_k = k;
|
484
|
+
num_levels = 1;
|
485
|
+
} else {
|
486
|
+
is.read((char*)&n, sizeof(n_));
|
487
|
+
is.read((char*)&min_k, sizeof(min_k_));
|
488
|
+
is.read((char*)&num_levels, sizeof(num_levels));
|
489
|
+
is.read((char*)&unused, sizeof(unused));
|
490
|
+
}
|
491
|
+
vector_u32<A> levels(num_levels + 1);
|
492
|
+
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
|
493
|
+
if (is_single_item) {
|
494
|
+
levels[0] = capacity - 1;
|
495
|
+
} else {
|
496
|
+
// the last integer in levels_ is not serialized because it can be derived
|
497
|
+
is.read((char*)levels.data(), sizeof(levels[0]) * num_levels);
|
498
|
+
}
|
499
|
+
levels[num_levels] = capacity;
|
500
|
+
auto item_buffer_deleter = [](T* ptr) { A().deallocate(ptr, 1); };
|
501
|
+
std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(A().allocate(1), item_buffer_deleter);
|
502
|
+
std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(A().allocate(1), item_buffer_deleter);
|
503
|
+
std::unique_ptr<T, item_deleter> min_value;
|
504
|
+
std::unique_ptr<T, item_deleter> max_value;
|
505
|
+
if (!is_single_item) {
|
506
|
+
S().deserialize(is, min_value_buffer.get(), 1);
|
507
|
+
// serde call did not throw, repackage with destrtuctor
|
508
|
+
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
|
509
|
+
S().deserialize(is, max_value_buffer.get(), 1);
|
510
|
+
// serde call did not throw, repackage with destrtuctor
|
511
|
+
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
|
512
|
+
}
|
513
|
+
auto items_buffer_deleter = [capacity](T* ptr) { A().deallocate(ptr, capacity); };
|
514
|
+
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(A().allocate(capacity), items_buffer_deleter);
|
515
|
+
const auto num_items = levels[num_levels] - levels[0];
|
516
|
+
S().deserialize(is, &items_buffer.get()[levels[0]], num_items);
|
517
|
+
// serde call did not throw, repackage with destrtuctors
|
518
|
+
std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity));
|
519
|
+
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
|
520
|
+
if (is_single_item) {
|
521
|
+
new (min_value_buffer.get()) T(items.get()[levels[0]]);
|
522
|
+
// copy did not throw, repackage with destrtuctor
|
523
|
+
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
|
524
|
+
new (max_value_buffer.get()) T(items.get()[levels[0]]);
|
525
|
+
// copy did not throw, repackage with destrtuctor
|
526
|
+
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
|
527
|
+
}
|
528
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
529
|
+
return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
|
530
|
+
std::move(min_value), std::move(max_value), is_level_zero_sorted);
|
531
|
+
}
|
532
|
+
|
533
|
+
template<typename T, typename C, typename S, typename A>
|
534
|
+
kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size) {
|
535
|
+
ensure_minimum_memory(size, 8);
|
536
|
+
const char* ptr = static_cast<const char*>(bytes);
|
537
|
+
uint8_t preamble_ints;
|
538
|
+
ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
|
539
|
+
uint8_t serial_version;
|
540
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
541
|
+
uint8_t family_id;
|
542
|
+
ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
|
543
|
+
uint8_t flags_byte;
|
544
|
+
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
545
|
+
uint16_t k;
|
546
|
+
ptr += copy_from_mem(ptr, &k, sizeof(k));
|
547
|
+
uint8_t m;
|
548
|
+
ptr += copy_from_mem(ptr, &m, sizeof(m));
|
549
|
+
ptr++; // skip unused byte
|
550
|
+
|
551
|
+
check_m(m);
|
552
|
+
check_preamble_ints(preamble_ints, flags_byte);
|
553
|
+
check_serial_version(serial_version);
|
554
|
+
check_family_id(family_id);
|
555
|
+
ensure_minimum_memory(size, 1 << preamble_ints);
|
556
|
+
|
557
|
+
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
|
558
|
+
if (is_empty) return kll_sketch<T, C, S, A>(k);
|
559
|
+
|
560
|
+
uint64_t n;
|
561
|
+
uint16_t min_k;
|
562
|
+
uint8_t num_levels;
|
563
|
+
const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM)); // used in serial version 2
|
564
|
+
const char* end_ptr = static_cast<const char*>(bytes) + size;
|
565
|
+
if (is_single_item) {
|
566
|
+
n = 1;
|
567
|
+
min_k = k;
|
568
|
+
num_levels = 1;
|
569
|
+
} else {
|
570
|
+
ptr += copy_from_mem(ptr, &n, sizeof(n));
|
571
|
+
ptr += copy_from_mem(ptr, &min_k, sizeof(min_k));
|
572
|
+
ptr += copy_from_mem(ptr, &num_levels, sizeof(num_levels));
|
573
|
+
ptr++; // skip unused byte
|
574
|
+
}
|
575
|
+
vector_u32<A> levels(num_levels + 1);
|
576
|
+
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
|
577
|
+
if (is_single_item) {
|
578
|
+
levels[0] = capacity - 1;
|
579
|
+
} else {
|
580
|
+
// the last integer in levels_ is not serialized because it can be derived
|
581
|
+
ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels);
|
582
|
+
}
|
583
|
+
levels[num_levels] = capacity;
|
584
|
+
auto item_buffer_deleter = [](T* ptr) { A().deallocate(ptr, 1); };
|
585
|
+
std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(A().allocate(1), item_buffer_deleter);
|
586
|
+
std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(A().allocate(1), item_buffer_deleter);
|
587
|
+
std::unique_ptr<T, item_deleter> min_value;
|
588
|
+
std::unique_ptr<T, item_deleter> max_value;
|
589
|
+
if (!is_single_item) {
|
590
|
+
ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
|
591
|
+
// serde call did not throw, repackage with destrtuctor
|
592
|
+
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
|
593
|
+
ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
|
594
|
+
// serde call did not throw, repackage with destrtuctor
|
595
|
+
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
|
596
|
+
}
|
597
|
+
auto items_buffer_deleter = [capacity](T* ptr) { A().deallocate(ptr, capacity); };
|
598
|
+
std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(A().allocate(capacity), items_buffer_deleter);
|
599
|
+
const auto num_items = levels[num_levels] - levels[0];
|
600
|
+
ptr += S().deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
|
601
|
+
// serde call did not throw, repackage with destrtuctors
|
602
|
+
std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity));
|
603
|
+
const size_t delta = ptr - static_cast<const char*>(bytes);
|
604
|
+
if (delta != size) throw std::logic_error("deserialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
|
605
|
+
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
|
606
|
+
if (is_single_item) {
|
607
|
+
new (min_value_buffer.get()) T(items.get()[levels[0]]);
|
608
|
+
// copy did not throw, repackage with destrtuctor
|
609
|
+
min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
|
610
|
+
new (max_value_buffer.get()) T(items.get()[levels[0]]);
|
611
|
+
// copy did not throw, repackage with destrtuctor
|
612
|
+
max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
|
613
|
+
}
|
614
|
+
return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
|
615
|
+
std::move(min_value), std::move(max_value), is_level_zero_sorted);
|
616
|
+
}
|
617
|
+
|
618
|
+
/*
|
619
|
+
* Gets the normalized rank error given k and pmf.
|
620
|
+
* k - the configuration parameter
|
621
|
+
* pmf - if true, returns the "double-sided" normalized rank error for the get_PMF() function.
|
622
|
+
* Otherwise, it is the "single-sided" normalized rank error for all the other queries.
|
623
|
+
* Constants were derived as the best fit to 99 percentile empirically measured max error in thousands of trials
|
624
|
+
*/
|
625
|
+
template<typename T, typename C, typename S, typename A>
|
626
|
+
double kll_sketch<T, C, S, A>::get_normalized_rank_error(uint16_t k, bool pmf) {
|
627
|
+
return pmf
|
628
|
+
? 2.446 / pow(k, 0.9433)
|
629
|
+
: 2.296 / pow(k, 0.9723);
|
630
|
+
}
|
631
|
+
|
632
|
+
// for deserialization
|
633
|
+
template<typename T, typename C, typename S, typename A>
|
634
|
+
kll_sketch<T, C, S, A>::kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32<A>&& levels,
|
635
|
+
std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_value,
|
636
|
+
std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted):
|
637
|
+
k_(k),
|
638
|
+
m_(DEFAULT_M),
|
639
|
+
min_k_(min_k),
|
640
|
+
n_(n),
|
641
|
+
num_levels_(num_levels),
|
642
|
+
levels_(std::move(levels)),
|
643
|
+
items_(items.release()),
|
644
|
+
items_size_(items_size),
|
645
|
+
min_value_(min_value.release()),
|
646
|
+
max_value_(max_value.release()),
|
647
|
+
is_level_zero_sorted_(is_level_zero_sorted)
|
648
|
+
{}
|
649
|
+
|
650
|
+
// The following code is only valid in the special case of exactly reaching capacity while updating.
|
651
|
+
// It cannot be used while merging, while reducing k, or anything else.
|
652
|
+
template<typename T, typename C, typename S, typename A>
|
653
|
+
void kll_sketch<T, C, S, A>::compress_while_updating(void) {
|
654
|
+
const uint8_t level = find_level_to_compact();
|
655
|
+
|
656
|
+
// It is important to add the new top level right here. Be aware that this operation
|
657
|
+
// grows the buffer and shifts the data and also the boundaries of the data and grows the
|
658
|
+
// levels array and increments num_levels_
|
659
|
+
if (level == (num_levels_ - 1)) {
|
660
|
+
add_empty_top_level_to_completely_full_sketch();
|
661
|
+
}
|
662
|
+
|
663
|
+
const uint32_t raw_beg = levels_[level];
|
664
|
+
const uint32_t raw_lim = levels_[level + 1];
|
665
|
+
// +2 is OK because we already added a new top level if necessary
|
666
|
+
const uint32_t pop_above = levels_[level + 2] - raw_lim;
|
667
|
+
const uint32_t raw_pop = raw_lim - raw_beg;
|
668
|
+
const bool odd_pop = kll_helper::is_odd(raw_pop);
|
669
|
+
const uint32_t adj_beg = odd_pop ? raw_beg + 1 : raw_beg;
|
670
|
+
const uint32_t adj_pop = odd_pop ? raw_pop - 1 : raw_pop;
|
671
|
+
const uint32_t half_adj_pop = adj_pop / 2;
|
672
|
+
const uint32_t destroy_beg = levels_[0];
|
673
|
+
|
674
|
+
// level zero might not be sorted, so we must sort it if we wish to compact it
|
675
|
+
// sort_level_zero() is not used here because of the adjustment for odd number of items
|
676
|
+
if ((level == 0) && !is_level_zero_sorted_) {
|
677
|
+
std::sort(&items_[adj_beg], &items_[adj_beg + adj_pop], C());
|
678
|
+
}
|
679
|
+
if (pop_above == 0) {
|
680
|
+
kll_helper::randomly_halve_up(items_, adj_beg, adj_pop);
|
681
|
+
} else {
|
682
|
+
kll_helper::randomly_halve_down(items_, adj_beg, adj_pop);
|
683
|
+
kll_helper::merge_sorted_arrays<T, C>(items_, adj_beg, half_adj_pop, raw_lim, pop_above, adj_beg + half_adj_pop);
|
684
|
+
}
|
685
|
+
levels_[level + 1] -= half_adj_pop; // adjust boundaries of the level above
|
686
|
+
if (odd_pop) {
|
687
|
+
levels_[level] = levels_[level + 1] - 1; // the current level now contains one item
|
688
|
+
if (levels_[level] != raw_beg) items_[levels_[level]] = std::move(items_[raw_beg]); // namely this leftover guy
|
689
|
+
} else {
|
690
|
+
levels_[level] = levels_[level + 1]; // the current level is now empty
|
691
|
+
}
|
692
|
+
|
693
|
+
// verify that we freed up half_adj_pop array slots just below the current level
|
694
|
+
if (levels_[level] != (raw_beg + half_adj_pop)) throw std::logic_error("compaction error");
|
695
|
+
|
696
|
+
// finally, we need to shift up the data in the levels below
|
697
|
+
// so that the freed-up space can be used by level zero
|
698
|
+
if (level > 0) {
|
699
|
+
const uint32_t amount = raw_beg - levels_[0];
|
700
|
+
std::move_backward(&items_[levels_[0]], &items_[levels_[0] + amount], &items_[levels_[0] + half_adj_pop + amount]);
|
701
|
+
for (uint8_t lvl = 0; lvl < level; lvl++) levels_[lvl] += half_adj_pop;
|
702
|
+
}
|
703
|
+
for (uint32_t i = 0; i < half_adj_pop; i++) items_[i + destroy_beg].~T();
|
704
|
+
}
|
705
|
+
|
706
|
+
template<typename T, typename C, typename S, typename A>
|
707
|
+
uint8_t kll_sketch<T, C, S, A>::find_level_to_compact() const {
|
708
|
+
uint8_t level = 0;
|
709
|
+
while (true) {
|
710
|
+
if (level >= num_levels_) throw std::logic_error("capacity calculation error");
|
711
|
+
const uint32_t pop = levels_[level + 1] - levels_[level];
|
712
|
+
const uint32_t cap = kll_helper::level_capacity(k_, num_levels_, level, m_);
|
713
|
+
if (pop >= cap) {
|
714
|
+
return level;
|
715
|
+
}
|
716
|
+
level++;
|
717
|
+
}
|
718
|
+
}
|
719
|
+
|
720
|
+
template<typename T, typename C, typename S, typename A>
|
721
|
+
void kll_sketch<T, C, S, A>::add_empty_top_level_to_completely_full_sketch() {
|
722
|
+
const uint32_t cur_total_cap = levels_[num_levels_];
|
723
|
+
|
724
|
+
// make sure that we are following a certain growth scheme
|
725
|
+
if (levels_[0] != 0) throw std::logic_error("full sketch expected");
|
726
|
+
if (items_size_ != cur_total_cap) throw std::logic_error("current capacity mismatch");
|
727
|
+
|
728
|
+
// note that merging MIGHT over-grow levels_, in which case we might not have to grow it here
|
729
|
+
const uint8_t new_levels_size = num_levels_ + 2;
|
730
|
+
if (levels_.size() < new_levels_size) {
|
731
|
+
levels_.resize(new_levels_size);
|
732
|
+
}
|
733
|
+
|
734
|
+
const uint32_t delta_cap = kll_helper::level_capacity(k_, num_levels_ + 1, 0, m_);
|
735
|
+
const uint32_t new_total_cap = cur_total_cap + delta_cap;
|
736
|
+
|
737
|
+
// move (and shift) the current data into the new buffer
|
738
|
+
T* new_buf = A().allocate(new_total_cap);
|
739
|
+
kll_helper::move_construct<T>(items_, 0, cur_total_cap, new_buf, delta_cap, true);
|
740
|
+
A().deallocate(items_, items_size_);
|
741
|
+
items_ = new_buf;
|
742
|
+
items_size_ = new_total_cap;
|
743
|
+
|
744
|
+
// this loop includes the old "extra" index at the top
|
745
|
+
for (uint8_t i = 0; i <= num_levels_; i++) {
|
746
|
+
levels_[i] += delta_cap;
|
747
|
+
}
|
748
|
+
|
749
|
+
if (levels_[num_levels_] != new_total_cap) throw std::logic_error("new capacity mismatch");
|
750
|
+
|
751
|
+
num_levels_++;
|
752
|
+
levels_[num_levels_] = new_total_cap; // initialize the new "extra" index at the top
|
753
|
+
}
|
754
|
+
|
755
|
+
template<typename T, typename C, typename S, typename A>
|
756
|
+
void kll_sketch<T, C, S, A>::sort_level_zero() {
|
757
|
+
if (!is_level_zero_sorted_) {
|
758
|
+
std::sort(&items_[levels_[0]], &items_[levels_[1]], C());
|
759
|
+
is_level_zero_sorted_ = true;
|
760
|
+
}
|
761
|
+
}
|
762
|
+
|
763
|
+
template<typename T, typename C, typename S, typename A>
|
764
|
+
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> kll_sketch<T, C, S, A>::get_quantile_calculator() {
|
765
|
+
sort_level_zero();
|
766
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>> AllocCalc;
|
767
|
+
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
|
768
|
+
new (AllocCalc().allocate(1)) kll_quantile_calculator<T, C, A>(items_, levels_.data(), num_levels_, n_),
|
769
|
+
[](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); AllocCalc().deallocate(ptr, 1); }
|
770
|
+
);
|
771
|
+
return quantile_calculator;
|
772
|
+
}
|
773
|
+
|
774
|
+
template<typename T, typename C, typename S, typename A>
|
775
|
+
vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
|
776
|
+
if (is_empty()) return vector_d<A>();
|
777
|
+
kll_helper::validate_values<T, C>(split_points, size);
|
778
|
+
vector_d<A> buckets(size + 1, 0);
|
779
|
+
uint8_t level = 0;
|
780
|
+
uint64_t weight = 1;
|
781
|
+
while (level < num_levels_) {
|
782
|
+
const auto from_index = levels_[level];
|
783
|
+
const auto to_index = levels_[level + 1]; // exclusive
|
784
|
+
if ((level == 0) && !is_level_zero_sorted_) {
|
785
|
+
increment_buckets_unsorted_level(from_index, to_index, weight, split_points, size, buckets.data());
|
786
|
+
} else {
|
787
|
+
increment_buckets_sorted_level(from_index, to_index, weight, split_points, size, buckets.data());
|
788
|
+
}
|
789
|
+
level++;
|
790
|
+
weight *= 2;
|
791
|
+
}
|
792
|
+
// normalize and, if CDF, convert to cumulative
|
793
|
+
if (is_CDF) {
|
794
|
+
double subtotal = 0;
|
795
|
+
for (uint32_t i = 0; i <= size; i++) {
|
796
|
+
subtotal += buckets[i];
|
797
|
+
buckets[i] = subtotal / n_;
|
798
|
+
}
|
799
|
+
} else {
|
800
|
+
for (uint32_t i = 0; i <= size; i++) {
|
801
|
+
buckets[i] /= n_;
|
802
|
+
}
|
803
|
+
}
|
804
|
+
return buckets;
|
805
|
+
}
|
806
|
+
|
807
|
+
template<typename T, typename C, typename S, typename A>
|
808
|
+
void kll_sketch<T, C, S, A>::increment_buckets_unsorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
|
809
|
+
const T* split_points, uint32_t size, double* buckets) const
|
810
|
+
{
|
811
|
+
for (uint32_t i = from_index; i < to_index; i++) {
|
812
|
+
uint32_t j;
|
813
|
+
for (j = 0; j < size; j++) {
|
814
|
+
if (C()(items_[i], split_points[j])) {
|
815
|
+
break;
|
816
|
+
}
|
817
|
+
}
|
818
|
+
buckets[j] += weight;
|
819
|
+
}
|
820
|
+
}
|
821
|
+
|
822
|
+
template<typename T, typename C, typename S, typename A>
|
823
|
+
void kll_sketch<T, C, S, A>::increment_buckets_sorted_level(uint32_t from_index, uint32_t to_index, uint64_t weight,
|
824
|
+
const T* split_points, uint32_t size, double* buckets) const
|
825
|
+
{
|
826
|
+
uint32_t i = from_index;
|
827
|
+
uint32_t j = 0;
|
828
|
+
while ((i < to_index) && (j < size)) {
|
829
|
+
if (C()(items_[i], split_points[j])) {
|
830
|
+
buckets[j] += weight; // this sample goes into this bucket
|
831
|
+
i++; // move on to next sample and see whether it also goes into this bucket
|
832
|
+
} else {
|
833
|
+
j++; // no more samples for this bucket
|
834
|
+
}
|
835
|
+
}
|
836
|
+
// now either i == to_index (we are out of samples), or
|
837
|
+
// j == size (we are out of buckets, but there are more samples remaining)
|
838
|
+
// we only need to do something in the latter case
|
839
|
+
if (j == size) {
|
840
|
+
buckets[j] += weight * (to_index - i);
|
841
|
+
}
|
842
|
+
}
|
843
|
+
|
844
|
+
template<typename T, typename C, typename S, typename A>
|
845
|
+
template<typename O>
|
846
|
+
void kll_sketch<T, C, S, A>::merge_higher_levels(O&& other, uint64_t final_n) {
|
847
|
+
const uint32_t tmp_num_items = get_num_retained() + other.get_num_retained_above_level_zero();
|
848
|
+
auto tmp_items_deleter = [tmp_num_items](T* ptr) { A().deallocate(ptr, tmp_num_items); }; // no destructor needed
|
849
|
+
const std::unique_ptr<T, decltype(tmp_items_deleter)> workbuf(A().allocate(tmp_num_items), tmp_items_deleter);
|
850
|
+
const uint8_t ub = kll_helper::ub_on_num_levels(final_n);
|
851
|
+
const size_t work_levels_size = ub + 2; // ub+1 does not work
|
852
|
+
vector_u32<A> worklevels(work_levels_size);
|
853
|
+
vector_u32<A> outlevels(work_levels_size);
|
854
|
+
|
855
|
+
const uint8_t provisional_num_levels = std::max(num_levels_, other.num_levels_);
|
856
|
+
|
857
|
+
populate_work_arrays(std::forward<O>(other), workbuf.get(), worklevels.data(), provisional_num_levels);
|
858
|
+
|
859
|
+
const kll_helper::compress_result result = kll_helper::general_compress<T, C>(k_, m_, provisional_num_levels, workbuf.get(),
|
860
|
+
worklevels.data(), outlevels.data(), is_level_zero_sorted_);
|
861
|
+
|
862
|
+
// ub can sometimes be much bigger
|
863
|
+
if (result.final_num_levels > ub) throw std::logic_error("merge error");
|
864
|
+
|
865
|
+
// now we need to transfer the results back into "this" sketch
|
866
|
+
if (result.final_capacity != items_size_) {
|
867
|
+
A().deallocate(items_, items_size_);
|
868
|
+
items_size_ = result.final_capacity;
|
869
|
+
items_ = A().allocate(items_size_);
|
870
|
+
}
|
871
|
+
const uint32_t free_space_at_bottom = result.final_capacity - result.final_num_items;
|
872
|
+
kll_helper::move_construct<T>(workbuf.get(), outlevels[0], outlevels[0] + result.final_num_items, items_, free_space_at_bottom, true);
|
873
|
+
|
874
|
+
const size_t new_levels_size = result.final_num_levels + 1;
|
875
|
+
if (levels_.size() < new_levels_size) {
|
876
|
+
levels_.resize(new_levels_size);
|
877
|
+
}
|
878
|
+
const uint32_t offset = free_space_at_bottom - outlevels[0];
|
879
|
+
for (uint8_t lvl = 0; lvl < levels_.size(); lvl++) { // includes the "extra" index
|
880
|
+
levels_[lvl] = outlevels[lvl] + offset;
|
881
|
+
}
|
882
|
+
num_levels_ = result.final_num_levels;
|
883
|
+
}
|
884
|
+
|
885
|
+
// this leaves items_ uninitialized (all objects moved out and destroyed)
|
886
|
+
// this version copies objects from the incoming sketch
|
887
|
+
template<typename T, typename C, typename S, typename A>
|
888
|
+
void kll_sketch<T, C, S, A>::populate_work_arrays(const kll_sketch& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
|
889
|
+
worklevels[0] = 0;
|
890
|
+
|
891
|
+
// the level zero data from "other" was already inserted into "this"
|
892
|
+
kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
|
893
|
+
worklevels[1] = safe_level_size(0);
|
894
|
+
|
895
|
+
for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
|
896
|
+
const uint32_t self_pop = safe_level_size(lvl);
|
897
|
+
const uint32_t other_pop = other.safe_level_size(lvl);
|
898
|
+
worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
|
899
|
+
|
900
|
+
if ((self_pop > 0) && (other_pop == 0)) {
|
901
|
+
kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
|
902
|
+
} else if ((self_pop == 0) && (other_pop > 0)) {
|
903
|
+
kll_helper::copy_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl]);
|
904
|
+
} else if ((self_pop > 0) && (other_pop > 0)) {
|
905
|
+
kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
|
906
|
+
}
|
907
|
+
}
|
908
|
+
}
|
909
|
+
|
910
|
+
// this leaves items_ uninitialized (all objects moved out and destroyed)
|
911
|
+
// this version moves objects from the incoming sketch
|
912
|
+
template<typename T, typename C, typename S, typename A>
|
913
|
+
void kll_sketch<T, C, S, A>::populate_work_arrays(kll_sketch&& other, T* workbuf, uint32_t* worklevels, uint8_t provisional_num_levels) {
|
914
|
+
worklevels[0] = 0;
|
915
|
+
|
916
|
+
// the level zero data from "other" was already inserted into "this"
|
917
|
+
kll_helper::move_construct<T>(items_, levels_[0], levels_[1], workbuf, 0, true);
|
918
|
+
worklevels[1] = safe_level_size(0);
|
919
|
+
|
920
|
+
for (uint8_t lvl = 1; lvl < provisional_num_levels; lvl++) {
|
921
|
+
const uint32_t self_pop = safe_level_size(lvl);
|
922
|
+
const uint32_t other_pop = other.safe_level_size(lvl);
|
923
|
+
worklevels[lvl + 1] = worklevels[lvl] + self_pop + other_pop;
|
924
|
+
|
925
|
+
if ((self_pop > 0) && (other_pop == 0)) {
|
926
|
+
kll_helper::move_construct<T>(items_, levels_[lvl], levels_[lvl] + self_pop, workbuf, worklevels[lvl], true);
|
927
|
+
} else if ((self_pop == 0) && (other_pop > 0)) {
|
928
|
+
kll_helper::move_construct<T>(other.items_, other.levels_[lvl], other.levels_[lvl] + other_pop, workbuf, worklevels[lvl], false);
|
929
|
+
} else if ((self_pop > 0) && (other_pop > 0)) {
|
930
|
+
kll_helper::merge_sorted_arrays<T, C>(items_, levels_[lvl], self_pop, other.items_, other.levels_[lvl], other_pop, workbuf, worklevels[lvl]);
|
931
|
+
}
|
932
|
+
}
|
933
|
+
}
|
934
|
+
|
935
|
+
template<typename T, typename C, typename S, typename A>
|
936
|
+
void kll_sketch<T, C, S, A>::assert_correct_total_weight() const {
|
937
|
+
const uint64_t total(kll_helper::sum_the_sample_weights(num_levels_, levels_.data()));
|
938
|
+
if (total != n_) {
|
939
|
+
throw std::logic_error("Total weight does not match N");
|
940
|
+
}
|
941
|
+
}
|
942
|
+
|
943
|
+
template<typename T, typename C, typename S, typename A>
|
944
|
+
uint32_t kll_sketch<T, C, S, A>::safe_level_size(uint8_t level) const {
|
945
|
+
if (level >= num_levels_) return 0;
|
946
|
+
return levels_[level + 1] - levels_[level];
|
947
|
+
}
|
948
|
+
|
949
|
+
template<typename T, typename C, typename S, typename A>
|
950
|
+
uint32_t kll_sketch<T, C, S, A>::get_num_retained_above_level_zero() const {
|
951
|
+
if (num_levels_ == 1) return 0;
|
952
|
+
return levels_[num_levels_] - levels_[1];
|
953
|
+
}
|
954
|
+
|
955
|
+
template<typename T, typename C, typename S, typename A>
|
956
|
+
void kll_sketch<T, C, S, A>::check_m(uint8_t m) {
|
957
|
+
if (m != DEFAULT_M) {
|
958
|
+
throw std::invalid_argument("Possible corruption: M must be " + std::to_string(DEFAULT_M)
|
959
|
+
+ ": " + std::to_string(m));
|
960
|
+
}
|
961
|
+
}
|
962
|
+
|
963
|
+
template<typename T, typename C, typename S, typename A>
|
964
|
+
void kll_sketch<T, C, S, A>::check_preamble_ints(uint8_t preamble_ints, uint8_t flags_byte) {
|
965
|
+
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
|
966
|
+
const bool is_single_item(flags_byte & (1 << flags::IS_SINGLE_ITEM));
|
967
|
+
if (is_empty || is_single_item) {
|
968
|
+
if (preamble_ints != PREAMBLE_INTS_SHORT) {
|
969
|
+
throw std::invalid_argument("Possible corruption: preamble ints must be "
|
970
|
+
+ std::to_string(PREAMBLE_INTS_SHORT) + " for an empty or single item sketch: " + std::to_string(preamble_ints));
|
971
|
+
}
|
972
|
+
} else {
|
973
|
+
if (preamble_ints != PREAMBLE_INTS_FULL) {
|
974
|
+
throw std::invalid_argument("Possible corruption: preamble ints must be "
|
975
|
+
+ std::to_string(PREAMBLE_INTS_FULL) + " for a sketch with more than one item: " + std::to_string(preamble_ints));
|
976
|
+
}
|
977
|
+
}
|
978
|
+
}
|
979
|
+
|
980
|
+
template<typename T, typename C, typename S, typename A>
|
981
|
+
void kll_sketch<T, C, S, A>::check_serial_version(uint8_t serial_version) {
|
982
|
+
if (serial_version != SERIAL_VERSION_1 && serial_version != SERIAL_VERSION_2) {
|
983
|
+
throw std::invalid_argument("Possible corruption: serial version mismatch: expected "
|
984
|
+
+ std::to_string(SERIAL_VERSION_1) + " or " + std::to_string(SERIAL_VERSION_2)
|
985
|
+
+ ", got " + std::to_string(serial_version));
|
986
|
+
}
|
987
|
+
}
|
988
|
+
|
989
|
+
template<typename T, typename C, typename S, typename A>
|
990
|
+
void kll_sketch<T, C, S, A>::check_family_id(uint8_t family_id) {
|
991
|
+
if (family_id != FAMILY) {
|
992
|
+
throw std::invalid_argument("Possible corruption: family mismatch: expected "
|
993
|
+
+ std::to_string(FAMILY) + ", got " + std::to_string(family_id));
|
994
|
+
}
|
995
|
+
}
|
996
|
+
|
997
|
+
template <typename T, typename C, typename S, typename A>
|
998
|
+
string<A> kll_sketch<T, C, S, A>::to_string(bool print_levels, bool print_items) const {
|
999
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
1000
|
+
os << "### KLL sketch summary:" << std::endl;
|
1001
|
+
os << " K : " << k_ << std::endl;
|
1002
|
+
os << " min K : " << min_k_ << std::endl;
|
1003
|
+
os << " M : " << (unsigned int) m_ << std::endl;
|
1004
|
+
os << " N : " << n_ << std::endl;
|
1005
|
+
os << " Epsilon : " << std::setprecision(3) << get_normalized_rank_error(false) * 100 << "%" << std::endl;
|
1006
|
+
os << " Epsilon PMF : " << get_normalized_rank_error(true) * 100 << "%" << std::endl;
|
1007
|
+
os << " Empty : " << (is_empty() ? "true" : "false") << std::endl;
|
1008
|
+
os << " Estimation mode: " << (is_estimation_mode() ? "true" : "false") << std::endl;
|
1009
|
+
os << " Levels : " << (unsigned int) num_levels_ << std::endl;
|
1010
|
+
os << " Sorted : " << (is_level_zero_sorted_ ? "true" : "false") << std::endl;
|
1011
|
+
os << " Capacity items : " << items_size_ << std::endl;
|
1012
|
+
os << " Retained items : " << get_num_retained() << std::endl;
|
1013
|
+
os << " Storage bytes : " << get_serialized_size_bytes() << std::endl;
|
1014
|
+
if (!is_empty()) {
|
1015
|
+
os << " Min value : " << *min_value_ << std::endl;
|
1016
|
+
os << " Max value : " << *max_value_ << std::endl;
|
1017
|
+
}
|
1018
|
+
os << "### End sketch summary" << std::endl;
|
1019
|
+
|
1020
|
+
if (print_levels) {
|
1021
|
+
os << "### KLL sketch levels:" << std::endl;
|
1022
|
+
os << " index: nominal capacity, actual size" << std::endl;
|
1023
|
+
for (uint8_t i = 0; i < num_levels_; i++) {
|
1024
|
+
os << " " << (unsigned int) i << ": " << kll_helper::level_capacity(k_, num_levels_, i, m_) << ", " << safe_level_size(i) << std::endl;
|
1025
|
+
}
|
1026
|
+
os << "### End sketch levels" << std::endl;
|
1027
|
+
}
|
1028
|
+
|
1029
|
+
if (print_items) {
|
1030
|
+
os << "### KLL sketch data:" << std::endl;
|
1031
|
+
uint8_t level = 0;
|
1032
|
+
while (level < num_levels_) {
|
1033
|
+
const uint32_t from_index = levels_[level];
|
1034
|
+
const uint32_t to_index = levels_[level + 1]; // exclusive
|
1035
|
+
if (from_index < to_index) {
|
1036
|
+
os << " level " << (unsigned int) level << ":" << std::endl;
|
1037
|
+
}
|
1038
|
+
for (uint32_t i = from_index; i < to_index; i++) {
|
1039
|
+
os << " " << items_[i] << std::endl;
|
1040
|
+
}
|
1041
|
+
level++;
|
1042
|
+
}
|
1043
|
+
os << "### End sketch data" << std::endl;
|
1044
|
+
}
|
1045
|
+
return os.str();
|
1046
|
+
}
|
1047
|
+
|
1048
|
+
template <typename T, typename C, typename S, typename A>
|
1049
|
+
typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::begin() const {
|
1050
|
+
return kll_sketch<T, C, S, A>::const_iterator(items_, levels_.data(), num_levels_);
|
1051
|
+
}
|
1052
|
+
|
1053
|
+
template <typename T, typename C, typename S, typename A>
|
1054
|
+
typename kll_sketch<T, C, S, A>::const_iterator kll_sketch<T, C, S, A>::end() const {
|
1055
|
+
return kll_sketch<T, C, S, A>::const_iterator(nullptr, nullptr, num_levels_);
|
1056
|
+
}
|
1057
|
+
|
1058
|
+
// kll_sketch::const_iterator implementation
|
1059
|
+
|
1060
|
+
template<typename T, typename C, typename S, typename A>
|
1061
|
+
kll_sketch<T, C, S, A>::const_iterator::const_iterator(const T* items, const uint32_t* levels, const uint8_t num_levels):
|
1062
|
+
items(items), levels(levels), num_levels(num_levels), index(levels == nullptr ? 0 : levels[0]), level(levels == nullptr ? num_levels : 0), weight(1)
|
1063
|
+
{}
|
1064
|
+
|
1065
|
+
template<typename T, typename C, typename S, typename A>
|
1066
|
+
typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_iterator::operator++() {
|
1067
|
+
++index;
|
1068
|
+
if (index == levels[level + 1]) { // go to the next non-empty level
|
1069
|
+
do {
|
1070
|
+
++level;
|
1071
|
+
weight *= 2;
|
1072
|
+
} while (level < num_levels && levels[level] == levels[level + 1]);
|
1073
|
+
}
|
1074
|
+
return *this;
|
1075
|
+
}
|
1076
|
+
|
1077
|
+
template<typename T, typename C, typename S, typename A>
|
1078
|
+
typename kll_sketch<T, C, S, A>::const_iterator& kll_sketch<T, C, S, A>::const_iterator::operator++(int) {
|
1079
|
+
const_iterator tmp(*this);
|
1080
|
+
operator++();
|
1081
|
+
return tmp;
|
1082
|
+
}
|
1083
|
+
|
1084
|
+
template<typename T, typename C, typename S, typename A>
|
1085
|
+
bool kll_sketch<T, C, S, A>::const_iterator::operator==(const const_iterator& other) const {
|
1086
|
+
if (level != other.level) return false;
|
1087
|
+
if (level == num_levels) return true; // end
|
1088
|
+
return index == other.index;
|
1089
|
+
}
|
1090
|
+
|
1091
|
+
template<typename T, typename C, typename S, typename A>
|
1092
|
+
bool kll_sketch<T, C, S, A>::const_iterator::operator!=(const const_iterator& other) const {
|
1093
|
+
return !operator==(other);
|
1094
|
+
}
|
1095
|
+
|
1096
|
+
template<typename T, typename C, typename S, typename A>
|
1097
|
+
const std::pair<const T&, const uint64_t> kll_sketch<T, C, S, A>::const_iterator::operator*() const {
|
1098
|
+
return std::pair<const T&, const uint64_t>(items[index], weight);
|
1099
|
+
}
|
1100
|
+
|
1101
|
+
template<typename T, typename C, typename S, typename A>
|
1102
|
+
class kll_sketch<T, C, S, A>::item_deleter {
|
1103
|
+
public:
|
1104
|
+
void operator() (T* ptr) const {
|
1105
|
+
if (ptr != nullptr) {
|
1106
|
+
ptr->~T();
|
1107
|
+
A().deallocate(ptr, 1);
|
1108
|
+
}
|
1109
|
+
}
|
1110
|
+
};
|
1111
|
+
|
1112
|
+
template<typename T, typename C, typename S, typename A>
|
1113
|
+
class kll_sketch<T, C, S, A>::items_deleter {
|
1114
|
+
public:
|
1115
|
+
items_deleter(uint32_t start, uint32_t num): start(start), num(num) {}
|
1116
|
+
void operator() (T* ptr) const {
|
1117
|
+
if (ptr != nullptr) {
|
1118
|
+
for (uint32_t i = start; i < num; ++i) {
|
1119
|
+
ptr[i].~T();
|
1120
|
+
}
|
1121
|
+
A().deallocate(ptr, num);
|
1122
|
+
}
|
1123
|
+
}
|
1124
|
+
private:
|
1125
|
+
uint32_t start;
|
1126
|
+
uint32_t num;
|
1127
|
+
};
|
1128
|
+
|
1129
|
+
} /* namespace datasketches */
|
1130
|
+
|
1131
|
+
#endif
|