datasketches 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
@@ -0,0 +1,810 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef CPC_SKETCH_IMPL_HPP_
|
21
|
+
#define CPC_SKETCH_IMPL_HPP_
|
22
|
+
|
23
|
+
#include <stdexcept>
|
24
|
+
#include <cmath>
|
25
|
+
#include <cstring>
|
26
|
+
#include <sstream>
|
27
|
+
|
28
|
+
#include "cpc_confidence.hpp"
|
29
|
+
#include "kxp_byte_lookup.hpp"
|
30
|
+
#include "inv_pow2_table.hpp"
|
31
|
+
#include "cpc_util.hpp"
|
32
|
+
#include "icon_estimator.hpp"
|
33
|
+
#include "serde.hpp"
|
34
|
+
#include "count_zeros.hpp"
|
35
|
+
|
36
|
+
namespace datasketches {
|
37
|
+
|
38
|
+
template<typename A>
|
39
|
+
void cpc_init() {
|
40
|
+
get_compressor<A>(); // this initializes a global static instance of the compressor on the first use
|
41
|
+
}
|
42
|
+
|
43
|
+
template<typename A>
|
44
|
+
cpc_sketch_alloc<A>::cpc_sketch_alloc(uint8_t lg_k, uint64_t seed):
|
45
|
+
lg_k(lg_k),
|
46
|
+
seed(seed),
|
47
|
+
was_merged(false),
|
48
|
+
num_coupons(0),
|
49
|
+
surprising_value_table(2, 6 + lg_k),
|
50
|
+
sliding_window(),
|
51
|
+
window_offset(0),
|
52
|
+
first_interesting_column(0),
|
53
|
+
kxp(1 << lg_k),
|
54
|
+
hip_est_accum(0)
|
55
|
+
{
|
56
|
+
if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
|
57
|
+
throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
template<typename A>
|
62
|
+
uint8_t cpc_sketch_alloc<A>::get_lg_k() const {
|
63
|
+
return lg_k;
|
64
|
+
}
|
65
|
+
|
66
|
+
template<typename A>
|
67
|
+
bool cpc_sketch_alloc<A>::is_empty() const {
|
68
|
+
return num_coupons == 0;
|
69
|
+
}
|
70
|
+
|
71
|
+
template<typename A>
|
72
|
+
double cpc_sketch_alloc<A>::get_estimate() const {
|
73
|
+
if (!was_merged) return get_hip_estimate();
|
74
|
+
return get_icon_estimate();
|
75
|
+
}
|
76
|
+
|
77
|
+
template<typename A>
|
78
|
+
double cpc_sketch_alloc<A>::get_hip_estimate() const {
|
79
|
+
return hip_est_accum;
|
80
|
+
}
|
81
|
+
|
82
|
+
template<typename A>
|
83
|
+
double cpc_sketch_alloc<A>::get_icon_estimate() const {
|
84
|
+
return compute_icon_estimate(lg_k, num_coupons);
|
85
|
+
}
|
86
|
+
|
87
|
+
template<typename A>
|
88
|
+
double cpc_sketch_alloc<A>::get_lower_bound(unsigned kappa) const {
|
89
|
+
if (kappa < 1 || kappa > 3) {
|
90
|
+
throw std::invalid_argument("kappa must be 1, 2 or 3");
|
91
|
+
}
|
92
|
+
if (!was_merged) return get_hip_confidence_lb<A>(*this, kappa);
|
93
|
+
return get_icon_confidence_lb<A>(*this, kappa);
|
94
|
+
}
|
95
|
+
|
96
|
+
template<typename A>
|
97
|
+
double cpc_sketch_alloc<A>::get_upper_bound(unsigned kappa) const {
|
98
|
+
if (kappa < 1 || kappa > 3) {
|
99
|
+
throw std::invalid_argument("kappa must be 1, 2 or 3");
|
100
|
+
}
|
101
|
+
if (!was_merged) return get_hip_confidence_ub<A>(*this, kappa);
|
102
|
+
return get_icon_confidence_ub<A>(*this, kappa);
|
103
|
+
}
|
104
|
+
|
105
|
+
template<typename A>
|
106
|
+
void cpc_sketch_alloc<A>::update(const std::string& value) {
|
107
|
+
if (value.empty()) return;
|
108
|
+
update(value.c_str(), value.length());
|
109
|
+
}
|
110
|
+
|
111
|
+
template<typename A>
|
112
|
+
void cpc_sketch_alloc<A>::update(uint64_t value) {
|
113
|
+
update(&value, sizeof(value));
|
114
|
+
}
|
115
|
+
|
116
|
+
template<typename A>
|
117
|
+
void cpc_sketch_alloc<A>::update(int64_t value) {
|
118
|
+
update(&value, sizeof(value));
|
119
|
+
}
|
120
|
+
|
121
|
+
template<typename A>
|
122
|
+
void cpc_sketch_alloc<A>::update(uint32_t value) {
|
123
|
+
update(static_cast<int32_t>(value));
|
124
|
+
}
|
125
|
+
|
126
|
+
template<typename A>
|
127
|
+
void cpc_sketch_alloc<A>::update(int32_t value) {
|
128
|
+
update(static_cast<int64_t>(value));
|
129
|
+
}
|
130
|
+
|
131
|
+
template<typename A>
|
132
|
+
void cpc_sketch_alloc<A>::update(uint16_t value) {
|
133
|
+
update(static_cast<int16_t>(value));
|
134
|
+
}
|
135
|
+
|
136
|
+
template<typename A>
|
137
|
+
void cpc_sketch_alloc<A>::update(int16_t value) {
|
138
|
+
update(static_cast<int64_t>(value));
|
139
|
+
}
|
140
|
+
|
141
|
+
template<typename A>
|
142
|
+
void cpc_sketch_alloc<A>::update(uint8_t value) {
|
143
|
+
update(static_cast<int8_t>(value));
|
144
|
+
}
|
145
|
+
|
146
|
+
template<typename A>
|
147
|
+
void cpc_sketch_alloc<A>::update(int8_t value) {
|
148
|
+
update(static_cast<int64_t>(value));
|
149
|
+
}
|
150
|
+
|
151
|
+
template<typename A>
|
152
|
+
void cpc_sketch_alloc<A>::update(double value) {
|
153
|
+
union {
|
154
|
+
int64_t long_value;
|
155
|
+
double double_value;
|
156
|
+
} ldu;
|
157
|
+
if (value == 0.0) {
|
158
|
+
ldu.double_value = 0.0; // canonicalize -0.0 to 0.0
|
159
|
+
} else if (std::isnan(value)) {
|
160
|
+
ldu.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
|
161
|
+
} else {
|
162
|
+
ldu.double_value = value;
|
163
|
+
}
|
164
|
+
update(&ldu, sizeof(ldu));
|
165
|
+
}
|
166
|
+
|
167
|
+
template<typename A>
|
168
|
+
void cpc_sketch_alloc<A>::update(float value) {
|
169
|
+
update(static_cast<double>(value));
|
170
|
+
}
|
171
|
+
|
172
|
+
static inline uint32_t row_col_from_two_hashes(uint64_t hash0, uint64_t hash1, uint8_t lg_k) {
|
173
|
+
if (lg_k > 26) throw std::logic_error("lg_k > 26");
|
174
|
+
const uint64_t k = 1 << lg_k;
|
175
|
+
uint8_t col = count_leading_zeros_in_u64(hash1); // 0 <= col <= 64
|
176
|
+
if (col > 63) col = 63; // clip so that 0 <= col <= 63
|
177
|
+
const uint32_t row = hash0 & (k - 1);
|
178
|
+
uint32_t row_col = (row << 6) | col;
|
179
|
+
// To avoid the hash table's "empty" value, we change the row of the following pair.
|
180
|
+
// This case is extremely unlikely, but we might as well handle it.
|
181
|
+
if (row_col == UINT32_MAX) row_col ^= 1 << 6;
|
182
|
+
return row_col;
|
183
|
+
}
|
184
|
+
|
185
|
+
template<typename A>
|
186
|
+
void cpc_sketch_alloc<A>::update(const void* value, int size) {
|
187
|
+
HashState hashes;
|
188
|
+
MurmurHash3_x64_128(value, size, seed, hashes);
|
189
|
+
row_col_update(row_col_from_two_hashes(hashes.h1, hashes.h2, lg_k));
|
190
|
+
}
|
191
|
+
|
192
|
+
template<typename A>
|
193
|
+
void cpc_sketch_alloc<A>::row_col_update(uint32_t row_col) {
|
194
|
+
const uint8_t col = row_col & 63;
|
195
|
+
if (col < first_interesting_column) return; // important speed optimization
|
196
|
+
// window size is 0 until sketch is promoted from sparse to windowed
|
197
|
+
if (sliding_window.size() == 0) {
|
198
|
+
update_sparse(row_col);
|
199
|
+
} else {
|
200
|
+
update_windowed(row_col);
|
201
|
+
}
|
202
|
+
}
|
203
|
+
|
204
|
+
template<typename A>
|
205
|
+
void cpc_sketch_alloc<A>::update_sparse(uint32_t row_col) {
|
206
|
+
const uint64_t k = 1 << lg_k;
|
207
|
+
const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
|
208
|
+
if (c32pre >= 3 * k) throw std::logic_error("c32pre >= 3 * k"); // C < 3K/32, in other words flavor == SPARSE
|
209
|
+
bool is_novel = surprising_value_table.maybe_insert(row_col);
|
210
|
+
if (is_novel) {
|
211
|
+
num_coupons++;
|
212
|
+
update_hip(row_col);
|
213
|
+
const uint64_t c32post = static_cast<uint64_t>(num_coupons) << 5;
|
214
|
+
if (c32post >= 3 * k) promote_sparse_to_windowed(); // C >= 3K/32
|
215
|
+
}
|
216
|
+
}
|
217
|
+
|
218
|
+
// the flavor is HYBRID, PINNED, or SLIDING
|
219
|
+
template<typename A>
|
220
|
+
void cpc_sketch_alloc<A>::update_windowed(uint32_t row_col) {
|
221
|
+
if (window_offset > 56) throw std::logic_error("wrong window offset");
|
222
|
+
const uint64_t k = 1 << lg_k;
|
223
|
+
const uint64_t c32pre = static_cast<uint64_t>(num_coupons) << 5;
|
224
|
+
if (c32pre < 3 * k) throw std::logic_error("c32pre < 3 * k"); // C < 3K/32, in other words flavor >= HYBRID
|
225
|
+
const uint64_t c8pre = static_cast<uint64_t>(num_coupons) << 3;
|
226
|
+
const uint64_t w8pre = static_cast<uint64_t>(window_offset) << 3;
|
227
|
+
if (c8pre >= (27 + w8pre) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset)
|
228
|
+
|
229
|
+
bool is_novel = false;
|
230
|
+
const uint8_t col = row_col & 63;
|
231
|
+
|
232
|
+
if (col < window_offset) { // track the surprising 0's "before" the window
|
233
|
+
is_novel = surprising_value_table.maybe_delete(row_col); // inverted logic
|
234
|
+
} else if (col < window_offset + 8) { // track the 8 bits inside the window
|
235
|
+
if (col < window_offset) throw std::logic_error("col < window_offset");
|
236
|
+
const uint32_t row = row_col >> 6;
|
237
|
+
const uint8_t old_bits = sliding_window[row];
|
238
|
+
const uint8_t new_bits = old_bits | (1 << (col - window_offset));
|
239
|
+
if (new_bits != old_bits) {
|
240
|
+
sliding_window[row] = new_bits;
|
241
|
+
is_novel = true;
|
242
|
+
}
|
243
|
+
} else { // track the surprising 1's "after" the window
|
244
|
+
if (col < window_offset + 8) throw std::logic_error("col < window_offset + 8");
|
245
|
+
is_novel = surprising_value_table.maybe_insert(row_col); // normal logic
|
246
|
+
}
|
247
|
+
|
248
|
+
if (is_novel) {
|
249
|
+
num_coupons++;
|
250
|
+
update_hip(row_col);
|
251
|
+
const uint64_t c8post = static_cast<uint64_t>(num_coupons) << 3;
|
252
|
+
if (c8post >= (27 + w8pre) * k) {
|
253
|
+
move_window();
|
254
|
+
if (window_offset < 1 || window_offset > 56) throw std::logic_error("wrong window offset");
|
255
|
+
const uint64_t w8post = static_cast<uint64_t>(window_offset) << 3;
|
256
|
+
if (c8post >= (27 + w8post) * k) throw std::logic_error("c8pre is wrong"); // C < (K * 27/8) + (K * window_offset)
|
257
|
+
}
|
258
|
+
}
|
259
|
+
}
|
260
|
+
|
261
|
+
// Call this whenever a new coupon has been collected.
|
262
|
+
template<typename A>
|
263
|
+
void cpc_sketch_alloc<A>::update_hip(uint32_t row_col) {
|
264
|
+
const uint64_t k = 1 << lg_k;
|
265
|
+
const uint8_t col = row_col & 63;
|
266
|
+
const double one_over_p = static_cast<double>(k) / kxp;
|
267
|
+
hip_est_accum += one_over_p;
|
268
|
+
kxp -= INVERSE_POWERS_OF_2[col + 1]; // notice the "+1"
|
269
|
+
}
|
270
|
+
|
271
|
+
// In terms of flavor, this promotes SPARSE to HYBRID
|
272
|
+
template<typename A>
|
273
|
+
void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
|
274
|
+
const uint64_t k = 1 << lg_k;
|
275
|
+
const uint64_t c32 = static_cast<uint64_t>(num_coupons) << 5;
|
276
|
+
if (!(c32 == 3 * k || (lg_k == 4 && c32 > 3 * k))) throw std::logic_error("wrong c32");
|
277
|
+
|
278
|
+
sliding_window.resize(k, 0); // zero the memory (because we will be OR'ing into it)
|
279
|
+
|
280
|
+
u32_table<A> new_table(2, 6 + lg_k);
|
281
|
+
|
282
|
+
const uint32_t* old_slots = surprising_value_table.get_slots();
|
283
|
+
const size_t old_num_slots = 1 << surprising_value_table.get_lg_size();
|
284
|
+
|
285
|
+
if (window_offset != 0) throw std::logic_error("window_offset != 0");
|
286
|
+
|
287
|
+
for (size_t i = 0; i < old_num_slots; i++) {
|
288
|
+
const uint32_t row_col = old_slots[i];
|
289
|
+
if (row_col != UINT32_MAX) {
|
290
|
+
const uint8_t col = row_col & 63;
|
291
|
+
if (col < 8) {
|
292
|
+
const size_t row = row_col >> 6;
|
293
|
+
sliding_window[row] |= 1 << col;
|
294
|
+
} else {
|
295
|
+
// cannot use u32_table::must_insert(), because it doesn't provide for growth
|
296
|
+
const bool is_novel = new_table.maybe_insert(row_col);
|
297
|
+
if (!is_novel) throw std::logic_error("is_novel != true");
|
298
|
+
}
|
299
|
+
}
|
300
|
+
}
|
301
|
+
|
302
|
+
surprising_value_table = std::move(new_table);
|
303
|
+
}
|
304
|
+
|
305
|
+
template<typename A>
|
306
|
+
void cpc_sketch_alloc<A>::move_window() {
|
307
|
+
const uint8_t new_offset = window_offset + 1;
|
308
|
+
if (new_offset > 56) throw std::logic_error("new_offset > 56");
|
309
|
+
if (new_offset != determine_correct_offset(lg_k, num_coupons)) throw std::logic_error("new_offset is wrong");
|
310
|
+
|
311
|
+
if (sliding_window.size() == 0) throw std::logic_error("no sliding window");
|
312
|
+
const uint64_t k = 1 << lg_k;
|
313
|
+
|
314
|
+
// Construct the full-sized bit matrix that corresponds to the sketch
|
315
|
+
vector_u64<A> bit_matrix = build_bit_matrix();
|
316
|
+
|
317
|
+
// refresh the KXP register on every 8th window shift.
|
318
|
+
if ((new_offset & 0x7) == 0) refresh_kxp(bit_matrix.data());
|
319
|
+
|
320
|
+
surprising_value_table.clear(); // the new number of surprises will be about the same
|
321
|
+
|
322
|
+
const uint64_t mask_for_clearing_window = (static_cast<uint64_t>(0xff) << new_offset) ^ UINT64_MAX;
|
323
|
+
const uint64_t mask_for_flipping_early_zone = (static_cast<uint64_t>(1) << new_offset) - 1;
|
324
|
+
uint64_t all_surprises_ored = 0;
|
325
|
+
|
326
|
+
for (size_t i = 0; i < k; i++) {
|
327
|
+
uint64_t pattern = bit_matrix[i];
|
328
|
+
sliding_window[i] = (pattern >> new_offset) & 0xff;
|
329
|
+
pattern &= mask_for_clearing_window;
|
330
|
+
// The following line converts surprising 0's to 1's in the "early zone",
|
331
|
+
// (and vice versa, which is essential for this procedure's O(k) time cost).
|
332
|
+
pattern ^= mask_for_flipping_early_zone;
|
333
|
+
all_surprises_ored |= pattern; // a cheap way to recalculate first_interesting_column
|
334
|
+
while (pattern != 0) {
|
335
|
+
const uint8_t col = count_trailing_zeros_in_u64(pattern);
|
336
|
+
pattern = pattern ^ (static_cast<uint64_t>(1) << col); // erase the 1
|
337
|
+
const uint32_t row_col = (i << 6) | col;
|
338
|
+
const bool is_novel = surprising_value_table.maybe_insert(row_col);
|
339
|
+
if (!is_novel) throw std::logic_error("is_novel != true");
|
340
|
+
}
|
341
|
+
}
|
342
|
+
|
343
|
+
window_offset = new_offset;
|
344
|
+
|
345
|
+
first_interesting_column = count_trailing_zeros_in_u64(all_surprises_ored);
|
346
|
+
if (first_interesting_column > new_offset) first_interesting_column = new_offset; // corner case
|
347
|
+
}
|
348
|
+
|
349
|
+
// The KXP register is a double with roughly 50 bits of precision, but
|
350
|
+
// it might need roughly 90 bits to track the value with perfect accuracy.
|
351
|
+
// Therefore we recalculate KXP occasionally from the sketch's full bitmatrix
|
352
|
+
// so that it will reflect changes that were previously outside the mantissa.
|
353
|
+
template<typename A>
|
354
|
+
void cpc_sketch_alloc<A>::refresh_kxp(const uint64_t* bit_matrix) {
|
355
|
+
const uint64_t k = 1 << lg_k;
|
356
|
+
|
357
|
+
// for improved numerical accuracy, we separately sum the bytes of the U64's
|
358
|
+
double byte_sums[8]; // allocating on the stack
|
359
|
+
std::fill(byte_sums, &byte_sums[8], 0);
|
360
|
+
|
361
|
+
for (size_t i = 0; i < k; i++) {
|
362
|
+
uint64_t word = bit_matrix[i];
|
363
|
+
for (unsigned j = 0; j < 8; j++) {
|
364
|
+
const uint8_t byte = word & 0xff;
|
365
|
+
byte_sums[j] += KXP_BYTE_TABLE[byte];
|
366
|
+
word >>= 8;
|
367
|
+
}
|
368
|
+
}
|
369
|
+
|
370
|
+
double total = 0.0;
|
371
|
+
for (int j = 7; j >= 0; j--) { // the reverse order is important
|
372
|
+
const double factor = INVERSE_POWERS_OF_2[8 * j]; // pow (256.0, (-1.0 * ((double) j)));
|
373
|
+
total += factor * byte_sums[j];
|
374
|
+
}
|
375
|
+
|
376
|
+
kxp = total;
|
377
|
+
}
|
378
|
+
|
379
|
+
template<typename A>
|
380
|
+
string<A> cpc_sketch_alloc<A>::to_string() const {
|
381
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
382
|
+
os << "### CPC sketch summary:" << std::endl;
|
383
|
+
os << " lg_k : " << std::to_string(lg_k) << std::endl;
|
384
|
+
os << " seed hash : " << std::hex << compute_seed_hash(seed) << std::dec << std::endl;
|
385
|
+
os << " C : " << num_coupons << std::endl;
|
386
|
+
os << " flavor : " << determine_flavor() << std::endl;
|
387
|
+
os << " merged : " << (was_merged ? "true" : "false") << std::endl;
|
388
|
+
if (!was_merged) {
|
389
|
+
os << " HIP estimate : " << hip_est_accum << std::endl;
|
390
|
+
os << " kxp : " << kxp << std::endl;
|
391
|
+
}
|
392
|
+
os << " intresting col : " << std::to_string(first_interesting_column) << std::endl;
|
393
|
+
os << " table entries : " << surprising_value_table.get_num_items() << std::endl;
|
394
|
+
os << " window : " << (sliding_window.size() == 0 ? "not " : "") << "allocated" << std::endl;
|
395
|
+
if (sliding_window.size() > 0) {
|
396
|
+
os << " window offset : " << std::to_string(window_offset) << std::endl;
|
397
|
+
}
|
398
|
+
os << "### End sketch summary" << std::endl;
|
399
|
+
return os.str();
|
400
|
+
}
|
401
|
+
|
402
|
+
template<typename A>
|
403
|
+
void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
|
404
|
+
compressed_state<A> compressed;
|
405
|
+
compressed.table_data_words = 0;
|
406
|
+
compressed.table_num_entries = 0;
|
407
|
+
compressed.window_data_words = 0;
|
408
|
+
get_compressor<A>().compress(*this, compressed);
|
409
|
+
const bool has_hip = !was_merged;
|
410
|
+
const bool has_table = compressed.table_data.size() > 0;
|
411
|
+
const bool has_window = compressed.window_data.size() > 0;
|
412
|
+
const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
|
413
|
+
os.write(reinterpret_cast<const char*>(&preamble_ints), sizeof(preamble_ints));
|
414
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
415
|
+
os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
|
416
|
+
const uint8_t family = FAMILY;
|
417
|
+
os.write(reinterpret_cast<const char*>(&family), sizeof(family));
|
418
|
+
os.write(reinterpret_cast<const char*>(&lg_k), sizeof(lg_k));
|
419
|
+
os.write(reinterpret_cast<const char*>(&first_interesting_column), sizeof(first_interesting_column));
|
420
|
+
const uint8_t flags_byte(
|
421
|
+
(1 << flags::IS_COMPRESSED)
|
422
|
+
| (has_hip ? 1 << flags::HAS_HIP : 0)
|
423
|
+
| (has_table ? 1 << flags::HAS_TABLE : 0)
|
424
|
+
| (has_window ? 1 << flags::HAS_WINDOW : 0)
|
425
|
+
);
|
426
|
+
os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
|
427
|
+
const uint16_t seed_hash(compute_seed_hash(seed));
|
428
|
+
os.write((char*)&seed_hash, sizeof(seed_hash));
|
429
|
+
if (!is_empty()) {
|
430
|
+
os.write((char*)&num_coupons, sizeof(num_coupons));
|
431
|
+
if (has_table && has_window) {
|
432
|
+
// if there is no window it is the same as number of coupons
|
433
|
+
os.write((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
|
434
|
+
// HIP values can be in two different places in the sequence of fields
|
435
|
+
// this is the first HIP decision point
|
436
|
+
if (has_hip) write_hip(os);
|
437
|
+
}
|
438
|
+
if (has_table) {
|
439
|
+
os.write((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
|
440
|
+
}
|
441
|
+
if (has_window) {
|
442
|
+
os.write((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
|
443
|
+
}
|
444
|
+
// this is the second HIP decision point
|
445
|
+
if (has_hip && !(has_table && has_window)) write_hip(os);
|
446
|
+
if (has_window) {
|
447
|
+
os.write((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
|
448
|
+
}
|
449
|
+
if (has_table) {
|
450
|
+
os.write((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
|
451
|
+
}
|
452
|
+
}
|
453
|
+
}
|
454
|
+
|
455
|
+
template<typename A>
|
456
|
+
vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
|
457
|
+
compressed_state<A> compressed;
|
458
|
+
compressed.table_data_words = 0;
|
459
|
+
compressed.table_num_entries = 0;
|
460
|
+
compressed.window_data_words = 0;
|
461
|
+
get_compressor<A>().compress(*this, compressed);
|
462
|
+
const bool has_hip = !was_merged;
|
463
|
+
const bool has_table = compressed.table_data.size() > 0;
|
464
|
+
const bool has_window = compressed.window_data.size() > 0;
|
465
|
+
const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
|
466
|
+
const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
|
467
|
+
vector_u8<A> bytes(size);
|
468
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
469
|
+
ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
|
470
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
471
|
+
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
|
472
|
+
const uint8_t family = FAMILY;
|
473
|
+
ptr += copy_to_mem(&family, ptr, sizeof(family));
|
474
|
+
ptr += copy_to_mem(&lg_k, ptr, sizeof(lg_k));
|
475
|
+
ptr += copy_to_mem(&first_interesting_column, ptr, sizeof(first_interesting_column));
|
476
|
+
const uint8_t flags_byte(
|
477
|
+
(1 << flags::IS_COMPRESSED)
|
478
|
+
| (has_hip ? 1 << flags::HAS_HIP : 0)
|
479
|
+
| (has_table ? 1 << flags::HAS_TABLE : 0)
|
480
|
+
| (has_window ? 1 << flags::HAS_WINDOW : 0)
|
481
|
+
);
|
482
|
+
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
|
483
|
+
const uint16_t seed_hash = compute_seed_hash(seed);
|
484
|
+
ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
|
485
|
+
if (!is_empty()) {
|
486
|
+
ptr += copy_to_mem(&num_coupons, ptr, sizeof(num_coupons));
|
487
|
+
if (has_table && has_window) {
|
488
|
+
// if there is no window it is the same as number of coupons
|
489
|
+
ptr += copy_to_mem(&compressed.table_num_entries, ptr, sizeof(compressed.table_num_entries));
|
490
|
+
// HIP values can be in two different places in the sequence of fields
|
491
|
+
// this is the first HIP decision point
|
492
|
+
if (has_hip) ptr += copy_hip_to_mem(ptr);
|
493
|
+
}
|
494
|
+
if (has_table) {
|
495
|
+
ptr += copy_to_mem(&compressed.table_data_words, ptr, sizeof(compressed.table_data_words));
|
496
|
+
}
|
497
|
+
if (has_window) {
|
498
|
+
ptr += copy_to_mem(&compressed.window_data_words, ptr, sizeof(compressed.window_data_words));
|
499
|
+
}
|
500
|
+
// this is the second HIP decision point
|
501
|
+
if (has_hip && !(has_table && has_window)) ptr += copy_hip_to_mem(ptr);
|
502
|
+
if (has_window) {
|
503
|
+
ptr += copy_to_mem(compressed.window_data.data(), ptr, compressed.window_data_words * sizeof(uint32_t));
|
504
|
+
}
|
505
|
+
if (has_table) {
|
506
|
+
ptr += copy_to_mem(compressed.table_data.data(), ptr, compressed.table_data_words * sizeof(uint32_t));
|
507
|
+
}
|
508
|
+
}
|
509
|
+
if (ptr != bytes.data() + size) throw std::logic_error("serialized size mismatch");
|
510
|
+
return bytes;
|
511
|
+
}
|
512
|
+
|
513
|
+
template<typename A>
|
514
|
+
cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
|
515
|
+
uint8_t preamble_ints;
|
516
|
+
is.read((char*)&preamble_ints, sizeof(preamble_ints));
|
517
|
+
uint8_t serial_version;
|
518
|
+
is.read((char*)&serial_version, sizeof(serial_version));
|
519
|
+
uint8_t family_id;
|
520
|
+
is.read((char*)&family_id, sizeof(family_id));
|
521
|
+
uint8_t lg_k;
|
522
|
+
is.read((char*)&lg_k, sizeof(lg_k));
|
523
|
+
uint8_t first_interesting_column;
|
524
|
+
is.read((char*)&first_interesting_column, sizeof(first_interesting_column));
|
525
|
+
uint8_t flags_byte;
|
526
|
+
is.read((char*)&flags_byte, sizeof(flags_byte));
|
527
|
+
uint16_t seed_hash;
|
528
|
+
is.read((char*)&seed_hash, sizeof(seed_hash));
|
529
|
+
const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
|
530
|
+
const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
|
531
|
+
const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
|
532
|
+
compressed_state<A> compressed;
|
533
|
+
compressed.table_data_words = 0;
|
534
|
+
compressed.table_num_entries = 0;
|
535
|
+
compressed.window_data_words = 0;
|
536
|
+
uint32_t num_coupons = 0;
|
537
|
+
double kxp = 0;
|
538
|
+
double hip_est_accum = 0;
|
539
|
+
if (has_table || has_window) {
|
540
|
+
is.read((char*)&num_coupons, sizeof(num_coupons));
|
541
|
+
if (has_table && has_window) {
|
542
|
+
is.read((char*)&compressed.table_num_entries, sizeof(compressed.table_num_entries));
|
543
|
+
if (has_hip) {
|
544
|
+
is.read((char*)&kxp, sizeof(kxp));
|
545
|
+
is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
|
546
|
+
}
|
547
|
+
}
|
548
|
+
if (has_table) {
|
549
|
+
is.read((char*)&compressed.table_data_words, sizeof(compressed.table_data_words));
|
550
|
+
}
|
551
|
+
if (has_window) {
|
552
|
+
is.read((char*)&compressed.window_data_words, sizeof(compressed.window_data_words));
|
553
|
+
}
|
554
|
+
if (has_hip && !(has_table && has_window)) {
|
555
|
+
is.read((char*)&kxp, sizeof(kxp));
|
556
|
+
is.read((char*)&hip_est_accum, sizeof(hip_est_accum));
|
557
|
+
}
|
558
|
+
if (has_window) {
|
559
|
+
compressed.window_data.resize(compressed.window_data_words);
|
560
|
+
is.read((char*)compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
|
561
|
+
}
|
562
|
+
if (has_table) {
|
563
|
+
compressed.table_data.resize(compressed.table_data_words);
|
564
|
+
is.read((char*)compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
|
565
|
+
}
|
566
|
+
if (!has_window) compressed.table_num_entries = num_coupons;
|
567
|
+
}
|
568
|
+
|
569
|
+
uint8_t expected_preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
|
570
|
+
if (preamble_ints != expected_preamble_ints) {
|
571
|
+
throw std::invalid_argument("Possible corruption: preamble ints: expected "
|
572
|
+
+ std::to_string(expected_preamble_ints) + ", got " + std::to_string(preamble_ints));
|
573
|
+
}
|
574
|
+
if (serial_version != SERIAL_VERSION) {
|
575
|
+
throw std::invalid_argument("Possible corruption: serial version: expected "
|
576
|
+
+ std::to_string(SERIAL_VERSION) + ", got " + std::to_string(serial_version));
|
577
|
+
}
|
578
|
+
if (family_id != FAMILY) {
|
579
|
+
throw std::invalid_argument("Possible corruption: family: expected "
|
580
|
+
+ std::to_string(FAMILY) + ", got " + std::to_string(family_id));
|
581
|
+
}
|
582
|
+
if (seed_hash != compute_seed_hash(seed)) {
|
583
|
+
throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
|
584
|
+
+ std::to_string(compute_seed_hash(seed)));
|
585
|
+
}
|
586
|
+
uncompressed_state<A> uncompressed;
|
587
|
+
get_compressor<A>().uncompress(compressed, uncompressed, lg_k, num_coupons);
|
588
|
+
if (!is.good())
|
589
|
+
throw std::runtime_error("error reading from std::istream");
|
590
|
+
return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(uncompressed.table),
|
591
|
+
std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
|
592
|
+
}
|
593
|
+
|
594
|
+
template<typename A>
|
595
|
+
cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
|
596
|
+
ensure_minimum_memory(size, 8);
|
597
|
+
const char* ptr = static_cast<const char*>(bytes);
|
598
|
+
const char* base = static_cast<const char*>(bytes);
|
599
|
+
uint8_t preamble_ints;
|
600
|
+
ptr += copy_from_mem(ptr, &preamble_ints, sizeof(preamble_ints));
|
601
|
+
uint8_t serial_version;
|
602
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
603
|
+
uint8_t family_id;
|
604
|
+
ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
|
605
|
+
uint8_t lg_k;
|
606
|
+
ptr += copy_from_mem(ptr, &lg_k, sizeof(lg_k));
|
607
|
+
uint8_t first_interesting_column;
|
608
|
+
ptr += copy_from_mem(ptr, &first_interesting_column, sizeof(first_interesting_column));
|
609
|
+
uint8_t flags_byte;
|
610
|
+
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
611
|
+
uint16_t seed_hash;
|
612
|
+
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
|
613
|
+
const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
|
614
|
+
const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
|
615
|
+
const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
|
616
|
+
ensure_minimum_memory(size, preamble_ints << 2);
|
617
|
+
compressed_state<A> compressed;
|
618
|
+
compressed.table_data_words = 0;
|
619
|
+
compressed.table_num_entries = 0;
|
620
|
+
compressed.window_data_words = 0;
|
621
|
+
uint32_t num_coupons = 0;
|
622
|
+
double kxp = 0;
|
623
|
+
double hip_est_accum = 0;
|
624
|
+
if (has_table || has_window) {
|
625
|
+
check_memory_size(ptr - base + sizeof(num_coupons), size);
|
626
|
+
ptr += copy_from_mem(ptr, &num_coupons, sizeof(num_coupons));
|
627
|
+
if (has_table && has_window) {
|
628
|
+
check_memory_size(ptr - base + sizeof(compressed.table_num_entries), size);
|
629
|
+
ptr += copy_from_mem(ptr, &compressed.table_num_entries, sizeof(compressed.table_num_entries));
|
630
|
+
if (has_hip) {
|
631
|
+
check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
|
632
|
+
ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
|
633
|
+
ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
|
634
|
+
}
|
635
|
+
}
|
636
|
+
if (has_table) {
|
637
|
+
check_memory_size(ptr - base + sizeof(compressed.table_data_words), size);
|
638
|
+
ptr += copy_from_mem(ptr, &compressed.table_data_words, sizeof(compressed.table_data_words));
|
639
|
+
}
|
640
|
+
if (has_window) {
|
641
|
+
check_memory_size(ptr - base + sizeof(compressed.window_data_words), size);
|
642
|
+
ptr += copy_from_mem(ptr, &compressed.window_data_words, sizeof(compressed.window_data_words));
|
643
|
+
}
|
644
|
+
if (has_hip && !(has_table && has_window)) {
|
645
|
+
check_memory_size(ptr - base + sizeof(kxp) + sizeof(hip_est_accum), size);
|
646
|
+
ptr += copy_from_mem(ptr, &kxp, sizeof(kxp));
|
647
|
+
ptr += copy_from_mem(ptr, &hip_est_accum, sizeof(hip_est_accum));
|
648
|
+
}
|
649
|
+
if (has_window) {
|
650
|
+
compressed.window_data.resize(compressed.window_data_words);
|
651
|
+
check_memory_size(ptr - base + (compressed.window_data_words * sizeof(uint32_t)), size);
|
652
|
+
ptr += copy_from_mem(ptr, compressed.window_data.data(), compressed.window_data_words * sizeof(uint32_t));
|
653
|
+
}
|
654
|
+
if (has_table) {
|
655
|
+
compressed.table_data.resize(compressed.table_data_words);
|
656
|
+
check_memory_size(ptr - base + (compressed.table_data_words * sizeof(uint32_t)), size);
|
657
|
+
ptr += copy_from_mem(ptr, compressed.table_data.data(), compressed.table_data_words * sizeof(uint32_t));
|
658
|
+
}
|
659
|
+
if (!has_window) compressed.table_num_entries = num_coupons;
|
660
|
+
}
|
661
|
+
if (ptr != static_cast<const char*>(bytes) + size) throw std::logic_error("deserialized size mismatch");
|
662
|
+
|
663
|
+
uint8_t expected_preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
|
664
|
+
if (preamble_ints != expected_preamble_ints) {
|
665
|
+
throw std::invalid_argument("Possible corruption: preamble ints: expected "
|
666
|
+
+ std::to_string(expected_preamble_ints) + ", got " + std::to_string(preamble_ints));
|
667
|
+
}
|
668
|
+
if (serial_version != SERIAL_VERSION) {
|
669
|
+
throw std::invalid_argument("Possible corruption: serial version: expected "
|
670
|
+
+ std::to_string(SERIAL_VERSION) + ", got " + std::to_string(serial_version));
|
671
|
+
}
|
672
|
+
if (family_id != FAMILY) {
|
673
|
+
throw std::invalid_argument("Possible corruption: family: expected "
|
674
|
+
+ std::to_string(FAMILY) + ", got " + std::to_string(family_id));
|
675
|
+
}
|
676
|
+
if (seed_hash != compute_seed_hash(seed)) {
|
677
|
+
throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
|
678
|
+
+ std::to_string(compute_seed_hash(seed)));
|
679
|
+
}
|
680
|
+
uncompressed_state<A> uncompressed;
|
681
|
+
get_compressor<A>().uncompress(compressed, uncompressed, lg_k, num_coupons);
|
682
|
+
return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(uncompressed.table),
|
683
|
+
std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
|
684
|
+
}
|
685
|
+
|
686
|
+
template<typename A>
|
687
|
+
uint32_t cpc_sketch_alloc<A>::get_num_coupons() const {
|
688
|
+
return num_coupons;
|
689
|
+
}
|
690
|
+
|
691
|
+
template<typename A>
|
692
|
+
bool cpc_sketch_alloc<A>::validate() const {
|
693
|
+
vector_u64<A> bit_matrix = build_bit_matrix();
|
694
|
+
const uint64_t num_bits_set = count_bits_set_in_matrix(bit_matrix.data(), 1 << lg_k);
|
695
|
+
return num_bits_set == num_coupons;
|
696
|
+
}
|
697
|
+
|
698
|
+
template<typename A>
|
699
|
+
cpc_sketch_alloc<A>::cpc_sketch_alloc(uint8_t lg_k, uint32_t num_coupons, uint8_t first_interesting_column,
|
700
|
+
u32_table<A>&& table, vector_u8<A>&& window, bool has_hip, double kxp, double hip_est_accum, uint64_t seed):
|
701
|
+
lg_k(lg_k),
|
702
|
+
seed(seed),
|
703
|
+
was_merged(!has_hip),
|
704
|
+
num_coupons(num_coupons),
|
705
|
+
surprising_value_table(std::move(table)),
|
706
|
+
sliding_window(std::move(window)),
|
707
|
+
window_offset(determine_correct_offset(lg_k, num_coupons)),
|
708
|
+
first_interesting_column(first_interesting_column),
|
709
|
+
kxp(kxp),
|
710
|
+
hip_est_accum(hip_est_accum)
|
711
|
+
{}
|
712
|
+
|
713
|
+
template<typename A>
|
714
|
+
uint8_t cpc_sketch_alloc<A>::get_preamble_ints(uint32_t num_coupons, bool has_hip, bool has_table, bool has_window) {
|
715
|
+
uint8_t preamble_ints = 2;
|
716
|
+
if (num_coupons > 0) {
|
717
|
+
preamble_ints += 1; // number of coupons
|
718
|
+
if (has_hip) {
|
719
|
+
preamble_ints += 4; // HIP
|
720
|
+
}
|
721
|
+
if (has_table) {
|
722
|
+
preamble_ints += 1; // table data length
|
723
|
+
// number of values (if there is no window it is the same as number of coupons)
|
724
|
+
if (has_window) {
|
725
|
+
preamble_ints += 1;
|
726
|
+
}
|
727
|
+
}
|
728
|
+
if (has_window) {
|
729
|
+
preamble_ints += 1; // window length
|
730
|
+
}
|
731
|
+
}
|
732
|
+
return preamble_ints;
|
733
|
+
}
|
734
|
+
|
735
|
+
template<typename A>
|
736
|
+
typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor() const {
|
737
|
+
return determine_flavor(lg_k, num_coupons);
|
738
|
+
}
|
739
|
+
|
740
|
+
template<typename A>
|
741
|
+
typename cpc_sketch_alloc<A>::flavor cpc_sketch_alloc<A>::determine_flavor(uint8_t lg_k, uint64_t c) {
|
742
|
+
const uint64_t k = 1 << lg_k;
|
743
|
+
const uint64_t c2 = c << 1;
|
744
|
+
const uint64_t c8 = c << 3;
|
745
|
+
const uint64_t c32 = c << 5;
|
746
|
+
if (c == 0) return EMPTY; // 0 == C < 1
|
747
|
+
if (c32 < 3 * k) return SPARSE; // 1 <= C < 3K/32
|
748
|
+
if (c2 < k) return HYBRID; // 3K/32 <= C < K/2
|
749
|
+
if (c8 < 27 * k) return PINNED; // K/2 <= C < 27K/8
|
750
|
+
else return SLIDING; // 27K/8 <= C
|
751
|
+
}
|
752
|
+
|
753
|
+
template<typename A>
|
754
|
+
uint8_t cpc_sketch_alloc<A>::determine_correct_offset(uint8_t lg_k, uint64_t c) {
|
755
|
+
const uint64_t k = 1 << lg_k;
|
756
|
+
const int64_t tmp = static_cast<int64_t>(c << 3) - static_cast<int64_t>(19 * k); // 8C - 19K
|
757
|
+
if (tmp < 0) return 0;
|
758
|
+
return tmp >> (lg_k + 3); // tmp / 8K
|
759
|
+
}
|
760
|
+
|
761
|
+
template<typename A>
|
762
|
+
vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
|
763
|
+
const size_t k = 1 << lg_k;
|
764
|
+
if (window_offset > 56) throw std::logic_error("offset > 56");
|
765
|
+
|
766
|
+
// Fill the matrix with default rows in which the "early zone" is filled with ones.
|
767
|
+
// This is essential for the routine's O(k) time cost (as opposed to O(C)).
|
768
|
+
const uint64_t default_row = (static_cast<uint64_t>(1) << window_offset) - 1;
|
769
|
+
vector_u64<A> matrix(k, default_row);
|
770
|
+
|
771
|
+
if (num_coupons == 0) return matrix;
|
772
|
+
|
773
|
+
if (sliding_window.size() > 0) { // In other words, we are in window mode, not sparse mode
|
774
|
+
for (size_t i = 0; i < k; i++) { // set the window bits, trusting the sketch's current offset
|
775
|
+
matrix[i] |= static_cast<uint64_t>(sliding_window[i]) << window_offset;
|
776
|
+
}
|
777
|
+
}
|
778
|
+
|
779
|
+
const uint32_t* slots = surprising_value_table.get_slots();
|
780
|
+
const size_t num_slots = 1 << surprising_value_table.get_lg_size();
|
781
|
+
for (size_t i = 0; i < num_slots; i++) {
|
782
|
+
const uint32_t row_col = slots[i];
|
783
|
+
if (row_col != UINT32_MAX) {
|
784
|
+
const uint8_t col = row_col & 63;
|
785
|
+
const size_t row = row_col >> 6;
|
786
|
+
// Flip the specified matrix bit from its default value.
|
787
|
+
// In the "early" zone the bit changes from 1 to 0.
|
788
|
+
// In the "late" zone the bit changes from 0 to 1.
|
789
|
+
matrix[row] ^= static_cast<uint64_t>(1) << col;
|
790
|
+
}
|
791
|
+
}
|
792
|
+
return matrix;
|
793
|
+
}
|
794
|
+
|
795
|
+
template<typename A>
|
796
|
+
void cpc_sketch_alloc<A>::write_hip(std::ostream& os) const {
|
797
|
+
os.write(reinterpret_cast<const char*>(&kxp), sizeof(kxp));
|
798
|
+
os.write(reinterpret_cast<const char*>(&hip_est_accum), sizeof(hip_est_accum));
|
799
|
+
}
|
800
|
+
|
801
|
+
template<typename A>
|
802
|
+
size_t cpc_sketch_alloc<A>::copy_hip_to_mem(void* dst) const {
|
803
|
+
memcpy(dst, &kxp, sizeof(kxp));
|
804
|
+
memcpy(static_cast<char*>(dst) + sizeof(kxp), &hip_est_accum, sizeof(hip_est_accum));
|
805
|
+
return sizeof(kxp) + sizeof(hip_est_accum);
|
806
|
+
}
|
807
|
+
|
808
|
+
} /* namespace datasketches */
|
809
|
+
|
810
|
+
#endif
|