datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,939 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef THETA_SKETCH_IMPL_HPP_
|
|
21
|
+
#define THETA_SKETCH_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <algorithm>
|
|
24
|
+
#include <cmath>
|
|
25
|
+
#include <memory>
|
|
26
|
+
#include <functional>
|
|
27
|
+
#include <istream>
|
|
28
|
+
#include <ostream>
|
|
29
|
+
#include <sstream>
|
|
30
|
+
|
|
31
|
+
#include "MurmurHash3.h"
|
|
32
|
+
#include "serde.hpp"
|
|
33
|
+
#include "binomial_bounds.hpp"
|
|
34
|
+
#include "memory_operations.hpp"
|
|
35
|
+
|
|
36
|
+
namespace datasketches {
|
|
37
|
+
|
|
38
|
+
/*
|
|
39
|
+
* author Alexander Saydakov
|
|
40
|
+
* author Lee Rhodes
|
|
41
|
+
* author Kevin Lang
|
|
42
|
+
*/
|
|
43
|
+
|
|
44
|
+
template<typename A>
|
|
45
|
+
theta_sketch_alloc<A>::theta_sketch_alloc(bool is_empty, uint64_t theta):
|
|
46
|
+
is_empty_(is_empty), theta_(theta)
|
|
47
|
+
{}
|
|
48
|
+
|
|
49
|
+
template<typename A>
|
|
50
|
+
bool theta_sketch_alloc<A>::is_empty() const {
|
|
51
|
+
return is_empty_;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
template<typename A>
|
|
55
|
+
double theta_sketch_alloc<A>::get_estimate() const {
|
|
56
|
+
return get_num_retained() / get_theta();
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
template<typename A>
|
|
60
|
+
double theta_sketch_alloc<A>::get_lower_bound(uint8_t num_std_devs) const {
|
|
61
|
+
if (!is_estimation_mode()) return get_num_retained();
|
|
62
|
+
return binomial_bounds::get_lower_bound(get_num_retained(), get_theta(), num_std_devs);
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
template<typename A>
|
|
66
|
+
double theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
|
|
67
|
+
if (!is_estimation_mode()) return get_num_retained();
|
|
68
|
+
return binomial_bounds::get_upper_bound(get_num_retained(), get_theta(), num_std_devs);
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
template<typename A>
|
|
72
|
+
bool theta_sketch_alloc<A>::is_estimation_mode() const {
|
|
73
|
+
return theta_ < MAX_THETA && !is_empty_;
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
template<typename A>
|
|
77
|
+
double theta_sketch_alloc<A>::get_theta() const {
|
|
78
|
+
return (double) theta_ / MAX_THETA;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
template<typename A>
|
|
82
|
+
uint64_t theta_sketch_alloc<A>::get_theta64() const {
|
|
83
|
+
return theta_;
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
template<typename A>
|
|
87
|
+
typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
|
|
88
|
+
uint8_t preamble_longs;
|
|
89
|
+
is.read((char*)&preamble_longs, sizeof(preamble_longs));
|
|
90
|
+
uint8_t serial_version;
|
|
91
|
+
is.read((char*)&serial_version, sizeof(serial_version));
|
|
92
|
+
uint8_t type;
|
|
93
|
+
is.read((char*)&type, sizeof(type));
|
|
94
|
+
uint8_t lg_nom_size;
|
|
95
|
+
is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
|
|
96
|
+
uint8_t lg_cur_size;
|
|
97
|
+
is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
|
|
98
|
+
uint8_t flags_byte;
|
|
99
|
+
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
100
|
+
uint16_t seed_hash;
|
|
101
|
+
is.read((char*)&seed_hash, sizeof(seed_hash));
|
|
102
|
+
|
|
103
|
+
check_serial_version(serial_version, SERIAL_VERSION);
|
|
104
|
+
|
|
105
|
+
if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
|
|
106
|
+
check_seed_hash(seed_hash, get_seed_hash(seed));
|
|
107
|
+
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
|
|
108
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
|
|
109
|
+
return unique_ptr(
|
|
110
|
+
static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(update_theta_sketch_alloc<A>::internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed))),
|
|
111
|
+
[](theta_sketch_alloc<A>* ptr) {
|
|
112
|
+
ptr->~theta_sketch_alloc();
|
|
113
|
+
AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
|
|
114
|
+
}
|
|
115
|
+
);
|
|
116
|
+
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
|
|
117
|
+
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
118
|
+
if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
|
|
119
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
|
|
120
|
+
return unique_ptr(
|
|
121
|
+
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
|
|
122
|
+
[](theta_sketch_alloc<A>* ptr) {
|
|
123
|
+
ptr->~theta_sketch_alloc();
|
|
124
|
+
AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
|
|
125
|
+
}
|
|
126
|
+
);
|
|
127
|
+
}
|
|
128
|
+
throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
template<typename A>
|
|
132
|
+
typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
|
|
133
|
+
ensure_minimum_memory(size, static_cast<size_t>(8));
|
|
134
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
135
|
+
uint8_t preamble_longs;
|
|
136
|
+
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
137
|
+
uint8_t serial_version;
|
|
138
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
|
139
|
+
uint8_t type;
|
|
140
|
+
ptr += copy_from_mem(ptr, &type, sizeof(type));
|
|
141
|
+
uint8_t lg_nom_size;
|
|
142
|
+
ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
|
|
143
|
+
uint8_t lg_cur_size;
|
|
144
|
+
ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
|
|
145
|
+
uint8_t flags_byte;
|
|
146
|
+
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
|
147
|
+
uint16_t seed_hash;
|
|
148
|
+
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
|
|
149
|
+
|
|
150
|
+
check_serial_version(serial_version, SERIAL_VERSION);
|
|
151
|
+
|
|
152
|
+
if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
|
|
153
|
+
check_seed_hash(seed_hash, get_seed_hash(seed));
|
|
154
|
+
typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
|
|
155
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
|
|
156
|
+
return unique_ptr(
|
|
157
|
+
static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(
|
|
158
|
+
update_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed))
|
|
159
|
+
),
|
|
160
|
+
[](theta_sketch_alloc<A>* ptr) {
|
|
161
|
+
ptr->~theta_sketch_alloc();
|
|
162
|
+
AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
|
|
163
|
+
}
|
|
164
|
+
);
|
|
165
|
+
} else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
|
|
166
|
+
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
167
|
+
if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
|
|
168
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
|
|
169
|
+
return unique_ptr(
|
|
170
|
+
static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
|
|
171
|
+
compact_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash))
|
|
172
|
+
),
|
|
173
|
+
[](theta_sketch_alloc<A>* ptr) {
|
|
174
|
+
ptr->~theta_sketch_alloc();
|
|
175
|
+
AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
|
|
176
|
+
}
|
|
177
|
+
);
|
|
178
|
+
}
|
|
179
|
+
throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
template<typename A>
|
|
183
|
+
uint16_t theta_sketch_alloc<A>::get_seed_hash(uint64_t seed) {
|
|
184
|
+
HashState hashes;
|
|
185
|
+
MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
|
|
186
|
+
return hashes.h1;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
template<typename A>
|
|
190
|
+
void theta_sketch_alloc<A>::check_sketch_type(uint8_t actual, uint8_t expected) {
|
|
191
|
+
if (actual != expected) {
|
|
192
|
+
throw std::invalid_argument("Sketch type mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
template<typename A>
|
|
197
|
+
void theta_sketch_alloc<A>::check_serial_version(uint8_t actual, uint8_t expected) {
|
|
198
|
+
if (actual != expected) {
|
|
199
|
+
throw std::invalid_argument("Sketch serial version mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
template<typename A>
|
|
204
|
+
void theta_sketch_alloc<A>::check_seed_hash(uint16_t actual, uint16_t expected) {
|
|
205
|
+
if (actual != expected) {
|
|
206
|
+
throw std::invalid_argument("Sketch seed hash mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual));
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// update sketch
|
|
211
|
+
|
|
212
|
+
template<typename A>
|
|
213
|
+
update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed):
|
|
214
|
+
theta_sketch_alloc<A>(true, theta_sketch_alloc<A>::MAX_THETA),
|
|
215
|
+
lg_cur_size_(lg_cur_size),
|
|
216
|
+
lg_nom_size_(lg_nom_size),
|
|
217
|
+
keys_(1 << lg_cur_size_, 0),
|
|
218
|
+
num_keys_(0),
|
|
219
|
+
rf_(rf),
|
|
220
|
+
p_(p),
|
|
221
|
+
seed_(seed),
|
|
222
|
+
capacity_(get_capacity(lg_cur_size, lg_nom_size))
|
|
223
|
+
{
|
|
224
|
+
if (p < 1) this->theta_ *= p;
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
template<typename A>
|
|
228
|
+
update_theta_sketch_alloc<A>::update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed):
|
|
229
|
+
theta_sketch_alloc<A>(is_empty, theta),
|
|
230
|
+
lg_cur_size_(lg_cur_size),
|
|
231
|
+
lg_nom_size_(lg_nom_size),
|
|
232
|
+
keys_(std::move(keys)),
|
|
233
|
+
num_keys_(num_keys),
|
|
234
|
+
rf_(rf),
|
|
235
|
+
p_(p),
|
|
236
|
+
seed_(seed),
|
|
237
|
+
capacity_(get_capacity(lg_cur_size, lg_nom_size))
|
|
238
|
+
{}
|
|
239
|
+
|
|
240
|
+
template<typename A>
|
|
241
|
+
uint32_t update_theta_sketch_alloc<A>::get_num_retained() const {
|
|
242
|
+
return num_keys_;
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
template<typename A>
|
|
246
|
+
uint16_t update_theta_sketch_alloc<A>::get_seed_hash() const {
|
|
247
|
+
return theta_sketch_alloc<A>::get_seed_hash(seed_);
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
template<typename A>
|
|
251
|
+
bool update_theta_sketch_alloc<A>::is_ordered() const {
|
|
252
|
+
return false;
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
template<typename A>
|
|
256
|
+
string<A> update_theta_sketch_alloc<A>::to_string(bool print_items) const {
|
|
257
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
|
258
|
+
os << "### Update Theta sketch summary:" << std::endl;
|
|
259
|
+
os << " lg nominal size : " << (int) lg_nom_size_ << std::endl;
|
|
260
|
+
os << " lg current size : " << (int) lg_cur_size_ << std::endl;
|
|
261
|
+
os << " num retained keys : " << num_keys_ << std::endl;
|
|
262
|
+
os << " resize factor : " << (1 << rf_) << std::endl;
|
|
263
|
+
os << " sampling probability : " << p_ << std::endl;
|
|
264
|
+
os << " seed hash : " << this->get_seed_hash() << std::endl;
|
|
265
|
+
os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
|
|
266
|
+
os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
|
|
267
|
+
os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
|
|
268
|
+
os << " theta (fraction) : " << this->get_theta() << std::endl;
|
|
269
|
+
os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
|
|
270
|
+
os << " estimate : " << this->get_estimate() << std::endl;
|
|
271
|
+
os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
|
|
272
|
+
os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
|
|
273
|
+
os << "### End sketch summary" << std::endl;
|
|
274
|
+
if (print_items) {
|
|
275
|
+
os << "### Retained keys" << std::endl;
|
|
276
|
+
for (auto key: *this) os << " " << key << std::endl;
|
|
277
|
+
os << "### End retained keys" << std::endl;
|
|
278
|
+
}
|
|
279
|
+
return os.str();
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
template<typename A>
|
|
283
|
+
void update_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
284
|
+
const uint8_t preamble_longs_and_rf = 3 | (rf_ << 6);
|
|
285
|
+
os.write((char*)&preamble_longs_and_rf, sizeof(preamble_longs_and_rf));
|
|
286
|
+
const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
|
|
287
|
+
os.write((char*)&serial_version, sizeof(serial_version));
|
|
288
|
+
const uint8_t type = SKETCH_TYPE;
|
|
289
|
+
os.write((char*)&type, sizeof(type));
|
|
290
|
+
os.write((char*)&lg_nom_size_, sizeof(lg_nom_size_));
|
|
291
|
+
os.write((char*)&lg_cur_size_, sizeof(lg_cur_size_));
|
|
292
|
+
const uint8_t flags_byte(
|
|
293
|
+
(this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
|
|
294
|
+
);
|
|
295
|
+
os.write((char*)&flags_byte, sizeof(flags_byte));
|
|
296
|
+
const uint16_t seed_hash = get_seed_hash();
|
|
297
|
+
os.write((char*)&seed_hash, sizeof(seed_hash));
|
|
298
|
+
os.write((char*)&num_keys_, sizeof(num_keys_));
|
|
299
|
+
os.write((char*)&p_, sizeof(p_));
|
|
300
|
+
os.write((char*)&(this->theta_), sizeof(uint64_t));
|
|
301
|
+
os.write((char*)keys_.data(), sizeof(uint64_t) * keys_.size());
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
template<typename A>
|
|
305
|
+
vector_u8<A> update_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
|
|
306
|
+
const uint8_t preamble_longs = 3;
|
|
307
|
+
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * keys_.size();
|
|
308
|
+
vector_u8<A> bytes(size);
|
|
309
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
310
|
+
|
|
311
|
+
const uint8_t preamble_longs_and_rf = preamble_longs | (rf_ << 6);
|
|
312
|
+
ptr += copy_to_mem(&preamble_longs_and_rf, ptr, sizeof(preamble_longs_and_rf));
|
|
313
|
+
const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
|
|
314
|
+
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
|
|
315
|
+
const uint8_t type = SKETCH_TYPE;
|
|
316
|
+
ptr += copy_to_mem(&type, ptr, sizeof(type));
|
|
317
|
+
ptr += copy_to_mem(&lg_nom_size_, ptr, sizeof(lg_nom_size_));
|
|
318
|
+
ptr += copy_to_mem(&lg_cur_size_, ptr, sizeof(lg_cur_size_));
|
|
319
|
+
const uint8_t flags_byte(
|
|
320
|
+
(this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
|
|
321
|
+
);
|
|
322
|
+
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
|
|
323
|
+
const uint16_t seed_hash = get_seed_hash();
|
|
324
|
+
ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
|
|
325
|
+
ptr += copy_to_mem(&num_keys_, ptr, sizeof(num_keys_));
|
|
326
|
+
ptr += copy_to_mem(&p_, ptr, sizeof(p_));
|
|
327
|
+
ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
|
|
328
|
+
ptr += copy_to_mem(keys_.data(), ptr, sizeof(uint64_t) * keys_.size());
|
|
329
|
+
|
|
330
|
+
return bytes;
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
template<typename A>
|
|
334
|
+
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
|
|
335
|
+
uint8_t preamble_longs;
|
|
336
|
+
is.read((char*)&preamble_longs, sizeof(preamble_longs));
|
|
337
|
+
resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
|
|
338
|
+
preamble_longs &= 0x3f; // remove resize factor
|
|
339
|
+
uint8_t serial_version;
|
|
340
|
+
is.read((char*)&serial_version, sizeof(serial_version));
|
|
341
|
+
uint8_t type;
|
|
342
|
+
is.read((char*)&type, sizeof(type));
|
|
343
|
+
uint8_t lg_nom_size;
|
|
344
|
+
is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
|
|
345
|
+
uint8_t lg_cur_size;
|
|
346
|
+
is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
|
|
347
|
+
uint8_t flags_byte;
|
|
348
|
+
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
349
|
+
uint16_t seed_hash;
|
|
350
|
+
is.read((char*)&seed_hash, sizeof(seed_hash));
|
|
351
|
+
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
|
|
352
|
+
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
|
|
353
|
+
theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
|
|
354
|
+
return internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed);
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
template<typename A>
|
|
358
|
+
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed) {
|
|
359
|
+
uint32_t num_keys;
|
|
360
|
+
is.read((char*)&num_keys, sizeof(num_keys));
|
|
361
|
+
float p;
|
|
362
|
+
is.read((char*)&p, sizeof(p));
|
|
363
|
+
uint64_t theta;
|
|
364
|
+
is.read((char*)&theta, sizeof(theta));
|
|
365
|
+
vector_u64<A> keys(1 << lg_cur_size);
|
|
366
|
+
is.read((char*)keys.data(), sizeof(uint64_t) * keys.size());
|
|
367
|
+
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
368
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
369
|
+
return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
template<typename A>
|
|
373
|
+
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
|
|
374
|
+
ensure_minimum_memory(size, 8);
|
|
375
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
376
|
+
uint8_t preamble_longs;
|
|
377
|
+
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
378
|
+
resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
|
|
379
|
+
preamble_longs &= 0x3f; // remove resize factor
|
|
380
|
+
uint8_t serial_version;
|
|
381
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
|
382
|
+
uint8_t type;
|
|
383
|
+
ptr += copy_from_mem(ptr, &type, sizeof(type));
|
|
384
|
+
uint8_t lg_nom_size;
|
|
385
|
+
ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
|
|
386
|
+
uint8_t lg_cur_size;
|
|
387
|
+
ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
|
|
388
|
+
uint8_t flags_byte;
|
|
389
|
+
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
|
390
|
+
uint16_t seed_hash;
|
|
391
|
+
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
|
|
392
|
+
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
|
|
393
|
+
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
|
|
394
|
+
theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
|
|
395
|
+
return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed);
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
template<typename A>
|
|
399
|
+
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed) {
|
|
400
|
+
const uint32_t table_size = 1 << lg_cur_size;
|
|
401
|
+
ensure_minimum_memory(size, 16 + sizeof(uint64_t) * table_size);
|
|
402
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
403
|
+
uint32_t num_keys;
|
|
404
|
+
ptr += copy_from_mem(ptr, &num_keys, sizeof(num_keys));
|
|
405
|
+
float p;
|
|
406
|
+
ptr += copy_from_mem(ptr, &p, sizeof(p));
|
|
407
|
+
uint64_t theta;
|
|
408
|
+
ptr += copy_from_mem(ptr, &theta, sizeof(theta));
|
|
409
|
+
vector_u64<A> keys(table_size);
|
|
410
|
+
ptr += copy_from_mem(ptr, keys.data(), sizeof(uint64_t) * table_size);
|
|
411
|
+
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
412
|
+
return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
template<typename A>
|
|
416
|
+
void update_theta_sketch_alloc<A>::update(const std::string& value) {
|
|
417
|
+
if (value.empty()) return;
|
|
418
|
+
update(value.c_str(), value.length());
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
template<typename A>
|
|
422
|
+
void update_theta_sketch_alloc<A>::update(uint64_t value) {
|
|
423
|
+
update(&value, sizeof(value));
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
template<typename A>
|
|
427
|
+
void update_theta_sketch_alloc<A>::update(int64_t value) {
|
|
428
|
+
update(&value, sizeof(value));
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
template<typename A>
|
|
432
|
+
void update_theta_sketch_alloc<A>::update(uint32_t value) {
|
|
433
|
+
update(static_cast<int32_t>(value));
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
template<typename A>
|
|
437
|
+
void update_theta_sketch_alloc<A>::update(int32_t value) {
|
|
438
|
+
update(static_cast<int64_t>(value));
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
template<typename A>
|
|
442
|
+
void update_theta_sketch_alloc<A>::update(uint16_t value) {
|
|
443
|
+
update(static_cast<int16_t>(value));
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
template<typename A>
|
|
447
|
+
void update_theta_sketch_alloc<A>::update(int16_t value) {
|
|
448
|
+
update(static_cast<int64_t>(value));
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
template<typename A>
|
|
452
|
+
void update_theta_sketch_alloc<A>::update(uint8_t value) {
|
|
453
|
+
update(static_cast<int8_t>(value));
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
template<typename A>
|
|
457
|
+
void update_theta_sketch_alloc<A>::update(int8_t value) {
|
|
458
|
+
update(static_cast<int64_t>(value));
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
template<typename A>
|
|
462
|
+
void update_theta_sketch_alloc<A>::update(double value) {
|
|
463
|
+
union {
|
|
464
|
+
int64_t long_value;
|
|
465
|
+
double double_value;
|
|
466
|
+
} long_double_union;
|
|
467
|
+
|
|
468
|
+
if (value == 0.0) {
|
|
469
|
+
long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
|
|
470
|
+
} else if (std::isnan(value)) {
|
|
471
|
+
long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
|
|
472
|
+
} else {
|
|
473
|
+
long_double_union.double_value = value;
|
|
474
|
+
}
|
|
475
|
+
update(&long_double_union, sizeof(long_double_union));
|
|
476
|
+
}
|
|
477
|
+
|
|
478
|
+
template<typename A>
|
|
479
|
+
void update_theta_sketch_alloc<A>::update(float value) {
|
|
480
|
+
update(static_cast<double>(value));
|
|
481
|
+
}
|
|
482
|
+
|
|
483
|
+
template<typename A>
|
|
484
|
+
void update_theta_sketch_alloc<A>::update(const void* data, unsigned length) {
|
|
485
|
+
HashState hashes;
|
|
486
|
+
MurmurHash3_x64_128(data, length, seed_, hashes);
|
|
487
|
+
const uint64_t hash = hashes.h1 >> 1; // Java implementation does logical shift >>> to make values positive
|
|
488
|
+
internal_update(hash);
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
template<typename A>
|
|
492
|
+
compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered) const {
|
|
493
|
+
return compact_theta_sketch_alloc<A>(*this, ordered);
|
|
494
|
+
}
|
|
495
|
+
|
|
496
|
+
template<typename A>
|
|
497
|
+
void update_theta_sketch_alloc<A>::internal_update(uint64_t hash) {
|
|
498
|
+
this->is_empty_ = false;
|
|
499
|
+
if (hash >= this->theta_ || hash == 0) return; // hash == 0 is reserved to mark empty slots in the table
|
|
500
|
+
if (hash_search_or_insert(hash, keys_.data(), lg_cur_size_)) {
|
|
501
|
+
num_keys_++;
|
|
502
|
+
if (num_keys_ > capacity_) {
|
|
503
|
+
if (lg_cur_size_ <= lg_nom_size_) {
|
|
504
|
+
resize();
|
|
505
|
+
} else {
|
|
506
|
+
rebuild();
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
|
|
512
|
+
template<typename A>
|
|
513
|
+
void update_theta_sketch_alloc<A>::trim() {
|
|
514
|
+
if (num_keys_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
|
|
515
|
+
}
|
|
516
|
+
|
|
517
|
+
template<typename A>
|
|
518
|
+
void update_theta_sketch_alloc<A>::resize() {
|
|
519
|
+
const uint8_t lg_tgt_size = lg_nom_size_ + 1;
|
|
520
|
+
const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
|
|
521
|
+
const uint8_t lg_new_size = lg_cur_size_ + factor;
|
|
522
|
+
const uint32_t new_size = 1 << lg_new_size;
|
|
523
|
+
vector_u64<A> new_keys(new_size, 0);
|
|
524
|
+
for (uint32_t i = 0; i < keys_.size(); i++) {
|
|
525
|
+
if (keys_[i] != 0) {
|
|
526
|
+
hash_search_or_insert(keys_[i], new_keys.data(), lg_new_size); // TODO hash_insert
|
|
527
|
+
}
|
|
528
|
+
}
|
|
529
|
+
keys_ = std::move(new_keys);
|
|
530
|
+
lg_cur_size_ += factor;
|
|
531
|
+
capacity_ = get_capacity(lg_cur_size_, lg_nom_size_);
|
|
532
|
+
}
|
|
533
|
+
|
|
534
|
+
template<typename A>
|
|
535
|
+
void update_theta_sketch_alloc<A>::rebuild() {
|
|
536
|
+
const uint32_t pivot = (1 << lg_nom_size_) + keys_.size() - num_keys_;
|
|
537
|
+
std::nth_element(keys_.begin(), keys_.begin() + pivot, keys_.end());
|
|
538
|
+
this->theta_ = keys_[pivot];
|
|
539
|
+
vector_u64<A> new_keys(keys_.size(), 0);
|
|
540
|
+
num_keys_ = 0;
|
|
541
|
+
for (uint32_t i = 0; i < keys_.size(); i++) {
|
|
542
|
+
if (keys_[i] != 0 && keys_[i] < this->theta_) {
|
|
543
|
+
hash_search_or_insert(keys_[i], new_keys.data(), lg_cur_size_); // TODO hash_insert
|
|
544
|
+
num_keys_++;
|
|
545
|
+
}
|
|
546
|
+
}
|
|
547
|
+
keys_ = std::move(new_keys);
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
template<typename A>
|
|
551
|
+
uint32_t update_theta_sketch_alloc<A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
|
|
552
|
+
const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
|
|
553
|
+
return std::floor(fraction * (1 << lg_cur_size));
|
|
554
|
+
}
|
|
555
|
+
|
|
556
|
+
template<typename A>
|
|
557
|
+
uint32_t update_theta_sketch_alloc<A>::get_stride(uint64_t hash, uint8_t lg_size) {
|
|
558
|
+
// odd and independent of index assuming lg_size lowest bits of the hash were used for the index
|
|
559
|
+
return (2 * static_cast<uint32_t>((hash >> lg_size) & STRIDE_MASK)) + 1;
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
template<typename A>
|
|
563
|
+
bool update_theta_sketch_alloc<A>::hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size) {
|
|
564
|
+
const uint32_t mask = (1 << lg_size) - 1;
|
|
565
|
+
const uint32_t stride = get_stride(hash, lg_size);
|
|
566
|
+
uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
|
|
567
|
+
|
|
568
|
+
// search for duplicate or zero
|
|
569
|
+
const uint32_t loop_index = cur_probe;
|
|
570
|
+
do {
|
|
571
|
+
const uint64_t value = table[cur_probe];
|
|
572
|
+
if (value == 0) {
|
|
573
|
+
table[cur_probe] = hash; // insert value
|
|
574
|
+
return true;
|
|
575
|
+
} else if (value == hash) {
|
|
576
|
+
return false; // found a duplicate
|
|
577
|
+
}
|
|
578
|
+
cur_probe = (cur_probe + stride) & mask;
|
|
579
|
+
} while (cur_probe != loop_index);
|
|
580
|
+
throw std::logic_error("key not found and no empty slots!");
|
|
581
|
+
}
|
|
582
|
+
|
|
583
|
+
template<typename A>
|
|
584
|
+
bool update_theta_sketch_alloc<A>::hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size) {
|
|
585
|
+
const uint32_t mask = (1 << lg_size) - 1;
|
|
586
|
+
const uint32_t stride = update_theta_sketch_alloc<A>::get_stride(hash, lg_size);
|
|
587
|
+
uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
|
|
588
|
+
const uint32_t loop_index = cur_probe;
|
|
589
|
+
do {
|
|
590
|
+
const uint64_t value = table[cur_probe];
|
|
591
|
+
if (value == 0) {
|
|
592
|
+
return false;
|
|
593
|
+
} else if (value == hash) {
|
|
594
|
+
return true;
|
|
595
|
+
}
|
|
596
|
+
cur_probe = (cur_probe + stride) & mask;
|
|
597
|
+
} while (cur_probe != loop_index);
|
|
598
|
+
throw std::logic_error("key not found and search wrapped");
|
|
599
|
+
}
|
|
600
|
+
|
|
601
|
+
template<typename A>
|
|
602
|
+
typename theta_sketch_alloc<A>::const_iterator update_theta_sketch_alloc<A>::begin() const {
|
|
603
|
+
return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), 0);
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
template<typename A>
|
|
607
|
+
typename theta_sketch_alloc<A>::const_iterator update_theta_sketch_alloc<A>::end() const {
|
|
608
|
+
return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), keys_.size());
|
|
609
|
+
}
|
|
610
|
+
|
|
611
|
+
// compact sketch
|
|
612
|
+
|
|
613
|
+
template<typename A>
|
|
614
|
+
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered):
|
|
615
|
+
theta_sketch_alloc<A>(is_empty, theta),
|
|
616
|
+
keys_(std::move(keys)),
|
|
617
|
+
seed_hash_(seed_hash),
|
|
618
|
+
is_ordered_(is_ordered)
|
|
619
|
+
{}
|
|
620
|
+
|
|
621
|
+
template<typename A>
|
|
622
|
+
compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered):
|
|
623
|
+
theta_sketch_alloc<A>(other),
|
|
624
|
+
keys_(other.get_num_retained()),
|
|
625
|
+
seed_hash_(other.get_seed_hash()),
|
|
626
|
+
is_ordered_(other.is_ordered() || ordered)
|
|
627
|
+
{
|
|
628
|
+
std::copy(other.begin(), other.end(), keys_.begin());
|
|
629
|
+
if (ordered && !other.is_ordered()) std::sort(keys_.begin(), keys_.end());
|
|
630
|
+
}
|
|
631
|
+
|
|
632
|
+
template<typename A>
|
|
633
|
+
uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
|
|
634
|
+
return keys_.size();
|
|
635
|
+
}
|
|
636
|
+
|
|
637
|
+
template<typename A>
|
|
638
|
+
uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
|
|
639
|
+
return seed_hash_;
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
template<typename A>
|
|
643
|
+
bool compact_theta_sketch_alloc<A>::is_ordered() const {
|
|
644
|
+
return is_ordered_;
|
|
645
|
+
}
|
|
646
|
+
|
|
647
|
+
template<typename A>
|
|
648
|
+
string<A> compact_theta_sketch_alloc<A>::to_string(bool print_items) const {
|
|
649
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
|
650
|
+
os << "### Compact Theta sketch summary:" << std::endl;
|
|
651
|
+
os << " num retained keys : " << keys_.size() << std::endl;
|
|
652
|
+
os << " seed hash : " << this->get_seed_hash() << std::endl;
|
|
653
|
+
os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
|
|
654
|
+
os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
|
|
655
|
+
os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
|
|
656
|
+
os << " theta (fraction) : " << this->get_theta() << std::endl;
|
|
657
|
+
os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
|
|
658
|
+
os << " estimate : " << this->get_estimate() << std::endl;
|
|
659
|
+
os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
|
|
660
|
+
os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
|
|
661
|
+
os << "### End sketch summary" << std::endl;
|
|
662
|
+
if (print_items) {
|
|
663
|
+
os << "### Retained keys" << std::endl;
|
|
664
|
+
for (auto key: *this) os << " " << key << std::endl;
|
|
665
|
+
os << "### End retained keys" << std::endl;
|
|
666
|
+
}
|
|
667
|
+
return os.str();
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
template<typename A>
|
|
671
|
+
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
|
|
672
|
+
const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
|
|
673
|
+
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
|
|
674
|
+
os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
|
|
675
|
+
const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
|
|
676
|
+
os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
|
|
677
|
+
const uint8_t type = SKETCH_TYPE;
|
|
678
|
+
os.write(reinterpret_cast<const char*>(&type), sizeof(type));
|
|
679
|
+
const uint16_t unused16 = 0;
|
|
680
|
+
os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
|
|
681
|
+
const uint8_t flags_byte(
|
|
682
|
+
(1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
|
|
683
|
+
(1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
|
|
684
|
+
(this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
|
|
685
|
+
(this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
|
|
686
|
+
);
|
|
687
|
+
os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
|
|
688
|
+
const uint16_t seed_hash = get_seed_hash();
|
|
689
|
+
os.write((char*)&seed_hash, sizeof(seed_hash));
|
|
690
|
+
if (!this->is_empty()) {
|
|
691
|
+
if (!is_single_item) {
|
|
692
|
+
const uint32_t num_keys = keys_.size();
|
|
693
|
+
os.write((char*)&num_keys, sizeof(num_keys));
|
|
694
|
+
const uint32_t unused32 = 0;
|
|
695
|
+
os.write((char*)&unused32, sizeof(unused32));
|
|
696
|
+
if (this->is_estimation_mode()) {
|
|
697
|
+
os.write((char*)&(this->theta_), sizeof(uint64_t));
|
|
698
|
+
}
|
|
699
|
+
}
|
|
700
|
+
os.write((char*)keys_.data(), sizeof(uint64_t) * keys_.size());
|
|
701
|
+
}
|
|
702
|
+
}
|
|
703
|
+
|
|
704
|
+
template<typename A>
|
|
705
|
+
vector_u8<A> compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
|
|
706
|
+
const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
|
|
707
|
+
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
|
|
708
|
+
const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * keys_.size();
|
|
709
|
+
vector_u8<A> bytes(size);
|
|
710
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
711
|
+
|
|
712
|
+
ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
|
|
713
|
+
const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
|
|
714
|
+
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
|
|
715
|
+
const uint8_t type = SKETCH_TYPE;
|
|
716
|
+
ptr += copy_to_mem(&type, ptr, sizeof(type));
|
|
717
|
+
const uint16_t unused16 = 0;
|
|
718
|
+
ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
|
|
719
|
+
const uint8_t flags_byte(
|
|
720
|
+
(1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
|
|
721
|
+
(1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
|
|
722
|
+
(this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
|
|
723
|
+
(this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
|
|
724
|
+
);
|
|
725
|
+
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
|
|
726
|
+
const uint16_t seed_hash = get_seed_hash();
|
|
727
|
+
ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
|
|
728
|
+
if (!this->is_empty()) {
|
|
729
|
+
if (!is_single_item) {
|
|
730
|
+
const uint32_t num_keys = keys_.size();
|
|
731
|
+
ptr += copy_to_mem(&num_keys, ptr, sizeof(num_keys));
|
|
732
|
+
const uint32_t unused32 = 0;
|
|
733
|
+
ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
|
|
734
|
+
if (this->is_estimation_mode()) {
|
|
735
|
+
ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
ptr += copy_to_mem(keys_.data(), ptr, sizeof(uint64_t) * keys_.size());
|
|
739
|
+
}
|
|
740
|
+
|
|
741
|
+
return bytes;
|
|
742
|
+
}
|
|
743
|
+
|
|
744
|
+
template<typename A>
|
|
745
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
|
|
746
|
+
uint8_t preamble_longs;
|
|
747
|
+
is.read((char*)&preamble_longs, sizeof(preamble_longs));
|
|
748
|
+
uint8_t serial_version;
|
|
749
|
+
is.read((char*)&serial_version, sizeof(serial_version));
|
|
750
|
+
uint8_t type;
|
|
751
|
+
is.read((char*)&type, sizeof(type));
|
|
752
|
+
uint16_t unused16;
|
|
753
|
+
is.read((char*)&unused16, sizeof(unused16));
|
|
754
|
+
uint8_t flags_byte;
|
|
755
|
+
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
756
|
+
uint16_t seed_hash;
|
|
757
|
+
is.read((char*)&seed_hash, sizeof(seed_hash));
|
|
758
|
+
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
|
|
759
|
+
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
|
|
760
|
+
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
761
|
+
if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
|
|
762
|
+
return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
template<typename A>
|
|
766
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
|
|
767
|
+
uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
|
|
768
|
+
uint32_t num_keys = 0;
|
|
769
|
+
|
|
770
|
+
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
771
|
+
if (!is_empty) {
|
|
772
|
+
if (preamble_longs == 1) {
|
|
773
|
+
num_keys = 1;
|
|
774
|
+
} else {
|
|
775
|
+
is.read((char*)&num_keys, sizeof(num_keys));
|
|
776
|
+
uint32_t unused32;
|
|
777
|
+
is.read((char*)&unused32, sizeof(unused32));
|
|
778
|
+
if (preamble_longs > 2) {
|
|
779
|
+
is.read((char*)&theta, sizeof(theta));
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
vector_u64<A> keys(num_keys);
|
|
784
|
+
if (!is_empty) is.read((char*)keys.data(), sizeof(uint64_t) * keys.size());
|
|
785
|
+
|
|
786
|
+
const bool is_ordered = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_ORDERED);
|
|
787
|
+
if (!is.good()) throw std::runtime_error("error reading from std::istream");
|
|
788
|
+
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash, is_ordered);
|
|
789
|
+
}
|
|
790
|
+
|
|
791
|
+
template<typename A>
|
|
792
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
|
|
793
|
+
ensure_minimum_memory(size, 8);
|
|
794
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
795
|
+
uint8_t preamble_longs;
|
|
796
|
+
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
797
|
+
uint8_t serial_version;
|
|
798
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
|
799
|
+
uint8_t type;
|
|
800
|
+
ptr += copy_from_mem(ptr, &type, sizeof(type));
|
|
801
|
+
uint16_t unused16;
|
|
802
|
+
ptr += copy_from_mem(ptr, &unused16, sizeof(unused16));
|
|
803
|
+
uint8_t flags_byte;
|
|
804
|
+
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
|
|
805
|
+
uint16_t seed_hash;
|
|
806
|
+
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
|
|
807
|
+
theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
|
|
808
|
+
theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
|
|
809
|
+
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
810
|
+
if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
|
|
811
|
+
return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
|
|
812
|
+
}
|
|
813
|
+
|
|
814
|
+
template<typename A>
|
|
815
|
+
compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
|
|
816
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
817
|
+
const char* base = ptr;
|
|
818
|
+
|
|
819
|
+
uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
|
|
820
|
+
uint32_t num_keys = 0;
|
|
821
|
+
|
|
822
|
+
const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
|
|
823
|
+
if (!is_empty) {
|
|
824
|
+
if (preamble_longs == 1) {
|
|
825
|
+
num_keys = 1;
|
|
826
|
+
} else {
|
|
827
|
+
ensure_minimum_memory(size, 8); // read the first prelong before this method
|
|
828
|
+
ptr += copy_from_mem(ptr, &num_keys, sizeof(num_keys));
|
|
829
|
+
uint32_t unused32;
|
|
830
|
+
ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
|
|
831
|
+
if (preamble_longs > 2) {
|
|
832
|
+
ensure_minimum_memory(size, (preamble_longs - 1) << 3);
|
|
833
|
+
ptr += copy_from_mem(ptr, &theta, sizeof(theta));
|
|
834
|
+
}
|
|
835
|
+
}
|
|
836
|
+
}
|
|
837
|
+
const size_t keys_size_bytes = sizeof(uint64_t) * num_keys;
|
|
838
|
+
check_memory_size(ptr - base + keys_size_bytes, size);
|
|
839
|
+
vector_u64<A> keys(num_keys);
|
|
840
|
+
if (!is_empty) ptr += copy_from_mem(ptr, keys.data(), keys_size_bytes);
|
|
841
|
+
|
|
842
|
+
const bool is_ordered = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_ORDERED);
|
|
843
|
+
return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash, is_ordered);
|
|
844
|
+
}
|
|
845
|
+
|
|
846
|
+
template<typename A>
|
|
847
|
+
typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::begin() const {
|
|
848
|
+
return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), 0);
|
|
849
|
+
}
|
|
850
|
+
|
|
851
|
+
template<typename A>
|
|
852
|
+
typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::end() const {
|
|
853
|
+
return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), keys_.size());
|
|
854
|
+
}
|
|
855
|
+
|
|
856
|
+
// builder
|
|
857
|
+
|
|
858
|
+
template<typename A>
|
|
859
|
+
update_theta_sketch_alloc<A>::builder::builder():
|
|
860
|
+
lg_k_(DEFAULT_LG_K), rf_(DEFAULT_RESIZE_FACTOR), p_(1), seed_(DEFAULT_SEED) {}
|
|
861
|
+
|
|
862
|
+
template<typename A>
|
|
863
|
+
typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
|
|
864
|
+
if (lg_k < MIN_LG_K) {
|
|
865
|
+
throw std::invalid_argument("lg_k must not be less than " + std::to_string(MIN_LG_K) + ": " + std::to_string(lg_k));
|
|
866
|
+
}
|
|
867
|
+
lg_k_ = lg_k;
|
|
868
|
+
return *this;
|
|
869
|
+
}
|
|
870
|
+
|
|
871
|
+
template<typename A>
|
|
872
|
+
typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_resize_factor(resize_factor rf) {
|
|
873
|
+
rf_ = rf;
|
|
874
|
+
return *this;
|
|
875
|
+
}
|
|
876
|
+
|
|
877
|
+
template<typename A>
|
|
878
|
+
typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_p(float p) {
|
|
879
|
+
p_ = p;
|
|
880
|
+
return *this;
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
template<typename A>
|
|
884
|
+
typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_seed(uint64_t seed) {
|
|
885
|
+
seed_ = seed;
|
|
886
|
+
return *this;
|
|
887
|
+
}
|
|
888
|
+
|
|
889
|
+
template<typename A>
|
|
890
|
+
uint8_t update_theta_sketch_alloc<A>::builder::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
|
|
891
|
+
return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
|
|
892
|
+
}
|
|
893
|
+
|
|
894
|
+
template<typename A>
|
|
895
|
+
update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
|
|
896
|
+
return update_theta_sketch_alloc<A>(starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_)), lg_k_, rf_, p_, seed_);
|
|
897
|
+
}
|
|
898
|
+
|
|
899
|
+
// iterator
|
|
900
|
+
|
|
901
|
+
template<typename A>
|
|
902
|
+
theta_sketch_alloc<A>::const_iterator::const_iterator(const uint64_t* keys, uint32_t size, uint32_t index):
|
|
903
|
+
keys_(keys), size_(size), index_(index) {
|
|
904
|
+
while (index_ < size_ && keys_[index_] == 0) ++index_;
|
|
905
|
+
}
|
|
906
|
+
|
|
907
|
+
template<typename A>
|
|
908
|
+
typename theta_sketch_alloc<A>::const_iterator& theta_sketch_alloc<A>::const_iterator::operator++() {
|
|
909
|
+
do {
|
|
910
|
+
++index_;
|
|
911
|
+
} while (index_ < size_ && keys_[index_] == 0);
|
|
912
|
+
return *this;
|
|
913
|
+
}
|
|
914
|
+
|
|
915
|
+
template<typename A>
|
|
916
|
+
typename theta_sketch_alloc<A>::const_iterator theta_sketch_alloc<A>::const_iterator::operator++(int) {
|
|
917
|
+
const_iterator tmp(*this);
|
|
918
|
+
operator++();
|
|
919
|
+
return tmp;
|
|
920
|
+
}
|
|
921
|
+
|
|
922
|
+
template<typename A>
|
|
923
|
+
bool theta_sketch_alloc<A>::const_iterator::operator==(const const_iterator& other) const {
|
|
924
|
+
return index_ == other.index_;
|
|
925
|
+
}
|
|
926
|
+
|
|
927
|
+
template<typename A>
|
|
928
|
+
bool theta_sketch_alloc<A>::const_iterator::operator!=(const const_iterator& other) const {
|
|
929
|
+
return index_ != other.index_;
|
|
930
|
+
}
|
|
931
|
+
|
|
932
|
+
template<typename A>
|
|
933
|
+
uint64_t theta_sketch_alloc<A>::const_iterator::operator*() const {
|
|
934
|
+
return keys_[index_];
|
|
935
|
+
}
|
|
936
|
+
|
|
937
|
+
} /* namespace datasketches */
|
|
938
|
+
|
|
939
|
+
#endif
|