datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef _VAR_OPT_UNION_HPP_
|
|
21
|
+
#define _VAR_OPT_UNION_HPP_
|
|
22
|
+
|
|
23
|
+
#include "var_opt_sketch.hpp"
|
|
24
|
+
#include "common_defs.hpp"
|
|
25
|
+
#include "serde.hpp"
|
|
26
|
+
|
|
27
|
+
#include <vector>
|
|
28
|
+
|
|
29
|
+
namespace datasketches {
|
|
30
|
+
|
|
31
|
+
// Allocator type obtained by rebinding allocator A so that it allocates uint8_t elements.
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Provides a unioning operation over var_opt_sketch objects. This union allows
|
|
35
|
+
* the sample size k to float, possibly increasing or decreasing as warranted by
|
|
36
|
+
* the available data.
|
|
37
|
+
*
|
|
38
|
+
* The union currently allows serialization and deserialization, even though transporting
|
|
39
|
+
* union objects seems to be an anti-pattern with most sketches. We currently provide it here
|
|
40
|
+
* because the get_result() call may need to discard samples and decrease k in order to
|
|
41
|
+
* return a valid sketch, even if future calls to update() would allow k to remain larger.
|
|
42
|
+
*
|
|
43
|
+
* The (de)serialization methods may be deprecated and subsequently removed in future versions.
|
|
44
|
+
*
|
|
45
|
+
* author Kevin Lang
|
|
46
|
+
* author Jon Malkin
|
|
47
|
+
*/
|
|
48
|
+
template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
|
|
49
|
+
class var_opt_union {
|
|
50
|
+
|
|
51
|
+
public:
|
|
52
|
+
static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
|
53
|
+
|
|
54
|
+
explicit var_opt_union(uint32_t max_k);
|
|
55
|
+
var_opt_union(const var_opt_union& other);
|
|
56
|
+
var_opt_union(var_opt_union&& other) noexcept;
|
|
57
|
+
|
|
58
|
+
~var_opt_union();
|
|
59
|
+
|
|
60
|
+
var_opt_union& operator=(const var_opt_union& other);
|
|
61
|
+
var_opt_union& operator=(var_opt_union&& other);
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Updates this union with the given sketch
|
|
65
|
+
* This method takes an lvalue.
|
|
66
|
+
* @param sk a sketch to add to the union
|
|
67
|
+
*/
|
|
68
|
+
void update(const var_opt_sketch<T,S,A>& sk);
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* Updates this union with the given sketch
|
|
72
|
+
* This method takes an rvalue.
|
|
73
|
+
* @param sk a sketch to add to the union
|
|
74
|
+
*/
|
|
75
|
+
void update(var_opt_sketch<T,S,A>&& sk);
|
|
76
|
+
|
|
77
|
+
/**
|
|
78
|
+
* Gets the varopt sketch resulting from the union of any input sketches.
|
|
79
|
+
* @return a varopt sketch
|
|
80
|
+
*/
|
|
81
|
+
var_opt_sketch<T,S,A> get_result() const;
|
|
82
|
+
|
|
83
|
+
/**
|
|
84
|
+
* Resets the union to its default, empty state.
|
|
85
|
+
*/
|
|
86
|
+
void reset();
|
|
87
|
+
|
|
88
|
+
/**
|
|
89
|
+
* Computes size needed to serialize the current state of the union.
|
|
90
|
+
* This version is for all other types and can be expensive since every item needs to be looked at.
|
|
91
|
+
* @return size in bytes needed to serialize this sketch
|
|
92
|
+
*/
|
|
93
|
+
size_t get_serialized_size_bytes() const;
|
|
94
|
+
|
|
95
|
+
// This is a convenience alias for users
|
|
96
|
+
// The type returned by the following serialize method
|
|
97
|
+
typedef vector_u8<A> vector_bytes;
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* NOTE: This method may be deprecated in a future version.
|
|
101
|
+
* This method serializes the sketch as a vector of bytes.
|
|
102
|
+
* An optional header can be reserved in front of the sketch.
|
|
103
|
+
* It is a blank space of a given size.
|
|
104
|
+
* This header is used in Datasketches PostgreSQL extension.
|
|
105
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
|
106
|
+
*/
|
|
107
|
+
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* NOTE: This method may be deprecated in a future version.
|
|
111
|
+
* This method serializes the sketch into a given stream in a binary form
|
|
112
|
+
* @param os output stream
|
|
113
|
+
*/
|
|
114
|
+
void serialize(std::ostream& os) const;
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* NOTE: This method may be deprecated in a future version.
|
|
118
|
+
* This method deserializes a union from a given stream.
|
|
119
|
+
* @param is input stream
|
|
120
|
+
* @return an instance of a union
|
|
121
|
+
*/
|
|
122
|
+
static var_opt_union deserialize(std::istream& is);
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* NOTE: This method may be deprecated in a future version.
|
|
126
|
+
* This method deserializes a skeuniontch from a given array of bytes.
|
|
127
|
+
* @param bytes pointer to the array of bytes
|
|
128
|
+
* @param size the size of the array
|
|
129
|
+
* @return an instance of a union
|
|
130
|
+
*/
|
|
131
|
+
static var_opt_union deserialize(const void* bytes, size_t size);
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Prints a summary of the union as a string.
|
|
135
|
+
* @return the summary as a string
|
|
136
|
+
*/
|
|
137
|
+
string<A> to_string() const;
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
private:
|
|
141
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T,S,A>> AllocSketch;
|
|
142
|
+
|
|
143
|
+
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
|
144
|
+
static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
|
|
145
|
+
static const uint8_t SER_VER = 2;
|
|
146
|
+
static const uint8_t FAMILY_ID = 14;
|
|
147
|
+
static const uint8_t EMPTY_FLAG_MASK = 4;
|
|
148
|
+
|
|
149
|
+
uint64_t n_; // cumulative over all input sketches
|
|
150
|
+
|
|
151
|
+
// outer tau is the largest tau of any input sketch
|
|
152
|
+
double outer_tau_numer_; // total weight of all input R-zones where tau = outer_tau
|
|
153
|
+
|
|
154
|
+
// total cardinality of the same R-zones, or zero if no input sketch was in estimation mode
|
|
155
|
+
uint64_t outer_tau_denom_;
|
|
156
|
+
|
|
157
|
+
uint32_t max_k_;
|
|
158
|
+
|
|
159
|
+
var_opt_sketch<T,S,A> gadget_;
|
|
160
|
+
|
|
161
|
+
var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
|
162
|
+
uint32_t max_k, var_opt_sketch<T,S,A>&& gadget);
|
|
163
|
+
|
|
164
|
+
/*
|
|
165
|
+
IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
|
|
166
|
+
but in fact is NOT because it doesn't satisfy the mathematical definition
|
|
167
|
+
of a varopt sketch of the concatenated input streams. Therefore it could be different
|
|
168
|
+
from a true varopt sketch with that value of K, in which case it could easily provide
|
|
169
|
+
worse estimation accuracy for subset-sum queries.
|
|
170
|
+
|
|
171
|
+
This should not surprise you; the approximation guarantees of varopt sketches
|
|
172
|
+
do not apply to things that merely resemble varopt sketches.
|
|
173
|
+
|
|
174
|
+
However, even though the gadget is not a varopt sketch, the result
|
|
175
|
+
of the unioning process IS a varopt sketch. It is constructed by a
|
|
176
|
+
somewhat complicated "resolution" process which determines the largest K
|
|
177
|
+
that a valid varopt sketch could have given the available information,
|
|
178
|
+
then constructs a varopt sketch of that size and returns it.
|
|
179
|
+
|
|
180
|
+
However, the gadget itself is not touched during the resolution process,
|
|
181
|
+
and additional sketches could subsequently be merged into the union,
|
|
182
|
+
at which point a varopt result could again be requested.
|
|
183
|
+
*/
|
|
184
|
+
|
|
185
|
+
/*
|
|
186
|
+
Explanation of "marked items" in the union's gadget:
|
|
187
|
+
|
|
188
|
+
The boolean value "true" in an pair indicates that the item
|
|
189
|
+
came from an input sketch's R zone, so it is already the result of sampling.
|
|
190
|
+
|
|
191
|
+
Therefore it must not wind up in the H zone of the final result, because
|
|
192
|
+
that would imply that the item is "exact".
|
|
193
|
+
|
|
194
|
+
However, it is okay for a marked item to hang out in the gadget's H zone for a while.
|
|
195
|
+
|
|
196
|
+
And once the item has moved to the gadget's R zone, the mark is never checked again,
|
|
197
|
+
so no effort is made to ensure that its value is preserved or even makes sense.
|
|
198
|
+
*/
|
|
199
|
+
|
|
200
|
+
/*
|
|
201
|
+
Note: if the computer could perform exact real-valued arithmetic, the union could finalize
|
|
202
|
+
its result by reducing k until inner_tau > outer_tau. [Due to the vagaries of floating point
|
|
203
|
+
arithmetic, we won't attempt to detect and specially handle the inner_tau = outer_tau special
|
|
204
|
+
case.]
|
|
205
|
+
|
|
206
|
+
In fact, we won't even look at tau while while reducing k. Instead the logic will be based
|
|
207
|
+
on the more robust integer quantity num_marks_in_h_ in the gadget. It is conceivable that due
|
|
208
|
+
to round-off error we could end up with inner_tau slightly less than outer_tau, but that should
|
|
209
|
+
be fairly harmless since we will have achieved our goal of getting the marked items out of H.
|
|
210
|
+
|
|
211
|
+
Also, you might be wondering why we are bothering to maintain the numerator and denominator
|
|
212
|
+
separately instead of just having a single variable outer_tau. This allows us (in certain
|
|
213
|
+
cases) to add an input's entire R-zone weight into the result sketch, as opposed to subdividing
|
|
214
|
+
it then adding it back up. That would be a source of numerical inaccuracy. And even
|
|
215
|
+
more importantly, this design choice allows us to exactly re-construct the input sketch
|
|
216
|
+
when there is only one of them.
|
|
217
|
+
*/
|
|
218
|
+
inline void merge_items(const var_opt_sketch<T,S,A>& sk);
|
|
219
|
+
inline void merge_items(var_opt_sketch<T,S,A>&& sk);
|
|
220
|
+
inline void resolve_tau(const var_opt_sketch<T,S,A>& sketch);
|
|
221
|
+
|
|
222
|
+
double get_outer_tau() const;
|
|
223
|
+
|
|
224
|
+
var_opt_sketch<T,S,A> simple_gadget_coercer() const;
|
|
225
|
+
|
|
226
|
+
bool there_exist_unmarked_h_items_lighter_than_target(double threshold) const;
|
|
227
|
+
bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const;
|
|
228
|
+
void mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const;
|
|
229
|
+
void migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& sk) const;
|
|
230
|
+
|
|
231
|
+
static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
|
|
232
|
+
static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
|
|
233
|
+
};
|
|
234
|
+
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
#include "var_opt_union_impl.hpp"
|
|
238
|
+
|
|
239
|
+
#endif // _VAR_OPT_UNION_HPP_
|
|
@@ -0,0 +1,645 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef _VAR_OPT_UNION_IMPL_HPP_
|
|
21
|
+
#define _VAR_OPT_UNION_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include "var_opt_union.hpp"
|
|
24
|
+
|
|
25
|
+
#include <cmath>
|
|
26
|
+
#include <sstream>
|
|
27
|
+
|
|
28
|
+
namespace datasketches {
|
|
29
|
+
|
|
30
|
+
template<typename T, typename S, typename A>
|
|
31
|
+
var_opt_union<T,S,A>::var_opt_union(uint32_t max_k) :
|
|
32
|
+
n_(0),
|
|
33
|
+
outer_tau_numer_(0),
|
|
34
|
+
outer_tau_denom_(0.0),
|
|
35
|
+
max_k_(max_k),
|
|
36
|
+
gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true)
|
|
37
|
+
{}
|
|
38
|
+
|
|
39
|
+
template<typename T, typename S, typename A>
|
|
40
|
+
var_opt_union<T,S,A>::var_opt_union(const var_opt_union& other) :
|
|
41
|
+
n_(other.n_),
|
|
42
|
+
outer_tau_numer_(other.outer_tau_numer_),
|
|
43
|
+
outer_tau_denom_(other.outer_tau_denom_),
|
|
44
|
+
max_k_(other.max_k_),
|
|
45
|
+
gadget_(other.gadget_)
|
|
46
|
+
{}
|
|
47
|
+
|
|
48
|
+
template<typename T, typename S, typename A>
|
|
49
|
+
var_opt_union<T,S,A>::var_opt_union(var_opt_union&& other) noexcept :
|
|
50
|
+
n_(other.n_),
|
|
51
|
+
outer_tau_numer_(other.outer_tau_numer_),
|
|
52
|
+
outer_tau_denom_(other.outer_tau_denom_),
|
|
53
|
+
max_k_(other.max_k_),
|
|
54
|
+
gadget_(std::move(other.gadget_))
|
|
55
|
+
{}
|
|
56
|
+
|
|
57
|
+
template<typename T, typename S, typename A>
|
|
58
|
+
var_opt_union<T,S,A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
|
59
|
+
uint32_t max_k, var_opt_sketch<T,S,A>&& gadget) :
|
|
60
|
+
n_(n),
|
|
61
|
+
outer_tau_numer_(outer_tau_numer),
|
|
62
|
+
outer_tau_denom_(outer_tau_denom),
|
|
63
|
+
max_k_(max_k),
|
|
64
|
+
gadget_(gadget)
|
|
65
|
+
{}
|
|
66
|
+
|
|
67
|
+
template<typename T, typename S, typename A>
|
|
68
|
+
var_opt_union<T,S,A>::~var_opt_union() {}
|
|
69
|
+
|
|
70
|
+
template<typename T, typename S, typename A>
|
|
71
|
+
var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(const var_opt_union& other) {
|
|
72
|
+
var_opt_union<T,S,A> union_copy(other);
|
|
73
|
+
std::swap(n_, union_copy.n_);
|
|
74
|
+
std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
|
|
75
|
+
std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
|
|
76
|
+
std::swap(max_k_, union_copy.max_k_);
|
|
77
|
+
std::swap(gadget_, union_copy.gadget_);
|
|
78
|
+
return *this;
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
template<typename T, typename S, typename A>
|
|
82
|
+
var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
|
|
83
|
+
std::swap(n_, other.n_);
|
|
84
|
+
std::swap(outer_tau_numer_, other.outer_tau_numer_);
|
|
85
|
+
std::swap(outer_tau_denom_, other.outer_tau_denom_);
|
|
86
|
+
std::swap(max_k_, other.max_k_);
|
|
87
|
+
std::swap(gadget_, other.gadget_);
|
|
88
|
+
return *this;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
/*
|
|
92
|
+
* An empty union requires 8 bytes.
|
|
93
|
+
*
|
|
94
|
+
* <pre>
|
|
95
|
+
* Long || Start Byte Adr:
|
|
96
|
+
* Adr:
|
|
97
|
+
* || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
|
98
|
+
* 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
|
|
99
|
+
* </pre>
|
|
100
|
+
*
|
|
101
|
+
* A non-empty sketch requires 24 bytes of preamble for an under-full sample; once there are
|
|
102
|
+
* at least k items the sketch uses 32 bytes of preamble.
|
|
103
|
+
*
|
|
104
|
+
* The count of items seen is limited to 48 bits (~256 trillion) even though there are adjacent
|
|
105
|
+
* unused preamble bits. The acceptance probability for an item is a double in the range [0,1),
|
|
106
|
+
* limiting us to 53 bits of randomness due to details of the IEEE floating point format. To
|
|
107
|
+
* ensure meaningful probabilities as the items seen count approaches capacity, we intentionally
|
|
108
|
+
* use slightly fewer bits.
|
|
109
|
+
*
|
|
110
|
+
* Following the header are weights for the heavy items, then marks in the event this is a gadget.
|
|
111
|
+
* The serialized items come last.
|
|
112
|
+
*
|
|
113
|
+
* <pre>
|
|
114
|
+
* Long || Start Byte Adr:
|
|
115
|
+
* Adr:
|
|
116
|
+
* || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
|
|
117
|
+
* 0 || Preamble_Longs | SerVer | FamID | Flags |---------Max Res. Size (K)---------|
|
|
118
|
+
*
|
|
119
|
+
* || 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
|
|
120
|
+
* 1 ||---------------------------Items Seen Count (N)--------------------------------|
|
|
121
|
+
*
|
|
122
|
+
* || 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
|
|
123
|
+
* 2 ||------------------------Outer Tau Numerator (double)---------------------------|
|
|
124
|
+
*
|
|
125
|
+
* || 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |
|
|
126
|
+
* 3 ||----------------------Outer Tau Denominator (uint64_t)-------------------------|
|
|
127
|
+
* </pre>
|
|
128
|
+
*/
|
|
129
|
+
|
|
130
|
+
template<typename T, typename S, typename A>
|
|
131
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
|
|
132
|
+
uint8_t preamble_longs;
|
|
133
|
+
is.read((char*)&preamble_longs, sizeof(preamble_longs));
|
|
134
|
+
uint8_t serial_version;
|
|
135
|
+
is.read((char*)&serial_version, sizeof(serial_version));
|
|
136
|
+
uint8_t family_id;
|
|
137
|
+
is.read((char*)&family_id, sizeof(family_id));
|
|
138
|
+
uint8_t flags;
|
|
139
|
+
is.read((char*)&flags, sizeof(flags));
|
|
140
|
+
uint32_t max_k;
|
|
141
|
+
is.read((char*)&max_k, sizeof(max_k));
|
|
142
|
+
|
|
143
|
+
check_preamble_longs(preamble_longs, flags);
|
|
144
|
+
check_family_and_serialization_version(family_id, serial_version);
|
|
145
|
+
|
|
146
|
+
if (max_k == 0 || max_k > MAX_K) {
|
|
147
|
+
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
bool is_empty = flags & EMPTY_FLAG_MASK;
|
|
151
|
+
|
|
152
|
+
if (is_empty) {
|
|
153
|
+
if (!is.good())
|
|
154
|
+
throw std::runtime_error("error reading from std::istream");
|
|
155
|
+
else
|
|
156
|
+
return var_opt_union<T,S,A>(max_k);
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
uint64_t items_seen;
|
|
160
|
+
is.read((char*)&items_seen, sizeof(items_seen));
|
|
161
|
+
double outer_tau_numer;
|
|
162
|
+
is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
|
|
163
|
+
uint64_t outer_tau_denom;
|
|
164
|
+
is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
|
|
165
|
+
|
|
166
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is);
|
|
167
|
+
|
|
168
|
+
if (!is.good())
|
|
169
|
+
throw std::runtime_error("error reading from std::istream");
|
|
170
|
+
|
|
171
|
+
return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
template<typename T, typename S, typename A>
|
|
175
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size) {
|
|
176
|
+
ensure_minimum_memory(size, 8);
|
|
177
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
178
|
+
uint8_t preamble_longs;
|
|
179
|
+
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
|
180
|
+
uint8_t serial_version;
|
|
181
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
|
182
|
+
uint8_t family_id;
|
|
183
|
+
ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
|
|
184
|
+
uint8_t flags;
|
|
185
|
+
ptr += copy_from_mem(ptr, &flags, sizeof(flags));
|
|
186
|
+
uint32_t max_k;
|
|
187
|
+
ptr += copy_from_mem(ptr, &max_k, sizeof(max_k));
|
|
188
|
+
|
|
189
|
+
check_preamble_longs(preamble_longs, flags);
|
|
190
|
+
check_family_and_serialization_version(family_id, serial_version);
|
|
191
|
+
|
|
192
|
+
if (max_k == 0 || max_k > MAX_K) {
|
|
193
|
+
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
bool is_empty = flags & EMPTY_FLAG_MASK;
|
|
197
|
+
|
|
198
|
+
if (is_empty) {
|
|
199
|
+
return var_opt_union<T,S,A>(max_k);
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
uint64_t items_seen;
|
|
203
|
+
ptr += copy_from_mem(ptr, &items_seen, sizeof(items_seen));
|
|
204
|
+
double outer_tau_numer;
|
|
205
|
+
ptr += copy_from_mem(ptr, &outer_tau_numer, sizeof(outer_tau_numer));
|
|
206
|
+
uint64_t outer_tau_denom;
|
|
207
|
+
ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
|
|
208
|
+
|
|
209
|
+
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
|
210
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size);
|
|
211
|
+
|
|
212
|
+
return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
template<typename T, typename S, typename A>
|
|
216
|
+
size_t var_opt_union<T,S,A>::get_serialized_size_bytes() const {
|
|
217
|
+
if (n_ == 0) {
|
|
218
|
+
return PREAMBLE_LONGS_EMPTY << 3;
|
|
219
|
+
} else {
|
|
220
|
+
return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes();
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
template<typename T, typename S, typename A>
|
|
225
|
+
void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
|
|
226
|
+
bool empty = (n_ == 0);
|
|
227
|
+
|
|
228
|
+
const uint8_t serialization_version(SER_VER);
|
|
229
|
+
const uint8_t family_id(FAMILY_ID);
|
|
230
|
+
|
|
231
|
+
uint8_t preamble_longs;
|
|
232
|
+
uint8_t flags;
|
|
233
|
+
if (empty) {
|
|
234
|
+
preamble_longs = PREAMBLE_LONGS_EMPTY;
|
|
235
|
+
flags = EMPTY_FLAG_MASK;
|
|
236
|
+
} else {
|
|
237
|
+
preamble_longs = PREAMBLE_LONGS_NON_EMPTY;
|
|
238
|
+
flags = 0;
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
os.write((char*) &preamble_longs, sizeof(uint8_t));
|
|
242
|
+
os.write((char*) &serialization_version, sizeof(uint8_t));
|
|
243
|
+
os.write((char*) &family_id, sizeof(uint8_t));
|
|
244
|
+
os.write((char*) &flags, sizeof(uint8_t));
|
|
245
|
+
os.write((char*) &max_k_, sizeof(uint32_t));
|
|
246
|
+
|
|
247
|
+
if (!empty) {
|
|
248
|
+
os.write((char*) &n_, sizeof(uint64_t));
|
|
249
|
+
os.write((char*) &outer_tau_numer_, sizeof(double));
|
|
250
|
+
os.write((char*) &outer_tau_denom_, sizeof(uint64_t));
|
|
251
|
+
gadget_.serialize(os);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
template<typename T, typename S, typename A>
|
|
256
|
+
std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
|
|
257
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
|
258
|
+
std::vector<uint8_t, AllocU8<A>> bytes(size);
|
|
259
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
260
|
+
|
|
261
|
+
const bool empty = n_ == 0;
|
|
262
|
+
|
|
263
|
+
const uint8_t serialization_version(SER_VER);
|
|
264
|
+
const uint8_t family_id(FAMILY_ID);
|
|
265
|
+
|
|
266
|
+
uint8_t preamble_longs;
|
|
267
|
+
uint8_t flags;
|
|
268
|
+
|
|
269
|
+
if (empty) {
|
|
270
|
+
preamble_longs = PREAMBLE_LONGS_EMPTY;
|
|
271
|
+
flags = EMPTY_FLAG_MASK;
|
|
272
|
+
} else {
|
|
273
|
+
preamble_longs = PREAMBLE_LONGS_NON_EMPTY;
|
|
274
|
+
flags = 0;
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
// first prelong
|
|
278
|
+
ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
|
|
279
|
+
ptr += copy_to_mem(&serialization_version, ptr, sizeof(uint8_t));
|
|
280
|
+
ptr += copy_to_mem(&family_id, ptr, sizeof(uint8_t));
|
|
281
|
+
ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
|
|
282
|
+
ptr += copy_to_mem(&max_k_, ptr, sizeof(uint32_t));
|
|
283
|
+
|
|
284
|
+
if (!empty) {
|
|
285
|
+
ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
|
|
286
|
+
ptr += copy_to_mem(&outer_tau_numer_, ptr, sizeof(double));
|
|
287
|
+
ptr += copy_to_mem(&outer_tau_denom_, ptr, sizeof(uint64_t));
|
|
288
|
+
|
|
289
|
+
auto gadget_bytes = gadget_.serialize();
|
|
290
|
+
ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
return bytes;
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
template<typename T, typename S, typename A>
|
|
297
|
+
void var_opt_union<T,S,A>::reset() {
|
|
298
|
+
n_ = 0;
|
|
299
|
+
outer_tau_numer_ = 0.0;
|
|
300
|
+
outer_tau_denom_ = 0;
|
|
301
|
+
gadget_.reset();
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
template<typename T, typename S, typename A>
|
|
305
|
+
string<A> var_opt_union<T,S,A>::to_string() const {
|
|
306
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
|
307
|
+
os << "### VarOpt Union SUMMARY: " << std::endl;
|
|
308
|
+
os << " . n : " << n_ << std::endl;
|
|
309
|
+
os << " Max k : " << max_k_ << std::endl;
|
|
310
|
+
os << " Gadget Summary: " << std::endl;
|
|
311
|
+
os << gadget_.to_string();
|
|
312
|
+
os << "### END VarOpt Union SUMMARY: " << std::endl;
|
|
313
|
+
return os.str();
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
template<typename T, typename S, typename A>
|
|
317
|
+
void var_opt_union<T,S,A>::update(const var_opt_sketch<T,S,A>& sk) {
|
|
318
|
+
merge_items(sk);
|
|
319
|
+
resolve_tau(sk);
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
template<typename T, typename S, typename A>
|
|
323
|
+
void var_opt_union<T,S,A>::update(var_opt_sketch<T,S,A>&& sk) {
|
|
324
|
+
merge_items(std::move(sk));
|
|
325
|
+
resolve_tau(sk); // don't need items, so ok even if they've been moved out
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
template<typename T, typename S, typename A>
|
|
329
|
+
double var_opt_union<T,S,A>::get_outer_tau() const {
|
|
330
|
+
if (outer_tau_denom_ == 0) {
|
|
331
|
+
return 0.0;
|
|
332
|
+
} else {
|
|
333
|
+
return outer_tau_numer_ / outer_tau_denom_;
|
|
334
|
+
}
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
template<typename T, typename S, typename A>
|
|
338
|
+
void var_opt_union<T,S,A>::merge_items(const var_opt_sketch<T,S,A>& sketch) {
|
|
339
|
+
if (sketch.n_ == 0) {
|
|
340
|
+
return;
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
n_ += sketch.n_;
|
|
344
|
+
|
|
345
|
+
// H region const_iterator
|
|
346
|
+
typename var_opt_sketch<T,S,A>::const_iterator h_itr(sketch, false, false);
|
|
347
|
+
typename var_opt_sketch<T,S,A>::const_iterator h_end(sketch, true, false);
|
|
348
|
+
while (h_itr != h_end) {
|
|
349
|
+
std::pair<const T&, const double> sample = *h_itr;
|
|
350
|
+
gadget_.update(sample.first, sample.second, false);
|
|
351
|
+
++h_itr;
|
|
352
|
+
}
|
|
353
|
+
|
|
354
|
+
// Weight-correcting R region iterator (const_iterator doesn't do the correction)
|
|
355
|
+
typename var_opt_sketch<T,S,A>::iterator r_itr(sketch, false, true);
|
|
356
|
+
typename var_opt_sketch<T,S,A>::iterator r_end(sketch, true, true);
|
|
357
|
+
while (r_itr != r_end) {
|
|
358
|
+
std::pair<const T&, const double> sample = *r_itr;
|
|
359
|
+
gadget_.update(sample.first, sample.second, true);
|
|
360
|
+
++r_itr;
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
template<typename T, typename S, typename A>
|
|
365
|
+
void var_opt_union<T,S,A>::merge_items(var_opt_sketch<T,S,A>&& sketch) {
|
|
366
|
+
if (sketch.n_ == 0) {
|
|
367
|
+
return;
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
n_ += sketch.n_;
|
|
371
|
+
|
|
372
|
+
// H region iterator
|
|
373
|
+
typename var_opt_sketch<T,S,A>::iterator h_itr(sketch, false, false);
|
|
374
|
+
typename var_opt_sketch<T,S,A>::iterator h_end(sketch, true, false);
|
|
375
|
+
while (h_itr != h_end) {
|
|
376
|
+
std::pair<T&, double> sample = *h_itr;
|
|
377
|
+
gadget_.update(std::move(sample.first), sample.second, false);
|
|
378
|
+
++h_itr;
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
// Weight-correcting R region iterator
|
|
382
|
+
typename var_opt_sketch<T,S,A>::iterator r_itr(sketch, false, true);
|
|
383
|
+
typename var_opt_sketch<T,S,A>::iterator r_end(sketch, true, true);
|
|
384
|
+
while (r_itr != r_end) {
|
|
385
|
+
std::pair<T&, double> sample = *r_itr;
|
|
386
|
+
gadget_.update(std::move(sample.first), sample.second, true);
|
|
387
|
+
++r_itr;
|
|
388
|
+
}
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
template<typename T, typename S, typename A>
|
|
392
|
+
void var_opt_union<T,S,A>::resolve_tau(const var_opt_sketch<T,S,A>& sketch) {
|
|
393
|
+
if (sketch.r_ > 0) {
|
|
394
|
+
const double sketch_tau = sketch.get_tau();
|
|
395
|
+
const double outer_tau = get_outer_tau();
|
|
396
|
+
|
|
397
|
+
if (outer_tau_denom_ == 0) {
|
|
398
|
+
// detect first estimation mode sketch and grab its tau
|
|
399
|
+
outer_tau_numer_ = sketch.total_wt_r_;
|
|
400
|
+
outer_tau_denom_ = sketch.r_;
|
|
401
|
+
} else if (sketch_tau > outer_tau) {
|
|
402
|
+
// switch to a bigger value of outer_tau
|
|
403
|
+
outer_tau_numer_ = sketch.total_wt_r_;
|
|
404
|
+
outer_tau_denom_ = sketch.r_;
|
|
405
|
+
} else if (sketch_tau == outer_tau) {
|
|
406
|
+
// Ok if previous equality test isn't quite perfect. Mistakes in either direction should
|
|
407
|
+
// be fairly benign.
|
|
408
|
+
// Without conceptually changing outer_tau, update number and denominator. In particular,
|
|
409
|
+
// add the total weight of the incoming reservoir to the running total.
|
|
410
|
+
outer_tau_numer_ += sketch.total_wt_r_;
|
|
411
|
+
outer_tau_denom_ += sketch.r_;
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
// do nothing if sketch's tau is smaller than outer_tau
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
|
|
418
|
+
template<typename T, typename S, typename A>
|
|
419
|
+
var_opt_sketch<T,S,A> var_opt_union<T,S,A>::get_result() const {
|
|
420
|
+
// If no marked items in H, gadget is already valid mathematically. We can return what is
|
|
421
|
+
// basically just a copy of the gadget.
|
|
422
|
+
if (gadget_.num_marks_in_h_ == 0) {
|
|
423
|
+
return simple_gadget_coercer();
|
|
424
|
+
} else {
|
|
425
|
+
// Copy of gadget. This may produce needless copying in the
|
|
426
|
+
// pseudo-exact case below, but should simplify the code without
|
|
427
|
+
// needing to make the gadget a pointer
|
|
428
|
+
var_opt_sketch<T,S,A> gcopy(gadget_, false, n_);
|
|
429
|
+
|
|
430
|
+
// At this point, we know that marked items are present in H. So:
|
|
431
|
+
// 1. Result will necessarily be in estimation mode
|
|
432
|
+
// 2. Marked items currently in H need to be absorbed into reservoir (R)
|
|
433
|
+
const bool is_pseudo_exact = detect_and_handle_subcase_of_pseudo_exact(gcopy);
|
|
434
|
+
if (!is_pseudo_exact) {
|
|
435
|
+
// continue with main logic
|
|
436
|
+
migrate_marked_items_by_decreasing_k(gcopy);
|
|
437
|
+
}
|
|
438
|
+
// sub-case was already detected and handled, so return the result
|
|
439
|
+
return gcopy;
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
/**
|
|
444
|
+
* When there are no marked items in H, the gadget is mathematically equivalent to a valid
|
|
445
|
+
* varopt sketch. This method simply returns a copy (without perserving marks).
|
|
446
|
+
*
|
|
447
|
+
* @return A shallow copy of the gadget as valid varopt sketch
|
|
448
|
+
*/
|
|
449
|
+
template<typename T, typename S, typename A>
|
|
450
|
+
var_opt_sketch<T,S,A> var_opt_union<T,S,A>::simple_gadget_coercer() const {
|
|
451
|
+
if (gadget_.num_marks_in_h_ != 0) throw std::logic_error("simple gadget coercer only applies if no marks");
|
|
452
|
+
return var_opt_sketch<T,S,A>(gadget_, true, n_);
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
// this is a condition checked in detect_and_handle_subcase_of_pseudo_exact()
|
|
456
|
+
template<typename T, typename S, typename A>
|
|
457
|
+
bool var_opt_union<T,S,A>::there_exist_unmarked_h_items_lighter_than_target(double threshold) const {
|
|
458
|
+
for (uint32_t i = 0; i < gadget_.h_; ++i) {
|
|
459
|
+
if ((gadget_.weights_[i] < threshold) && !gadget_.marks_[i]) {
|
|
460
|
+
return true;
|
|
461
|
+
}
|
|
462
|
+
}
|
|
463
|
+
return false;
|
|
464
|
+
}
|
|
465
|
+
|
|
466
|
+
template<typename T, typename S, typename A>
|
|
467
|
+
bool var_opt_union<T,S,A>::detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const {
|
|
468
|
+
// gadget is seemingly exact
|
|
469
|
+
const bool condition1 = gadget_.r_ == 0;
|
|
470
|
+
|
|
471
|
+
// but there are marked items in H, so only _pseudo_ exact
|
|
472
|
+
const bool condition2 = gadget_.num_marks_in_h_ > 0;
|
|
473
|
+
|
|
474
|
+
// if gadget is pseudo-exact and the number of marks equals outer_tau_denom, then we can deduce
|
|
475
|
+
// from the bookkeeping logic of resolve_tau() that all estimation mode input sketches must
|
|
476
|
+
// have had the same tau, so we can throw all of the marked items into a common reservoir.
|
|
477
|
+
const bool condition3 = gadget_.num_marks_in_h_ == outer_tau_denom_;
|
|
478
|
+
|
|
479
|
+
if (!(condition1 && condition2 && condition3)) {
|
|
480
|
+
return false;
|
|
481
|
+
} else {
|
|
482
|
+
|
|
483
|
+
// explicitly enforce rule that items in H should not be lighter than the sketch's tau
|
|
484
|
+
const bool anti_condition4 = there_exist_unmarked_h_items_lighter_than_target(gadget_.get_tau());
|
|
485
|
+
if (anti_condition4) {
|
|
486
|
+
return false;
|
|
487
|
+
} else {
|
|
488
|
+
// conditions 1 through 4 hold
|
|
489
|
+
mark_moving_gadget_coercer(sk);
|
|
490
|
+
return true;
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
}
|
|
494
|
+
|
|
495
|
+
/**
|
|
496
|
+
* This coercer directly transfers marked items from the gadget's H into the result's R.
|
|
497
|
+
* Deciding whether that is a valid thing to do is the responsibility of the caller. Currently,
|
|
498
|
+
* this is only used for a subcase of pseudo-exact, but later it might be used by other
|
|
499
|
+
* subcases as well.
|
|
500
|
+
*
|
|
501
|
+
* @param sk Copy of the gadget, modified with marked items moved to the reservoir
|
|
502
|
+
*/
|
|
503
|
+
template<typename T, typename S, typename A>
|
|
504
|
+
void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const {
|
|
505
|
+
const uint32_t result_k = gadget_.h_ + gadget_.r_;
|
|
506
|
+
|
|
507
|
+
uint32_t result_h = 0;
|
|
508
|
+
uint32_t result_r = 0;
|
|
509
|
+
size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
|
|
510
|
+
|
|
511
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
|
|
512
|
+
double* wts = AllocDouble().allocate(result_k + 1);
|
|
513
|
+
T* data = A().allocate(result_k + 1);
|
|
514
|
+
|
|
515
|
+
// insert R region items, ignoring weights
|
|
516
|
+
// Currently (May 2017) this next block is unreachable; this coercer is used only in the
|
|
517
|
+
// pseudo-exact case in which case there are no items natively in R, only marked items in H
|
|
518
|
+
// that will be moved into R as part of the coercion process.
|
|
519
|
+
// Addedndum (Jan 2020): Cleanup at end of method assumes R count is 0
|
|
520
|
+
const size_t final_idx = gadget_.get_num_samples();
|
|
521
|
+
for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
|
|
522
|
+
A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
|
|
523
|
+
wts[next_r_pos] = gadget_.weights_[idx];
|
|
524
|
+
++result_r;
|
|
525
|
+
--next_r_pos;
|
|
526
|
+
}
|
|
527
|
+
|
|
528
|
+
double transferred_weight = 0;
|
|
529
|
+
|
|
530
|
+
// insert H region items
|
|
531
|
+
for (size_t idx = 0; idx < gadget_.h_; ++idx) {
|
|
532
|
+
if (gadget_.marks_[idx]) {
|
|
533
|
+
A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
|
|
534
|
+
wts[next_r_pos] = -1.0;
|
|
535
|
+
transferred_weight += gadget_.weights_[idx];
|
|
536
|
+
++result_r;
|
|
537
|
+
--next_r_pos;
|
|
538
|
+
} else {
|
|
539
|
+
A().construct(&data[result_h], T(gadget_.data_[idx]));
|
|
540
|
+
wts[result_h] = gadget_.weights_[idx];
|
|
541
|
+
++result_h;
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
|
|
546
|
+
if (fabs(transferred_weight - outer_tau_numer_) > 1e-10) {
|
|
547
|
+
throw std::logic_error("uexpected mismatch in transferred weight");
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
const double result_r_weight = gadget_.total_wt_r_ + transferred_weight;
|
|
551
|
+
const uint64_t result_n = n_;
|
|
552
|
+
|
|
553
|
+
// explicitly set weight value for the gap
|
|
554
|
+
wts[result_h] = -1.0;
|
|
555
|
+
|
|
556
|
+
// clean up arrays in input sketch, replace with new values
|
|
557
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
|
|
558
|
+
AllocBool().deallocate(sk.marks_, sk.curr_items_alloc_);
|
|
559
|
+
AllocDouble().deallocate(sk.weights_, sk.curr_items_alloc_);
|
|
560
|
+
for (size_t i = 0; i < result_k; ++i) { A().destroy(sk.data_ + i); } // assumes everything in H region, no gap
|
|
561
|
+
A().deallocate(sk.data_, sk.curr_items_alloc_);
|
|
562
|
+
|
|
563
|
+
sk.data_ = data;
|
|
564
|
+
sk.weights_ = wts;
|
|
565
|
+
sk.marks_ = nullptr;
|
|
566
|
+
sk.num_marks_in_h_ = 0;
|
|
567
|
+
sk.curr_items_alloc_ = result_k + 1;
|
|
568
|
+
sk.k_ = result_k;
|
|
569
|
+
sk.n_ = result_n;
|
|
570
|
+
sk.h_ = result_h;
|
|
571
|
+
sk.r_ = result_r;
|
|
572
|
+
sk.total_wt_r_ = result_r_weight;
|
|
573
|
+
}
|
|
574
|
+
|
|
575
|
+
// this is basically a continuation of get_result(), but modifying the input gadget copy
|
|
576
|
+
template<typename T, typename S, typename A>
|
|
577
|
+
void var_opt_union<T,S,A>::migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& gcopy) const {
|
|
578
|
+
const uint32_t r_count = gcopy.r_;
|
|
579
|
+
const uint32_t h_count = gcopy.h_;
|
|
580
|
+
const uint32_t k = gcopy.k_;
|
|
581
|
+
|
|
582
|
+
// should be ensured by caller
|
|
583
|
+
if (gcopy.num_marks_in_h_ == 0) throw std::logic_error("unexpectedly found no marked items to migrate");
|
|
584
|
+
// either full (of samples), in pseudo-exact mode, or both
|
|
585
|
+
if ((r_count != 0) && ((h_count + r_count) != k)) throw std::logic_error("invalid gadget state");
|
|
586
|
+
|
|
587
|
+
// if non-full and pseudo-exact, change k so that gcopy is full
|
|
588
|
+
if ((r_count == 0) && (h_count < k)) {
|
|
589
|
+
gcopy.k_ = h_count; // may leve extra space allocated but that's ok
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
// Now k equals the number of samples, so reducing k will increase tau.
|
|
593
|
+
// Also, we know that there are at least 2 samples because 0 or 1 would have been handled
|
|
594
|
+
// by the earlier logic in get_result()
|
|
595
|
+
gcopy.decrease_k_by_1();
|
|
596
|
+
|
|
597
|
+
// gcopy is now in estimation mode, just like the final result must be (due to marked items)
|
|
598
|
+
if (gcopy.get_tau() == 0.0) throw std::logic_error("gadget must be in sampling mode");
|
|
599
|
+
|
|
600
|
+
// keep reducing k until all marked items have been absorbed into the reservoir
|
|
601
|
+
while (gcopy.num_marks_in_h_ > 0) {
|
|
602
|
+
// gcopy.k_ >= 2 because h_ and r_ are both at least 1, but checked in next method anyway
|
|
603
|
+
gcopy.decrease_k_by_1();
|
|
604
|
+
}
|
|
605
|
+
|
|
606
|
+
gcopy.strip_marks();
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
template<typename T, typename S, typename A>
|
|
610
|
+
void var_opt_union<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
|
|
611
|
+
bool is_empty(flags & EMPTY_FLAG_MASK);
|
|
612
|
+
|
|
613
|
+
if (is_empty) {
|
|
614
|
+
if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
|
|
615
|
+
throw std::invalid_argument("Possible corruption: Preamble longs must be "
|
|
616
|
+
+ std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
|
|
617
|
+
+ std::to_string(preamble_longs));
|
|
618
|
+
}
|
|
619
|
+
} else {
|
|
620
|
+
if (preamble_longs != PREAMBLE_LONGS_NON_EMPTY) {
|
|
621
|
+
throw std::invalid_argument("Possible corruption: Preamble longs must be "
|
|
622
|
+
+ std::to_string(PREAMBLE_LONGS_NON_EMPTY)
|
|
623
|
+
+ " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
}
|
|
627
|
+
|
|
628
|
+
template<typename T, typename S, typename A>
|
|
629
|
+
void var_opt_union<T,S,A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
|
|
630
|
+
if (family_id == FAMILY_ID) {
|
|
631
|
+
if (ser_ver != SER_VER) {
|
|
632
|
+
throw std::invalid_argument("Possible corruption: VarOpt Union serialization version must be "
|
|
633
|
+
+ std::to_string(SER_VER) + ". Found: " + std::to_string(ser_ver));
|
|
634
|
+
}
|
|
635
|
+
return;
|
|
636
|
+
}
|
|
637
|
+
// TODO: extend to handle reservoir sampling
|
|
638
|
+
|
|
639
|
+
throw std::invalid_argument("Possible corruption: VarOpt Union family id must be "
|
|
640
|
+
+ std::to_string(FAMILY_ID) + ". Found: " + std::to_string(family_id));
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
} // namespace datasketches
|
|
644
|
+
|
|
645
|
+
#endif // _VAR_OPT_UNION_IMPL_HPP_
|