datasketches 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
@@ -0,0 +1,239 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _VAR_OPT_UNION_HPP_
|
21
|
+
#define _VAR_OPT_UNION_HPP_
|
22
|
+
|
23
|
+
#include "var_opt_sketch.hpp"
|
24
|
+
#include "common_defs.hpp"
|
25
|
+
#include "serde.hpp"
|
26
|
+
|
27
|
+
#include <vector>
|
28
|
+
|
29
|
+
namespace datasketches {
|
30
|
+
|
31
|
+
// Allocator type obtained by rebinding the allocator A to uint8_t elements.
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
32
|
+
|
33
|
+
/**
|
34
|
+
* Provides a unioning operation over var_opt_sketch objects. This union allows
|
35
|
+
* the sample size k to float, possibly increasing or decreasing as warranted by
|
36
|
+
* the available data.
|
37
|
+
*
|
38
|
+
* The union currently allows serialization and deserialization, even though transporting
|
39
|
+
* union objects seems to be an anti-pattern with most sketches. We currently provide it here
|
40
|
+
* because the get_result() call may need to discard samples and decrease k in order to
|
41
|
+
* return a valid sketch, even if future calls to update() would allow k to remain larger.
|
42
|
+
*
|
43
|
+
* The (de)serialization methods may be deprecated and subsequently removed in future versions.
|
44
|
+
*
|
45
|
+
* author Kevin Lang
|
46
|
+
* author Jon Malkin
|
47
|
+
*/
|
48
|
+
template <typename T, typename S = serde<T>, typename A = std::allocator<T>>
|
49
|
+
class var_opt_union {
|
50
|
+
|
51
|
+
public:
|
52
|
+
static const uint32_t MAX_K = ((uint32_t) 1 << 31) - 2;
|
53
|
+
|
54
|
+
explicit var_opt_union(uint32_t max_k);
|
55
|
+
var_opt_union(const var_opt_union& other);
|
56
|
+
var_opt_union(var_opt_union&& other) noexcept;
|
57
|
+
|
58
|
+
~var_opt_union();
|
59
|
+
|
60
|
+
var_opt_union& operator=(const var_opt_union& other);
|
61
|
+
var_opt_union& operator=(var_opt_union&& other);
|
62
|
+
|
63
|
+
/**
|
64
|
+
* Updates this union with the given sketch
|
65
|
+
* This method takes an lvalue.
|
66
|
+
* @param sk a sketch to add to the union
|
67
|
+
*/
|
68
|
+
void update(const var_opt_sketch<T,S,A>& sk);
|
69
|
+
|
70
|
+
/**
|
71
|
+
* Updates this union with the given sketch
|
72
|
+
* This method takes an rvalue.
|
73
|
+
* @param sk a sketch to add to the union
|
74
|
+
*/
|
75
|
+
void update(var_opt_sketch<T,S,A>&& sk);
|
76
|
+
|
77
|
+
/**
|
78
|
+
* Gets the varopt sketch resulting from the union of any input sketches.
|
79
|
+
* @return a varopt sketch
|
80
|
+
*/
|
81
|
+
var_opt_sketch<T,S,A> get_result() const;
|
82
|
+
|
83
|
+
/**
|
84
|
+
* Resets the union to its default, empty state.
|
85
|
+
*/
|
86
|
+
void reset();
|
87
|
+
|
88
|
+
/**
|
89
|
+
* Computes size needed to serialize the current state of the union.
|
90
|
+
* This version is for all other types and can be expensive since every item needs to be looked at.
|
91
|
+
* @return size in bytes needed to serialize this sketch
|
92
|
+
*/
|
93
|
+
size_t get_serialized_size_bytes() const;
|
94
|
+
|
95
|
+
// This is a convenience alias for users
|
96
|
+
// The type returned by the following serialize method
|
97
|
+
typedef vector_u8<A> vector_bytes;
|
98
|
+
|
99
|
+
/**
|
100
|
+
* NOTE: This method may be deprecated in a future version.
|
101
|
+
* This method serializes the sketch as a vector of bytes.
|
102
|
+
* An optional header can be reserved in front of the sketch.
|
103
|
+
* It is a blank space of a given size.
|
104
|
+
* This header is used in Datasketches PostgreSQL extension.
|
105
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
106
|
+
*/
|
107
|
+
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
108
|
+
|
109
|
+
/**
|
110
|
+
* NOTE: This method may be deprecated in a future version.
|
111
|
+
* This method serializes the sketch into a given stream in a binary form
|
112
|
+
* @param os output stream
|
113
|
+
*/
|
114
|
+
void serialize(std::ostream& os) const;
|
115
|
+
|
116
|
+
/**
|
117
|
+
* NOTE: This method may be deprecated in a future version.
|
118
|
+
* This method deserializes a union from a given stream.
|
119
|
+
* @param is input stream
|
120
|
+
* @return an instance of a union
|
121
|
+
*/
|
122
|
+
static var_opt_union deserialize(std::istream& is);
|
123
|
+
|
124
|
+
/**
|
125
|
+
* NOTE: This method may be deprecated in a future version.
|
126
|
+
* This method deserializes a skeuniontch from a given array of bytes.
|
127
|
+
* @param bytes pointer to the array of bytes
|
128
|
+
* @param size the size of the array
|
129
|
+
* @return an instance of a union
|
130
|
+
*/
|
131
|
+
static var_opt_union deserialize(const void* bytes, size_t size);
|
132
|
+
|
133
|
+
/**
|
134
|
+
* Prints a summary of the union as a string.
|
135
|
+
* @return the summary as a string
|
136
|
+
*/
|
137
|
+
string<A> to_string() const;
|
138
|
+
|
139
|
+
|
140
|
+
private:
|
141
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<var_opt_sketch<T,S,A>> AllocSketch;
|
142
|
+
|
143
|
+
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
144
|
+
static const uint8_t PREAMBLE_LONGS_NON_EMPTY = 4;
|
145
|
+
static const uint8_t SER_VER = 2;
|
146
|
+
static const uint8_t FAMILY_ID = 14;
|
147
|
+
static const uint8_t EMPTY_FLAG_MASK = 4;
|
148
|
+
|
149
|
+
uint64_t n_; // cumulative over all input sketches
|
150
|
+
|
151
|
+
// outer tau is the largest tau of any input sketch
|
152
|
+
double outer_tau_numer_; // total weight of all input R-zones where tau = outer_tau
|
153
|
+
|
154
|
+
// total cardinality of the same R-zones, or zero if no input sketch was in estimation mode
|
155
|
+
uint64_t outer_tau_denom_;
|
156
|
+
|
157
|
+
uint32_t max_k_;
|
158
|
+
|
159
|
+
var_opt_sketch<T,S,A> gadget_;
|
160
|
+
|
161
|
+
var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
162
|
+
uint32_t max_k, var_opt_sketch<T,S,A>&& gadget);
|
163
|
+
|
164
|
+
/*
|
165
|
+
IMPORTANT NOTE: the "gadget" in the union object appears to be a varopt sketch,
|
166
|
+
but in fact is NOT because it doesn't satisfy the mathematical definition
|
167
|
+
of a varopt sketch of the concatenated input streams. Therefore it could be different
|
168
|
+
from a true varopt sketch with that value of K, in which case it could easily provide
|
169
|
+
worse estimation accuracy for subset-sum queries.
|
170
|
+
|
171
|
+
This should not surprise you; the approximation guarantees of varopt sketches
|
172
|
+
do not apply to things that merely resemble varopt sketches.
|
173
|
+
|
174
|
+
However, even though the gadget is not a varopt sketch, the result
|
175
|
+
of the unioning process IS a varopt sketch. It is constructed by a
|
176
|
+
somewhat complicated "resolution" process which determines the largest K
|
177
|
+
that a valid varopt sketch could have given the available information,
|
178
|
+
then constructs a varopt sketch of that size and returns it.
|
179
|
+
|
180
|
+
However, the gadget itself is not touched during the resolution process,
|
181
|
+
and additional sketches could subsequently be merged into the union,
|
182
|
+
at which point a varopt result could again be requested.
|
183
|
+
*/
|
184
|
+
|
185
|
+
/*
|
186
|
+
Explanation of "marked items" in the union's gadget:
|
187
|
+
|
188
|
+
The boolean value "true" in an pair indicates that the item
|
189
|
+
came from an input sketch's R zone, so it is already the result of sampling.
|
190
|
+
|
191
|
+
Therefore it must not wind up in the H zone of the final result, because
|
192
|
+
that would imply that the item is "exact".
|
193
|
+
|
194
|
+
However, it is okay for a marked item to hang out in the gadget's H zone for a while.
|
195
|
+
|
196
|
+
And once the item has moved to the gadget's R zone, the mark is never checked again,
|
197
|
+
so no effort is made to ensure that its value is preserved or even makes sense.
|
198
|
+
*/
|
199
|
+
|
200
|
+
/*
|
201
|
+
Note: if the computer could perform exact real-valued arithmetic, the union could finalize
|
202
|
+
its result by reducing k until inner_tau > outer_tau. [Due to the vagaries of floating point
|
203
|
+
arithmetic, we won't attempt to detect and specially handle the inner_tau = outer_tau special
|
204
|
+
case.]
|
205
|
+
|
206
|
+
In fact, we won't even look at tau while while reducing k. Instead the logic will be based
|
207
|
+
on the more robust integer quantity num_marks_in_h_ in the gadget. It is conceivable that due
|
208
|
+
to round-off error we could end up with inner_tau slightly less than outer_tau, but that should
|
209
|
+
be fairly harmless since we will have achieved our goal of getting the marked items out of H.
|
210
|
+
|
211
|
+
Also, you might be wondering why we are bothering to maintain the numerator and denominator
|
212
|
+
separately instead of just having a single variable outer_tau. This allows us (in certain
|
213
|
+
cases) to add an input's entire R-zone weight into the result sketch, as opposed to subdividing
|
214
|
+
it then adding it back up. That would be a source of numerical inaccuracy. And even
|
215
|
+
more importantly, this design choice allows us to exactly re-construct the input sketch
|
216
|
+
when there is only one of them.
|
217
|
+
*/
|
218
|
+
inline void merge_items(const var_opt_sketch<T,S,A>& sk);
|
219
|
+
inline void merge_items(var_opt_sketch<T,S,A>&& sk);
|
220
|
+
inline void resolve_tau(const var_opt_sketch<T,S,A>& sketch);
|
221
|
+
|
222
|
+
double get_outer_tau() const;
|
223
|
+
|
224
|
+
var_opt_sketch<T,S,A> simple_gadget_coercer() const;
|
225
|
+
|
226
|
+
bool there_exist_unmarked_h_items_lighter_than_target(double threshold) const;
|
227
|
+
bool detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const;
|
228
|
+
void mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const;
|
229
|
+
void migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& sk) const;
|
230
|
+
|
231
|
+
static void check_preamble_longs(uint8_t preamble_longs, uint8_t flags);
|
232
|
+
static void check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver);
|
233
|
+
};
|
234
|
+
|
235
|
+
}
|
236
|
+
|
237
|
+
#include "var_opt_union_impl.hpp"
|
238
|
+
|
239
|
+
#endif // _VAR_OPT_UNION_HPP_
|
@@ -0,0 +1,645 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _VAR_OPT_UNION_IMPL_HPP_
|
21
|
+
#define _VAR_OPT_UNION_IMPL_HPP_
|
22
|
+
|
23
|
+
#include "var_opt_union.hpp"
|
24
|
+
|
25
|
+
#include <cmath>
|
26
|
+
#include <sstream>
|
27
|
+
|
28
|
+
namespace datasketches {
|
29
|
+
|
30
|
+
template<typename T, typename S, typename A>
|
31
|
+
var_opt_union<T,S,A>::var_opt_union(uint32_t max_k) :
|
32
|
+
n_(0),
|
33
|
+
outer_tau_numer_(0),
|
34
|
+
outer_tau_denom_(0.0),
|
35
|
+
max_k_(max_k),
|
36
|
+
gadget_(max_k, var_opt_sketch<T,S,A>::DEFAULT_RESIZE_FACTOR, true)
|
37
|
+
{}
|
38
|
+
|
39
|
+
template<typename T, typename S, typename A>
|
40
|
+
var_opt_union<T,S,A>::var_opt_union(const var_opt_union& other) :
|
41
|
+
n_(other.n_),
|
42
|
+
outer_tau_numer_(other.outer_tau_numer_),
|
43
|
+
outer_tau_denom_(other.outer_tau_denom_),
|
44
|
+
max_k_(other.max_k_),
|
45
|
+
gadget_(other.gadget_)
|
46
|
+
{}
|
47
|
+
|
48
|
+
template<typename T, typename S, typename A>
|
49
|
+
var_opt_union<T,S,A>::var_opt_union(var_opt_union&& other) noexcept :
|
50
|
+
n_(other.n_),
|
51
|
+
outer_tau_numer_(other.outer_tau_numer_),
|
52
|
+
outer_tau_denom_(other.outer_tau_denom_),
|
53
|
+
max_k_(other.max_k_),
|
54
|
+
gadget_(std::move(other.gadget_))
|
55
|
+
{}
|
56
|
+
|
57
|
+
template<typename T, typename S, typename A>
|
58
|
+
var_opt_union<T,S,A>::var_opt_union(uint64_t n, double outer_tau_numer, uint64_t outer_tau_denom,
|
59
|
+
uint32_t max_k, var_opt_sketch<T,S,A>&& gadget) :
|
60
|
+
n_(n),
|
61
|
+
outer_tau_numer_(outer_tau_numer),
|
62
|
+
outer_tau_denom_(outer_tau_denom),
|
63
|
+
max_k_(max_k),
|
64
|
+
gadget_(gadget)
|
65
|
+
{}
|
66
|
+
|
67
|
+
template<typename T, typename S, typename A>
|
68
|
+
var_opt_union<T,S,A>::~var_opt_union() {}
|
69
|
+
|
70
|
+
template<typename T, typename S, typename A>
|
71
|
+
var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(const var_opt_union& other) {
|
72
|
+
var_opt_union<T,S,A> union_copy(other);
|
73
|
+
std::swap(n_, union_copy.n_);
|
74
|
+
std::swap(outer_tau_numer_, union_copy.outer_tau_numer_);
|
75
|
+
std::swap(outer_tau_denom_, union_copy.outer_tau_denom_);
|
76
|
+
std::swap(max_k_, union_copy.max_k_);
|
77
|
+
std::swap(gadget_, union_copy.gadget_);
|
78
|
+
return *this;
|
79
|
+
}
|
80
|
+
|
81
|
+
template<typename T, typename S, typename A>
|
82
|
+
var_opt_union<T,S,A>& var_opt_union<T,S,A>::operator=(var_opt_union&& other) {
|
83
|
+
std::swap(n_, other.n_);
|
84
|
+
std::swap(outer_tau_numer_, other.outer_tau_numer_);
|
85
|
+
std::swap(outer_tau_denom_, other.outer_tau_denom_);
|
86
|
+
std::swap(max_k_, other.max_k_);
|
87
|
+
std::swap(gadget_, other.gadget_);
|
88
|
+
return *this;
|
89
|
+
}
|
90
|
+
|
91
|
+
/*
 * An empty union requires 8 bytes.
 *
 * <pre>
 * Long || Start Byte Adr:
 * Adr:
 *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |
 *  0   || Preamble_Longs | SerVer | FamID  |  Flags |---------Max Res. Size (K)---------|
 * </pre>
 *
 * A non-empty sketch requires 24 bytes of preamble for an under-full sample; once there are
 * at least k items the sketch uses 32 bytes of preamble.
 *
 * The count of items seen is limited to 48 bits (~256 trillion) even though there are adjacent
 * unused preamble bits. The acceptance probability for an item is a double in the range [0,1),
 * limiting us to 53 bits of randomness due to details of the IEEE floating point format. To
 * ensure meaningful probabilities as the items seen count approaches capacity, we intentionally
 * use slightly fewer bits.
 *
 * Following the header are weights for the heavy items, then marks in the event this is a gadget.
 * The serialized items come last.
 *
 * <pre>
 * Long || Start Byte Adr:
 * Adr:
 *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |
 *  0   || Preamble_Longs | SerVer | FamID  |  Flags |---------Max Res. Size (K)---------|
 *
 *      ||       8        |    9   |   10   |   11   |   12   |   13   |   14   |   15   |
 *  1   ||---------------------------Items Seen Count (N)--------------------------------|
 *
 *      ||      16        |   17   |   18   |   19   |   20   |   21   |   22   |   23   |
 *  2   ||------------------------Outer Tau Numerator (double)---------------------------|
 *
 *      ||      24        |   25   |   26   |   27   |   28   |   29   |   30   |   31   |
 *  3   ||----------------------Outer Tau Denominator (uint64_t)-------------------------|
 * </pre>
 */
|
129
|
+
|
130
|
+
template<typename T, typename S, typename A>
|
131
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(std::istream& is) {
|
132
|
+
uint8_t preamble_longs;
|
133
|
+
is.read((char*)&preamble_longs, sizeof(preamble_longs));
|
134
|
+
uint8_t serial_version;
|
135
|
+
is.read((char*)&serial_version, sizeof(serial_version));
|
136
|
+
uint8_t family_id;
|
137
|
+
is.read((char*)&family_id, sizeof(family_id));
|
138
|
+
uint8_t flags;
|
139
|
+
is.read((char*)&flags, sizeof(flags));
|
140
|
+
uint32_t max_k;
|
141
|
+
is.read((char*)&max_k, sizeof(max_k));
|
142
|
+
|
143
|
+
check_preamble_longs(preamble_longs, flags);
|
144
|
+
check_family_and_serialization_version(family_id, serial_version);
|
145
|
+
|
146
|
+
if (max_k == 0 || max_k > MAX_K) {
|
147
|
+
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
148
|
+
}
|
149
|
+
|
150
|
+
bool is_empty = flags & EMPTY_FLAG_MASK;
|
151
|
+
|
152
|
+
if (is_empty) {
|
153
|
+
if (!is.good())
|
154
|
+
throw std::runtime_error("error reading from std::istream");
|
155
|
+
else
|
156
|
+
return var_opt_union<T,S,A>(max_k);
|
157
|
+
}
|
158
|
+
|
159
|
+
uint64_t items_seen;
|
160
|
+
is.read((char*)&items_seen, sizeof(items_seen));
|
161
|
+
double outer_tau_numer;
|
162
|
+
is.read((char*)&outer_tau_numer, sizeof(outer_tau_numer));
|
163
|
+
uint64_t outer_tau_denom;
|
164
|
+
is.read((char*)&outer_tau_denom, sizeof(outer_tau_denom));
|
165
|
+
|
166
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(is);
|
167
|
+
|
168
|
+
if (!is.good())
|
169
|
+
throw std::runtime_error("error reading from std::istream");
|
170
|
+
|
171
|
+
return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
172
|
+
}
|
173
|
+
|
174
|
+
template<typename T, typename S, typename A>
|
175
|
+
var_opt_union<T,S,A> var_opt_union<T,S,A>::deserialize(const void* bytes, size_t size) {
|
176
|
+
ensure_minimum_memory(size, 8);
|
177
|
+
const char* ptr = static_cast<const char*>(bytes);
|
178
|
+
uint8_t preamble_longs;
|
179
|
+
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
|
180
|
+
uint8_t serial_version;
|
181
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
|
182
|
+
uint8_t family_id;
|
183
|
+
ptr += copy_from_mem(ptr, &family_id, sizeof(family_id));
|
184
|
+
uint8_t flags;
|
185
|
+
ptr += copy_from_mem(ptr, &flags, sizeof(flags));
|
186
|
+
uint32_t max_k;
|
187
|
+
ptr += copy_from_mem(ptr, &max_k, sizeof(max_k));
|
188
|
+
|
189
|
+
check_preamble_longs(preamble_longs, flags);
|
190
|
+
check_family_and_serialization_version(family_id, serial_version);
|
191
|
+
|
192
|
+
if (max_k == 0 || max_k > MAX_K) {
|
193
|
+
throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
|
194
|
+
}
|
195
|
+
|
196
|
+
bool is_empty = flags & EMPTY_FLAG_MASK;
|
197
|
+
|
198
|
+
if (is_empty) {
|
199
|
+
return var_opt_union<T,S,A>(max_k);
|
200
|
+
}
|
201
|
+
|
202
|
+
uint64_t items_seen;
|
203
|
+
ptr += copy_from_mem(ptr, &items_seen, sizeof(items_seen));
|
204
|
+
double outer_tau_numer;
|
205
|
+
ptr += copy_from_mem(ptr, &outer_tau_numer, sizeof(outer_tau_numer));
|
206
|
+
uint64_t outer_tau_denom;
|
207
|
+
ptr += copy_from_mem(ptr, &outer_tau_denom, sizeof(outer_tau_denom));
|
208
|
+
|
209
|
+
const size_t gadget_size = size - (PREAMBLE_LONGS_NON_EMPTY << 3);
|
210
|
+
var_opt_sketch<T,S,A> gadget = var_opt_sketch<T,S,A>::deserialize(ptr, gadget_size);
|
211
|
+
|
212
|
+
return var_opt_union<T,S,A>(items_seen, outer_tau_numer, outer_tau_denom, max_k, std::move(gadget));
|
213
|
+
}
|
214
|
+
|
215
|
+
template<typename T, typename S, typename A>
|
216
|
+
size_t var_opt_union<T,S,A>::get_serialized_size_bytes() const {
|
217
|
+
if (n_ == 0) {
|
218
|
+
return PREAMBLE_LONGS_EMPTY << 3;
|
219
|
+
} else {
|
220
|
+
return (PREAMBLE_LONGS_NON_EMPTY << 3) + gadget_.get_serialized_size_bytes();
|
221
|
+
}
|
222
|
+
}
|
223
|
+
|
224
|
+
template<typename T, typename S, typename A>
|
225
|
+
void var_opt_union<T,S,A>::serialize(std::ostream& os) const {
|
226
|
+
bool empty = (n_ == 0);
|
227
|
+
|
228
|
+
const uint8_t serialization_version(SER_VER);
|
229
|
+
const uint8_t family_id(FAMILY_ID);
|
230
|
+
|
231
|
+
uint8_t preamble_longs;
|
232
|
+
uint8_t flags;
|
233
|
+
if (empty) {
|
234
|
+
preamble_longs = PREAMBLE_LONGS_EMPTY;
|
235
|
+
flags = EMPTY_FLAG_MASK;
|
236
|
+
} else {
|
237
|
+
preamble_longs = PREAMBLE_LONGS_NON_EMPTY;
|
238
|
+
flags = 0;
|
239
|
+
}
|
240
|
+
|
241
|
+
os.write((char*) &preamble_longs, sizeof(uint8_t));
|
242
|
+
os.write((char*) &serialization_version, sizeof(uint8_t));
|
243
|
+
os.write((char*) &family_id, sizeof(uint8_t));
|
244
|
+
os.write((char*) &flags, sizeof(uint8_t));
|
245
|
+
os.write((char*) &max_k_, sizeof(uint32_t));
|
246
|
+
|
247
|
+
if (!empty) {
|
248
|
+
os.write((char*) &n_, sizeof(uint64_t));
|
249
|
+
os.write((char*) &outer_tau_numer_, sizeof(double));
|
250
|
+
os.write((char*) &outer_tau_denom_, sizeof(uint64_t));
|
251
|
+
gadget_.serialize(os);
|
252
|
+
}
|
253
|
+
}
|
254
|
+
|
255
|
+
template<typename T, typename S, typename A>
|
256
|
+
std::vector<uint8_t, AllocU8<A>> var_opt_union<T,S,A>::serialize(unsigned header_size_bytes) const {
|
257
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
258
|
+
std::vector<uint8_t, AllocU8<A>> bytes(size);
|
259
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
260
|
+
|
261
|
+
const bool empty = n_ == 0;
|
262
|
+
|
263
|
+
const uint8_t serialization_version(SER_VER);
|
264
|
+
const uint8_t family_id(FAMILY_ID);
|
265
|
+
|
266
|
+
uint8_t preamble_longs;
|
267
|
+
uint8_t flags;
|
268
|
+
|
269
|
+
if (empty) {
|
270
|
+
preamble_longs = PREAMBLE_LONGS_EMPTY;
|
271
|
+
flags = EMPTY_FLAG_MASK;
|
272
|
+
} else {
|
273
|
+
preamble_longs = PREAMBLE_LONGS_NON_EMPTY;
|
274
|
+
flags = 0;
|
275
|
+
}
|
276
|
+
|
277
|
+
// first prelong
|
278
|
+
ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
|
279
|
+
ptr += copy_to_mem(&serialization_version, ptr, sizeof(uint8_t));
|
280
|
+
ptr += copy_to_mem(&family_id, ptr, sizeof(uint8_t));
|
281
|
+
ptr += copy_to_mem(&flags, ptr, sizeof(uint8_t));
|
282
|
+
ptr += copy_to_mem(&max_k_, ptr, sizeof(uint32_t));
|
283
|
+
|
284
|
+
if (!empty) {
|
285
|
+
ptr += copy_to_mem(&n_, ptr, sizeof(uint64_t));
|
286
|
+
ptr += copy_to_mem(&outer_tau_numer_, ptr, sizeof(double));
|
287
|
+
ptr += copy_to_mem(&outer_tau_denom_, ptr, sizeof(uint64_t));
|
288
|
+
|
289
|
+
auto gadget_bytes = gadget_.serialize();
|
290
|
+
ptr += copy_to_mem(gadget_bytes.data(), ptr, gadget_bytes.size() * sizeof(uint8_t));
|
291
|
+
}
|
292
|
+
|
293
|
+
return bytes;
|
294
|
+
}
|
295
|
+
|
296
|
+
template<typename T, typename S, typename A>
|
297
|
+
void var_opt_union<T,S,A>::reset() {
|
298
|
+
n_ = 0;
|
299
|
+
outer_tau_numer_ = 0.0;
|
300
|
+
outer_tau_denom_ = 0;
|
301
|
+
gadget_.reset();
|
302
|
+
}
|
303
|
+
|
304
|
+
template<typename T, typename S, typename A>
|
305
|
+
string<A> var_opt_union<T,S,A>::to_string() const {
|
306
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
307
|
+
os << "### VarOpt Union SUMMARY: " << std::endl;
|
308
|
+
os << " . n : " << n_ << std::endl;
|
309
|
+
os << " Max k : " << max_k_ << std::endl;
|
310
|
+
os << " Gadget Summary: " << std::endl;
|
311
|
+
os << gadget_.to_string();
|
312
|
+
os << "### END VarOpt Union SUMMARY: " << std::endl;
|
313
|
+
return os.str();
|
314
|
+
}
|
315
|
+
|
316
|
+
template<typename T, typename S, typename A>
|
317
|
+
void var_opt_union<T,S,A>::update(const var_opt_sketch<T,S,A>& sk) {
|
318
|
+
merge_items(sk);
|
319
|
+
resolve_tau(sk);
|
320
|
+
}
|
321
|
+
|
322
|
+
template<typename T, typename S, typename A>
|
323
|
+
void var_opt_union<T,S,A>::update(var_opt_sketch<T,S,A>&& sk) {
|
324
|
+
merge_items(std::move(sk));
|
325
|
+
resolve_tau(sk); // don't need items, so ok even if they've been moved out
|
326
|
+
}
|
327
|
+
|
328
|
+
template<typename T, typename S, typename A>
|
329
|
+
double var_opt_union<T,S,A>::get_outer_tau() const {
|
330
|
+
if (outer_tau_denom_ == 0) {
|
331
|
+
return 0.0;
|
332
|
+
} else {
|
333
|
+
return outer_tau_numer_ / outer_tau_denom_;
|
334
|
+
}
|
335
|
+
}
|
336
|
+
|
337
|
+
template<typename T, typename S, typename A>
|
338
|
+
void var_opt_union<T,S,A>::merge_items(const var_opt_sketch<T,S,A>& sketch) {
|
339
|
+
if (sketch.n_ == 0) {
|
340
|
+
return;
|
341
|
+
}
|
342
|
+
|
343
|
+
n_ += sketch.n_;
|
344
|
+
|
345
|
+
// H region const_iterator
|
346
|
+
typename var_opt_sketch<T,S,A>::const_iterator h_itr(sketch, false, false);
|
347
|
+
typename var_opt_sketch<T,S,A>::const_iterator h_end(sketch, true, false);
|
348
|
+
while (h_itr != h_end) {
|
349
|
+
std::pair<const T&, const double> sample = *h_itr;
|
350
|
+
gadget_.update(sample.first, sample.second, false);
|
351
|
+
++h_itr;
|
352
|
+
}
|
353
|
+
|
354
|
+
// Weight-correcting R region iterator (const_iterator doesn't do the correction)
|
355
|
+
typename var_opt_sketch<T,S,A>::iterator r_itr(sketch, false, true);
|
356
|
+
typename var_opt_sketch<T,S,A>::iterator r_end(sketch, true, true);
|
357
|
+
while (r_itr != r_end) {
|
358
|
+
std::pair<const T&, const double> sample = *r_itr;
|
359
|
+
gadget_.update(sample.first, sample.second, true);
|
360
|
+
++r_itr;
|
361
|
+
}
|
362
|
+
}
|
363
|
+
|
364
|
+
template<typename T, typename S, typename A>
|
365
|
+
void var_opt_union<T,S,A>::merge_items(var_opt_sketch<T,S,A>&& sketch) {
|
366
|
+
if (sketch.n_ == 0) {
|
367
|
+
return;
|
368
|
+
}
|
369
|
+
|
370
|
+
n_ += sketch.n_;
|
371
|
+
|
372
|
+
// H region iterator
|
373
|
+
typename var_opt_sketch<T,S,A>::iterator h_itr(sketch, false, false);
|
374
|
+
typename var_opt_sketch<T,S,A>::iterator h_end(sketch, true, false);
|
375
|
+
while (h_itr != h_end) {
|
376
|
+
std::pair<T&, double> sample = *h_itr;
|
377
|
+
gadget_.update(std::move(sample.first), sample.second, false);
|
378
|
+
++h_itr;
|
379
|
+
}
|
380
|
+
|
381
|
+
// Weight-correcting R region iterator
|
382
|
+
typename var_opt_sketch<T,S,A>::iterator r_itr(sketch, false, true);
|
383
|
+
typename var_opt_sketch<T,S,A>::iterator r_end(sketch, true, true);
|
384
|
+
while (r_itr != r_end) {
|
385
|
+
std::pair<T&, double> sample = *r_itr;
|
386
|
+
gadget_.update(std::move(sample.first), sample.second, true);
|
387
|
+
++r_itr;
|
388
|
+
}
|
389
|
+
}
|
390
|
+
|
391
|
+
template<typename T, typename S, typename A>
|
392
|
+
void var_opt_union<T,S,A>::resolve_tau(const var_opt_sketch<T,S,A>& sketch) {
|
393
|
+
if (sketch.r_ > 0) {
|
394
|
+
const double sketch_tau = sketch.get_tau();
|
395
|
+
const double outer_tau = get_outer_tau();
|
396
|
+
|
397
|
+
if (outer_tau_denom_ == 0) {
|
398
|
+
// detect first estimation mode sketch and grab its tau
|
399
|
+
outer_tau_numer_ = sketch.total_wt_r_;
|
400
|
+
outer_tau_denom_ = sketch.r_;
|
401
|
+
} else if (sketch_tau > outer_tau) {
|
402
|
+
// switch to a bigger value of outer_tau
|
403
|
+
outer_tau_numer_ = sketch.total_wt_r_;
|
404
|
+
outer_tau_denom_ = sketch.r_;
|
405
|
+
} else if (sketch_tau == outer_tau) {
|
406
|
+
// Ok if previous equality test isn't quite perfect. Mistakes in either direction should
|
407
|
+
// be fairly benign.
|
408
|
+
// Without conceptually changing outer_tau, update number and denominator. In particular,
|
409
|
+
// add the total weight of the incoming reservoir to the running total.
|
410
|
+
outer_tau_numer_ += sketch.total_wt_r_;
|
411
|
+
outer_tau_denom_ += sketch.r_;
|
412
|
+
}
|
413
|
+
|
414
|
+
// do nothing if sketch's tau is smaller than outer_tau
|
415
|
+
}
|
416
|
+
}
|
417
|
+
|
418
|
+
template<typename T, typename S, typename A>
|
419
|
+
var_opt_sketch<T,S,A> var_opt_union<T,S,A>::get_result() const {
|
420
|
+
// If no marked items in H, gadget is already valid mathematically. We can return what is
|
421
|
+
// basically just a copy of the gadget.
|
422
|
+
if (gadget_.num_marks_in_h_ == 0) {
|
423
|
+
return simple_gadget_coercer();
|
424
|
+
} else {
|
425
|
+
// Copy of gadget. This may produce needless copying in the
|
426
|
+
// pseudo-exact case below, but should simplify the code without
|
427
|
+
// needing to make the gadget a pointer
|
428
|
+
var_opt_sketch<T,S,A> gcopy(gadget_, false, n_);
|
429
|
+
|
430
|
+
// At this point, we know that marked items are present in H. So:
|
431
|
+
// 1. Result will necessarily be in estimation mode
|
432
|
+
// 2. Marked items currently in H need to be absorbed into reservoir (R)
|
433
|
+
const bool is_pseudo_exact = detect_and_handle_subcase_of_pseudo_exact(gcopy);
|
434
|
+
if (!is_pseudo_exact) {
|
435
|
+
// continue with main logic
|
436
|
+
migrate_marked_items_by_decreasing_k(gcopy);
|
437
|
+
}
|
438
|
+
// sub-case was already detected and handled, so return the result
|
439
|
+
return gcopy;
|
440
|
+
}
|
441
|
+
}
|
442
|
+
|
443
|
+
/**
|
444
|
+
* When there are no marked items in H, the gadget is mathematically equivalent to a valid
|
445
|
+
* varopt sketch. This method simply returns a copy (without perserving marks).
|
446
|
+
*
|
447
|
+
* @return A shallow copy of the gadget as valid varopt sketch
|
448
|
+
*/
|
449
|
+
template<typename T, typename S, typename A>
|
450
|
+
var_opt_sketch<T,S,A> var_opt_union<T,S,A>::simple_gadget_coercer() const {
|
451
|
+
if (gadget_.num_marks_in_h_ != 0) throw std::logic_error("simple gadget coercer only applies if no marks");
|
452
|
+
return var_opt_sketch<T,S,A>(gadget_, true, n_);
|
453
|
+
}
|
454
|
+
|
455
|
+
// this is a condition checked in detect_and_handle_subcase_of_pseudo_exact()
|
456
|
+
template<typename T, typename S, typename A>
|
457
|
+
bool var_opt_union<T,S,A>::there_exist_unmarked_h_items_lighter_than_target(double threshold) const {
|
458
|
+
for (uint32_t i = 0; i < gadget_.h_; ++i) {
|
459
|
+
if ((gadget_.weights_[i] < threshold) && !gadget_.marks_[i]) {
|
460
|
+
return true;
|
461
|
+
}
|
462
|
+
}
|
463
|
+
return false;
|
464
|
+
}
|
465
|
+
|
466
|
+
template<typename T, typename S, typename A>
|
467
|
+
bool var_opt_union<T,S,A>::detect_and_handle_subcase_of_pseudo_exact(var_opt_sketch<T,S,A>& sk) const {
|
468
|
+
// gadget is seemingly exact
|
469
|
+
const bool condition1 = gadget_.r_ == 0;
|
470
|
+
|
471
|
+
// but there are marked items in H, so only _pseudo_ exact
|
472
|
+
const bool condition2 = gadget_.num_marks_in_h_ > 0;
|
473
|
+
|
474
|
+
// if gadget is pseudo-exact and the number of marks equals outer_tau_denom, then we can deduce
|
475
|
+
// from the bookkeeping logic of resolve_tau() that all estimation mode input sketches must
|
476
|
+
// have had the same tau, so we can throw all of the marked items into a common reservoir.
|
477
|
+
const bool condition3 = gadget_.num_marks_in_h_ == outer_tau_denom_;
|
478
|
+
|
479
|
+
if (!(condition1 && condition2 && condition3)) {
|
480
|
+
return false;
|
481
|
+
} else {
|
482
|
+
|
483
|
+
// explicitly enforce rule that items in H should not be lighter than the sketch's tau
|
484
|
+
const bool anti_condition4 = there_exist_unmarked_h_items_lighter_than_target(gadget_.get_tau());
|
485
|
+
if (anti_condition4) {
|
486
|
+
return false;
|
487
|
+
} else {
|
488
|
+
// conditions 1 through 4 hold
|
489
|
+
mark_moving_gadget_coercer(sk);
|
490
|
+
return true;
|
491
|
+
}
|
492
|
+
}
|
493
|
+
}
|
494
|
+
|
495
|
+
/**
|
496
|
+
* This coercer directly transfers marked items from the gadget's H into the result's R.
|
497
|
+
* Deciding whether that is a valid thing to do is the responsibility of the caller. Currently,
|
498
|
+
* this is only used for a subcase of pseudo-exact, but later it might be used by other
|
499
|
+
* subcases as well.
|
500
|
+
*
|
501
|
+
* @param sk Copy of the gadget, modified with marked items moved to the reservoir
|
502
|
+
*/
|
503
|
+
template<typename T, typename S, typename A>
|
504
|
+
void var_opt_union<T,S,A>::mark_moving_gadget_coercer(var_opt_sketch<T,S,A>& sk) const {
|
505
|
+
const uint32_t result_k = gadget_.h_ + gadget_.r_;
|
506
|
+
|
507
|
+
uint32_t result_h = 0;
|
508
|
+
uint32_t result_r = 0;
|
509
|
+
size_t next_r_pos = result_k; // = (result_k+1)-1, to fill R region from back to front
|
510
|
+
|
511
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<double> AllocDouble;
|
512
|
+
double* wts = AllocDouble().allocate(result_k + 1);
|
513
|
+
T* data = A().allocate(result_k + 1);
|
514
|
+
|
515
|
+
// insert R region items, ignoring weights
|
516
|
+
// Currently (May 2017) this next block is unreachable; this coercer is used only in the
|
517
|
+
// pseudo-exact case in which case there are no items natively in R, only marked items in H
|
518
|
+
// that will be moved into R as part of the coercion process.
|
519
|
+
// Addedndum (Jan 2020): Cleanup at end of method assumes R count is 0
|
520
|
+
const size_t final_idx = gadget_.get_num_samples();
|
521
|
+
for (size_t idx = gadget_.h_ + 1; idx <= final_idx; ++idx) {
|
522
|
+
A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
|
523
|
+
wts[next_r_pos] = gadget_.weights_[idx];
|
524
|
+
++result_r;
|
525
|
+
--next_r_pos;
|
526
|
+
}
|
527
|
+
|
528
|
+
double transferred_weight = 0;
|
529
|
+
|
530
|
+
// insert H region items
|
531
|
+
for (size_t idx = 0; idx < gadget_.h_; ++idx) {
|
532
|
+
if (gadget_.marks_[idx]) {
|
533
|
+
A().construct(&data[next_r_pos], T(gadget_.data_[idx]));
|
534
|
+
wts[next_r_pos] = -1.0;
|
535
|
+
transferred_weight += gadget_.weights_[idx];
|
536
|
+
++result_r;
|
537
|
+
--next_r_pos;
|
538
|
+
} else {
|
539
|
+
A().construct(&data[result_h], T(gadget_.data_[idx]));
|
540
|
+
wts[result_h] = gadget_.weights_[idx];
|
541
|
+
++result_h;
|
542
|
+
}
|
543
|
+
}
|
544
|
+
|
545
|
+
if (result_h + result_r != result_k) throw std::logic_error("H + R counts must equal k");
|
546
|
+
if (fabs(transferred_weight - outer_tau_numer_) > 1e-10) {
|
547
|
+
throw std::logic_error("uexpected mismatch in transferred weight");
|
548
|
+
}
|
549
|
+
|
550
|
+
const double result_r_weight = gadget_.total_wt_r_ + transferred_weight;
|
551
|
+
const uint64_t result_n = n_;
|
552
|
+
|
553
|
+
// explicitly set weight value for the gap
|
554
|
+
wts[result_h] = -1.0;
|
555
|
+
|
556
|
+
// clean up arrays in input sketch, replace with new values
|
557
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<bool> AllocBool;
|
558
|
+
AllocBool().deallocate(sk.marks_, sk.curr_items_alloc_);
|
559
|
+
AllocDouble().deallocate(sk.weights_, sk.curr_items_alloc_);
|
560
|
+
for (size_t i = 0; i < result_k; ++i) { A().destroy(sk.data_ + i); } // assumes everything in H region, no gap
|
561
|
+
A().deallocate(sk.data_, sk.curr_items_alloc_);
|
562
|
+
|
563
|
+
sk.data_ = data;
|
564
|
+
sk.weights_ = wts;
|
565
|
+
sk.marks_ = nullptr;
|
566
|
+
sk.num_marks_in_h_ = 0;
|
567
|
+
sk.curr_items_alloc_ = result_k + 1;
|
568
|
+
sk.k_ = result_k;
|
569
|
+
sk.n_ = result_n;
|
570
|
+
sk.h_ = result_h;
|
571
|
+
sk.r_ = result_r;
|
572
|
+
sk.total_wt_r_ = result_r_weight;
|
573
|
+
}
|
574
|
+
|
575
|
+
// this is basically a continuation of get_result(), but modifying the input gadget copy
|
576
|
+
template<typename T, typename S, typename A>
|
577
|
+
void var_opt_union<T,S,A>::migrate_marked_items_by_decreasing_k(var_opt_sketch<T,S,A>& gcopy) const {
|
578
|
+
const uint32_t r_count = gcopy.r_;
|
579
|
+
const uint32_t h_count = gcopy.h_;
|
580
|
+
const uint32_t k = gcopy.k_;
|
581
|
+
|
582
|
+
// should be ensured by caller
|
583
|
+
if (gcopy.num_marks_in_h_ == 0) throw std::logic_error("unexpectedly found no marked items to migrate");
|
584
|
+
// either full (of samples), in pseudo-exact mode, or both
|
585
|
+
if ((r_count != 0) && ((h_count + r_count) != k)) throw std::logic_error("invalid gadget state");
|
586
|
+
|
587
|
+
// if non-full and pseudo-exact, change k so that gcopy is full
|
588
|
+
if ((r_count == 0) && (h_count < k)) {
|
589
|
+
gcopy.k_ = h_count; // may leve extra space allocated but that's ok
|
590
|
+
}
|
591
|
+
|
592
|
+
// Now k equals the number of samples, so reducing k will increase tau.
|
593
|
+
// Also, we know that there are at least 2 samples because 0 or 1 would have been handled
|
594
|
+
// by the earlier logic in get_result()
|
595
|
+
gcopy.decrease_k_by_1();
|
596
|
+
|
597
|
+
// gcopy is now in estimation mode, just like the final result must be (due to marked items)
|
598
|
+
if (gcopy.get_tau() == 0.0) throw std::logic_error("gadget must be in sampling mode");
|
599
|
+
|
600
|
+
// keep reducing k until all marked items have been absorbed into the reservoir
|
601
|
+
while (gcopy.num_marks_in_h_ > 0) {
|
602
|
+
// gcopy.k_ >= 2 because h_ and r_ are both at least 1, but checked in next method anyway
|
603
|
+
gcopy.decrease_k_by_1();
|
604
|
+
}
|
605
|
+
|
606
|
+
gcopy.strip_marks();
|
607
|
+
}
|
608
|
+
|
609
|
+
template<typename T, typename S, typename A>
|
610
|
+
void var_opt_union<T,S,A>::check_preamble_longs(uint8_t preamble_longs, uint8_t flags) {
|
611
|
+
bool is_empty(flags & EMPTY_FLAG_MASK);
|
612
|
+
|
613
|
+
if (is_empty) {
|
614
|
+
if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
|
615
|
+
throw std::invalid_argument("Possible corruption: Preamble longs must be "
|
616
|
+
+ std::to_string(PREAMBLE_LONGS_EMPTY) + " for an empty sketch. Found: "
|
617
|
+
+ std::to_string(preamble_longs));
|
618
|
+
}
|
619
|
+
} else {
|
620
|
+
if (preamble_longs != PREAMBLE_LONGS_NON_EMPTY) {
|
621
|
+
throw std::invalid_argument("Possible corruption: Preamble longs must be "
|
622
|
+
+ std::to_string(PREAMBLE_LONGS_NON_EMPTY)
|
623
|
+
+ " for a non-empty sketch. Found: " + std::to_string(preamble_longs));
|
624
|
+
}
|
625
|
+
}
|
626
|
+
}
|
627
|
+
|
628
|
+
template<typename T, typename S, typename A>
|
629
|
+
void var_opt_union<T,S,A>::check_family_and_serialization_version(uint8_t family_id, uint8_t ser_ver) {
|
630
|
+
if (family_id == FAMILY_ID) {
|
631
|
+
if (ser_ver != SER_VER) {
|
632
|
+
throw std::invalid_argument("Possible corruption: VarOpt Union serialization version must be "
|
633
|
+
+ std::to_string(SER_VER) + ". Found: " + std::to_string(ser_ver));
|
634
|
+
}
|
635
|
+
return;
|
636
|
+
}
|
637
|
+
// TODO: extend to handle reservoir sampling
|
638
|
+
|
639
|
+
throw std::invalid_argument("Possible corruption: VarOpt Union family id must be "
|
640
|
+
+ std::to_string(FAMILY_ID) + ". Found: " + std::to_string(family_id));
|
641
|
+
}
|
642
|
+
|
643
|
+
} // namespace datasketches
|
644
|
+
|
645
|
+
#endif // _VAR_OPT_UNION_IMPL_HPP_
|