datasketches 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
@@ -0,0 +1,43 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _INTARRAYPAIRITERATOR_HPP_
|
21
|
+
#define _INTARRAYPAIRITERATOR_HPP_
|
22
|
+
|
23
|
+
namespace datasketches {
|
24
|
+
|
25
|
+
template<typename A>
|
26
|
+
class coupon_iterator: public std::iterator<std::input_iterator_tag, uint32_t> {
|
27
|
+
public:
|
28
|
+
// Constructs an iterator over `array` (holding `array_size` entries) positioned at `index`.
// NOTE(review): `all` presumably selects whether empty slots are visited too — confirm in coupon_iterator-internal.hpp.
coupon_iterator(const int* array, size_t array_size, size_t index, bool all);
|
29
|
+
coupon_iterator& operator++();
|
30
|
+
bool operator!=(const coupon_iterator& other) const;
|
31
|
+
uint32_t operator*() const;
|
32
|
+
private:
|
33
|
+
const int* array;
|
34
|
+
size_t array_size;
|
35
|
+
size_t index;
|
36
|
+
bool all;
|
37
|
+
};
|
38
|
+
|
39
|
+
}
|
40
|
+
|
41
|
+
#include "coupon_iterator-internal.hpp"
|
42
|
+
|
43
|
+
#endif /* _INTARRAYPAIRITERATOR_HPP_ */
|
@@ -0,0 +1,669 @@
|
|
1
|
+
/*
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
4
|
+
* distributed with this work for additional information
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
7
|
+
* "License"); you may not use this file except in compliance
|
8
|
+
* with the License. You may obtain a copy of the License at
|
9
|
+
*
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
*
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
13
|
+
* software distributed under the License is distributed on an
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
15
|
+
* KIND, either express or implied. See the License for the
|
16
|
+
* specific language governing permissions and limitations
|
17
|
+
* under the License.
|
18
|
+
*/
|
19
|
+
|
20
|
+
#ifndef _HLL_HPP_
|
21
|
+
#define _HLL_HPP_
|
22
|
+
|
23
|
+
#include "common_defs.hpp"
|
24
|
+
#include "HllUtil.hpp"
|
25
|
+
|
26
|
+
#include <memory>
|
27
|
+
#include <iostream>
|
28
|
+
#include <vector>
|
29
|
+
|
30
|
+
namespace datasketches {
|
31
|
+
|
32
|
+
/**
|
33
|
+
* This is a high performance implementation of Philippe Flajolet’s HLL sketch but with
|
34
|
+
* significantly improved error behavior. If the ONLY use case for sketching is counting
|
35
|
+
* uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for
|
36
|
+
* storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to
|
37
|
+
* 16 times smaller than the Theta sketch family for the same accuracy.
|
38
|
+
*
|
39
|
+
* <p>This implementation offers three different types of HLL sketch, each with different
|
40
|
+
* trade-offs with accuracy, space and performance. These types are specified with the
|
41
|
+
* {@link target_hll_type} parameter.
|
42
|
+
*
|
43
|
+
* <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error
|
44
|
+
* distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
|
45
|
+
* The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>,
|
46
|
+
* where <i>K</i> is the number of buckets or slots for the sketch.
|
47
|
+
*
|
48
|
+
* <p>During warmup, when the sketch has only received a small number of unique items
|
49
|
+
* (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
|
50
|
+
* algorithms with significantly better accuracy.
|
51
|
+
*
|
52
|
+
* <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
|
53
|
+
* created by the user, the sketch will perform all of its updates and internal phase transitions
|
54
|
+
* in that object, which can actually reside either on-heap or off-heap based on how it is
|
55
|
+
* configured. In large systems that must update and merge many millions of sketches, having the
|
56
|
+
* sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
|
57
|
+
* to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
|
58
|
+
* delays.
|
59
|
+
*
|
60
|
+
* author Jon Malkin
|
61
|
+
* author Lee Rhodes
|
62
|
+
* author Kevin Lang
|
63
|
+
*/
|
64
|
+
|
65
|
+
|
66
|
+
/**
|
67
|
+
* Specifies the target type of HLL sketch to be created. It is a target in that the actual
|
68
|
+
* allocation of the HLL array is deferred until a sufficient number of items have been received by
|
69
|
+
* the warm-up phases.
|
70
|
+
*
|
71
|
+
* <p>These three target types are isomorphic representations of the same underlying HLL algorithm.
|
72
|
+
* Thus, given the same value of <i>lg_config_k</i> and the same input, all three HLL target types
|
73
|
+
* will produce identical estimates and have identical error distributions.</p>
|
74
|
+
*
|
75
|
+
* <p>The memory (and also the serialization) of the sketch during this early warmup phase starts
|
76
|
+
* out very small (8 bytes, when empty) and then grows in increments of 4 bytes as required
|
77
|
+
* until the full HLL array is allocated. This transition point occurs at about 10% of K for
|
78
|
+
* sketches where lg_config_k is > 8.</p>
|
79
|
+
*
|
80
|
+
* <ul>
|
81
|
+
* <li><b>HLL_8</b> This uses an 8-bit byte per HLL bucket. It is generally the
|
82
|
+
* fastest in terms of update time, but has the largest storage footprint of about
|
83
|
+
* <i>K</i> bytes.</li>
|
84
|
+
*
|
85
|
+
* <li><b>HLL_6</b> This uses a 6-bit field per HLL bucket. It is generally the next fastest
|
86
|
+
* in terms of update time with a storage footprint of about <i>3/4 * K</i> bytes.</li>
|
87
|
+
*
|
88
|
+
* <li><b>HLL_4</b> This uses a 4-bit field per HLL bucket and for large counts may require
|
89
|
+
* the use of a small internal auxiliary array for storing statistical exceptions, which are rare.
|
90
|
+
* For the values of <i>lg_config_k > 13</i> (<i>K</i> = 8192),
|
91
|
+
* this additional array adds about 3% to the overall storage. It is generally the slowest in
|
92
|
+
* terms of update time, but has the smallest storage footprint of about
|
93
|
+
* <i>K/2 * 1.03</i> bytes.</li>
|
94
|
+
* </ul>
|
95
|
+
*/
|
96
|
+
enum target_hll_type {
|
97
|
+
HLL_4, ///< 4 bits per entry (most compact, size may vary)
|
98
|
+
HLL_6, ///< 6 bits per entry (fixed size)
|
99
|
+
HLL_8 ///< 8 bits per entry (fastest, fixed size)
|
100
|
+
};
|
101
|
+
|
102
|
+
template<typename A>
|
103
|
+
class HllSketchImpl;
|
104
|
+
|
105
|
+
template<typename A>
|
106
|
+
class hll_union_alloc;
|
107
|
+
|
108
|
+
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
109
|
+
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
110
|
+
|
111
|
+
template<typename A = std::allocator<char> >
|
112
|
+
class hll_sketch_alloc final {
|
113
|
+
public:
|
114
|
+
/**
|
115
|
+
* Constructs a new HLL sketch.
|
116
|
+
* @param lg_config_k Sketch can hold 2^lg_config_k rows
|
117
|
+
* @param tgt_type The HLL mode to use, if/when the sketch reaches that state
|
118
|
+
* @param start_full_size Indicates whether to start in HLL mode,
|
119
|
+
* keeping memory use constant (if HLL_6 or HLL_8) at the cost of
|
120
|
+
* starting out using much more memory
|
121
|
+
*/
|
122
|
+
explicit hll_sketch_alloc(int lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false);
|
123
|
+
|
124
|
+
/**
|
125
|
+
* Copy constructor
|
126
|
+
*/
|
127
|
+
hll_sketch_alloc(const hll_sketch_alloc<A>& that);
|
128
|
+
|
129
|
+
/**
|
130
|
+
* Copy constructor to a new target type
|
131
|
+
*/
|
132
|
+
hll_sketch_alloc(const hll_sketch_alloc<A>& that, target_hll_type tgt_type);
|
133
|
+
|
134
|
+
/**
|
135
|
+
* Move constructor
|
136
|
+
*/
|
137
|
+
hll_sketch_alloc(hll_sketch_alloc<A>&& that) noexcept;
|
138
|
+
|
139
|
+
/**
|
140
|
+
* Reconstructs a sketch from a serialized image on a stream.
|
141
|
+
* @param is An input stream with a binary image of a sketch
|
142
|
+
*/
|
143
|
+
static hll_sketch_alloc deserialize(std::istream& is);
|
144
|
+
|
145
|
+
/**
|
146
|
+
* Reconstructs a sketch from a serialized image in a byte array.
|
147
|
+
* @param bytes An input array with a binary image of a sketch
|
148
|
+
* @param len Length of the input array, in bytes
|
149
|
+
*/
|
150
|
+
static hll_sketch_alloc deserialize(const void* bytes, size_t len);
|
151
|
+
|
152
|
+
//! Class destructor
|
153
|
+
virtual ~hll_sketch_alloc();
|
154
|
+
|
155
|
+
//! Copy assignment operator
|
156
|
+
// NOTE(review): this returns hll_sketch_alloc by value, so the result of an assignment
// expression is a separate copy; the conventional signature returns hll_sketch_alloc&.
// The out-of-line definition (not visible here) must be changed together with this line.
hll_sketch_alloc operator=(const hll_sketch_alloc<A>& other);
|
157
|
+
|
158
|
+
//! Move assignment operator
|
159
|
+
// NOTE(review): returns by value rather than hll_sketch_alloc&, and is not declared
// noexcept although the move constructor of this class is. Any change here must be
// mirrored in the out-of-line definition (not visible here).
hll_sketch_alloc operator=(hll_sketch_alloc<A>&& other);
|
160
|
+
|
161
|
+
/**
|
162
|
+
* Resets the sketch to an empty state in coupon collection mode.
|
163
|
+
* Does not re-use existing internal objects.
|
164
|
+
*/
|
165
|
+
void reset();
|
166
|
+
|
167
|
+
typedef vector_u8<A> vector_bytes; // alias for users
|
168
|
+
|
169
|
+
/**
|
170
|
+
* Serializes the sketch to a byte array, compacting data structures
|
171
|
+
* where feasible to eliminate unused storage in the serialized image.
|
172
|
+
* @param header_size_bytes Allows for PostgreSQL integration
|
173
|
+
*/
|
174
|
+
vector_bytes serialize_compact(unsigned header_size_bytes = 0) const;
|
175
|
+
|
176
|
+
/**
|
177
|
+
* Serializes the sketch to a byte array, retaining all internal
|
178
|
+
* data structures in their current form.
|
179
|
+
*/
|
180
|
+
vector_bytes serialize_updatable() const;
|
181
|
+
|
182
|
+
/**
|
183
|
+
* Serializes the sketch to an ostream, compacting data structures
|
184
|
+
* where feasible to eliminate unused storage in the serialized image.
|
185
|
+
* @param os std::ostream to use for output.
|
186
|
+
*/
|
187
|
+
void serialize_compact(std::ostream& os) const;
|
188
|
+
|
189
|
+
/**
|
190
|
+
* Serializes the sketch to an ostream, retaining all internal data
|
191
|
+
* structures in their current form.
|
192
|
+
* @param os std::ostream to use for output.
|
193
|
+
*/
|
194
|
+
void serialize_updatable(std::ostream& os) const;
|
195
|
+
|
196
|
+
/**
|
197
|
+
* Human readable summary with optional detail
|
198
|
+
* @param summary if true, output the sketch summary
|
199
|
+
* @param detail if true, output the internal data array
|
200
|
+
* @param aux_detail if true, output the internal Aux array, if it exists.
|
201
|
+
* @param all if true, outputs all entries including empty ones
|
202
|
+
* @return human readable string with optional detail.
|
203
|
+
*/
|
204
|
+
string<A> to_string(bool summary = true,
|
205
|
+
bool detail = false,
|
206
|
+
bool aux_detail = false,
|
207
|
+
bool all = false) const;
|
208
|
+
|
209
|
+
/**
|
210
|
+
* Present the given std::string as a potential unique item.
|
211
|
+
* The string is converted to a byte array using UTF8 encoding.
|
212
|
+
* If the string is empty no update attempt is made and the method returns.
|
213
|
+
* @param datum The given string.
|
214
|
+
*/
|
215
|
+
void update(const std::string& datum);
|
216
|
+
|
217
|
+
/**
|
218
|
+
* Present the given unsigned 64-bit integer as a potential unique item.
|
219
|
+
* @param datum The given integer.
|
220
|
+
*/
|
221
|
+
void update(uint64_t datum);
|
222
|
+
|
223
|
+
/**
|
224
|
+
* Present the given unsigned 32-bit integer as a potential unique item.
|
225
|
+
* @param datum The given integer.
|
226
|
+
*/
|
227
|
+
void update(uint32_t datum);
|
228
|
+
|
229
|
+
/**
|
230
|
+
* Present the given unsigned 16-bit integer as a potential unique item.
|
231
|
+
* @param datum The given integer.
|
232
|
+
*/
|
233
|
+
void update(uint16_t datum);
|
234
|
+
|
235
|
+
/**
|
236
|
+
* Present the given unsigned 8-bit integer as a potential unique item.
|
237
|
+
* @param datum The given integer.
|
238
|
+
*/
|
239
|
+
void update(uint8_t datum);
|
240
|
+
|
241
|
+
/**
|
242
|
+
* Present the given signed 64-bit integer as a potential unique item.
|
243
|
+
* @param datum The given integer.
|
244
|
+
*/
|
245
|
+
void update(int64_t datum);
|
246
|
+
|
247
|
+
/**
|
248
|
+
* Present the given signed 32-bit integer as a potential unique item.
|
249
|
+
* @param datum The given integer.
|
250
|
+
*/
|
251
|
+
void update(int32_t datum);
|
252
|
+
|
253
|
+
/**
|
254
|
+
* Present the given signed 16-bit integer as a potential unique item.
|
255
|
+
* @param datum The given integer.
|
256
|
+
*/
|
257
|
+
void update(int16_t datum);
|
258
|
+
|
259
|
+
/**
|
260
|
+
* Present the given signed 8-bit integer as a potential unique item.
|
261
|
+
* @param datum The given integer.
|
262
|
+
*/
|
263
|
+
void update(int8_t datum);
|
264
|
+
|
265
|
+
/**
|
266
|
+
* Present the given 64-bit floating point value as a potential unique item.
|
267
|
+
* @param datum The given double.
|
268
|
+
*/
|
269
|
+
void update(double datum);
|
270
|
+
|
271
|
+
/**
|
272
|
+
* Present the given 32-bit floating point value as a potential unique item.
|
273
|
+
* @param datum The given float.
|
274
|
+
*/
|
275
|
+
void update(float datum);
|
276
|
+
|
277
|
+
/**
|
278
|
+
* Present the given data array as a potential unique item.
|
279
|
+
* @param data The given array.
|
280
|
+
* @param length_bytes The array length in bytes.
|
281
|
+
*/
|
282
|
+
void update(const void* data, size_t length_bytes);
|
283
|
+
|
284
|
+
/**
|
285
|
+
* Returns the current cardinality estimate
|
286
|
+
* @return the cardinality estimate
|
287
|
+
*/
|
288
|
+
double get_estimate() const;
|
289
|
+
|
290
|
+
/**
|
291
|
+
* This is less accurate than the get_estimate() method
|
292
|
+
* and is automatically used when the sketch has gone through
|
293
|
+
* union operations where the more accurate HIP estimator cannot
|
294
|
+
* be used.
|
295
|
+
*
|
296
|
+
* This is made public only for error characterization software
|
297
|
+
* that exists in separate packages and is not intended for normal
|
298
|
+
* use.
|
299
|
+
* @return the composite cardinality estimate
|
300
|
+
*/
|
301
|
+
double get_composite_estimate() const;
|
302
|
+
|
303
|
+
/**
|
304
|
+
* Returns the approximate lower error bound given the specified
|
305
|
+
* number of standard deviations.
|
306
|
+
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
|
307
|
+
* @return The approximate lower bound.
|
308
|
+
*/
|
309
|
+
double get_lower_bound(int num_std_dev) const;
|
310
|
+
|
311
|
+
/**
|
312
|
+
* Returns the approximate upper error bound given the specified
|
313
|
+
* number of standard deviations.
|
314
|
+
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
|
315
|
+
* @return The approximate upper bound.
|
316
|
+
*/
|
317
|
+
double get_upper_bound(int num_std_dev) const;
|
318
|
+
|
319
|
+
/**
|
320
|
+
* Returns sketch's configured lg_k value.
|
321
|
+
* @return Configured lg_k value.
|
322
|
+
*/
|
323
|
+
int get_lg_config_k() const;
|
324
|
+
|
325
|
+
/**
|
326
|
+
* Returns the sketch's target HLL mode (from #target_hll_type).
|
327
|
+
* @return The sketch's target HLL mode.
|
328
|
+
*/
|
329
|
+
target_hll_type get_target_type() const;
|
330
|
+
|
331
|
+
/**
|
332
|
+
* Indicates if the sketch is currently stored compacted.
|
333
|
+
* @return True if the sketch is stored in compact form.
|
334
|
+
*/
|
335
|
+
bool is_compact() const;
|
336
|
+
|
337
|
+
/**
|
338
|
+
* Indicates if the sketch is currently empty.
|
339
|
+
* @return True if the sketch is empty.
|
340
|
+
*/
|
341
|
+
bool is_empty() const;
|
342
|
+
|
343
|
+
/**
|
344
|
+
* Returns the size of the sketch serialized in compact form.
|
345
|
+
* @return Size of the sketch serialized in compact form, in bytes.
|
346
|
+
*/
|
347
|
+
int get_compact_serialization_bytes() const;
|
348
|
+
|
349
|
+
/**
|
350
|
+
* Returns the size of the sketch serialized without compaction.
|
351
|
+
* @return Size of the sketch serialized without compaction, in bytes.
|
352
|
+
*/
|
353
|
+
int get_updatable_serialization_bytes() const;
|
354
|
+
|
355
|
+
/**
|
356
|
+
* Returns the maximum size in bytes that this sketch can grow to
|
357
|
+
* given lg_config_k. However, for the HLL_4 sketch type, this
|
358
|
+
* value can be exceeded in extremely rare cases. If exceeded, it
|
359
|
+
* will be larger by only a few percent.
|
360
|
+
*
|
361
|
+
* @param lg_k The Log2 of K for the target HLL sketch. This value must be
|
362
|
+
* between 4 and 21 inclusively.
|
363
|
+
* @param tgt_type the desired Hll type
|
364
|
+
* @return the maximum size in bytes that this sketch can grow to.
|
365
|
+
*/
|
366
|
+
static int get_max_updatable_serialization_bytes(int lg_k, target_hll_type tgt_type);
|
367
|
+
|
368
|
+
/**
|
369
|
+
* Gets the current (approximate) Relative Error (RE) asymptotic values given several
|
370
|
+
* parameters. This is used primarily for testing.
|
371
|
+
* @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound.
|
372
|
+
* @param unioned set true if the sketch is the result of a union operation.
|
373
|
+
* @param lg_config_k the configured value for the sketch.
|
374
|
+
* @param num_std_dev the given number of Standard Deviations. This must be an integer between
|
375
|
+
* 1 and 3, inclusive.
|
376
|
+
* @return the current (approximate) RelativeError
|
377
|
+
*/
|
378
|
+
static double get_rel_err(bool upper_bound, bool unioned,
|
379
|
+
int lg_config_k, int num_std_dev);
|
380
|
+
|
381
|
+
private:
|
382
|
+
explicit hll_sketch_alloc(HllSketchImpl<A>* that);
|
383
|
+
|
384
|
+
void coupon_update(int coupon);
|
385
|
+
|
386
|
+
std::string type_as_string() const;
|
387
|
+
std::string mode_as_string() const;
|
388
|
+
|
389
|
+
hll_mode get_current_mode() const;
|
390
|
+
int get_serialization_version() const;
|
391
|
+
bool is_out_of_order_flag() const;
|
392
|
+
bool is_estimation_mode() const;
|
393
|
+
|
394
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<hll_sketch_alloc> AllocHllSketch;
|
395
|
+
|
396
|
+
HllSketchImpl<A>* sketch_impl;
|
397
|
+
friend hll_union_alloc<A>;
|
398
|
+
};
|
399
|
+
|
400
|
+
/**
|
401
|
+
* This performs union operations for HLL sketches. This union operator is configured with a
|
402
|
+
* <i>lg_max_k</i> instead of the normal <i>lg_config_k</i>.
|
403
|
+
*
|
404
|
+
* <p>This union operator does permit the unioning of sketches with different values of
|
405
|
+
* <i>lg_config_k</i>. The user should be aware that the resulting accuracy of a sketch returned
|
406
|
+
* at the end of the unioning process will be a function of the smallest of <i>lg_max_k</i> and
|
407
|
+
* <i>lg_config_k</i> that the union operator has seen.
|
408
|
+
*
|
409
|
+
* <p>This union operator also permits unioning of any of the three different target hll_sketch
|
410
|
+
* types.
|
411
|
+
*
|
412
|
+
* <p>Although the API for this union operator parallels many of the methods of the
|
413
|
+
* <i>hll_sketch</i>, the behavior of the union operator has some fundamental differences.
|
414
|
+
*
|
415
|
+
* <p>First, the user cannot specify the #target_hll_type as an input parameter.
|
416
|
+
* Instead, it is specified for the sketch returned with #get_result(target_hll_type).
|
417
|
+
*
|
418
|
+
* <p>Second, the internal effective value of log-base-2 of <i>k</i> for the union operation can
|
419
|
+
* change dynamically based on the smallest <i>lg_config_k</i> that the union operation has seen.
|
420
|
+
*
|
421
|
+
* author Jon Malkin
|
422
|
+
* author Lee Rhodes
|
423
|
+
* author Kevin Lang
|
424
|
+
*/
|
425
|
+
|
426
|
+
template<typename A = std::allocator<char> >
|
427
|
+
class hll_union_alloc {
|
428
|
+
public:
|
429
|
+
/**
|
430
|
+
* Construct an hll_union operator with the given maximum log2 of k.
|
431
|
+
* @param lg_max_k The maximum size, in log2, of k. The value must
|
432
|
+
* be between 7 and 21, inclusive.
|
433
|
+
*/
|
434
|
+
explicit hll_union_alloc(int lg_max_k);
|
435
|
+
|
436
|
+
/**
|
437
|
+
* Returns the current cardinality estimate
|
438
|
+
* @return the cardinality estimate
|
439
|
+
*/
|
440
|
+
double get_estimate() const;
|
441
|
+
|
442
|
+
/**
|
443
|
+
* This is less accurate than the get_estimate() method
|
444
|
+
* and is automatically used when the union has gone through
|
445
|
+
* union operations where the more accurate HIP estimator cannot
|
446
|
+
* be used.
|
447
|
+
*
|
448
|
+
* This is made public only for error characterization software
|
449
|
+
* that exists in separate packages and is not intended for normal
|
450
|
+
* use.
|
451
|
+
* @return the composite cardinality estimate
|
452
|
+
*/
|
453
|
+
double get_composite_estimate() const;
|
454
|
+
|
455
|
+
/**
|
456
|
+
* Returns the approximate lower error bound given the specified
|
457
|
+
* number of standard deviations.
|
458
|
+
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
|
459
|
+
* @return The approximate lower bound.
|
460
|
+
*/
|
461
|
+
double get_lower_bound(int num_std_dev) const;
|
462
|
+
|
463
|
+
/**
|
464
|
+
* Returns the approximate upper error bound given the specified
|
465
|
+
* number of standard deviations.
|
466
|
+
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
|
467
|
+
* @return The approximate upper bound.
|
468
|
+
*/
|
469
|
+
double get_upper_bound(int num_std_dev) const;
|
470
|
+
|
471
|
+
/**
|
472
|
+
* Returns the size of the union serialized in compact form.
|
473
|
+
* @return Size of the union serialized in compact form, in bytes.
|
474
|
+
*/
|
475
|
+
int get_compact_serialization_bytes() const;
|
476
|
+
|
477
|
+
/**
|
478
|
+
* Returns the size of the union serialized without compaction.
|
479
|
+
* @return Size of the union serialized without compaction, in bytes.
|
480
|
+
*/
|
481
|
+
int get_updatable_serialization_bytes() const;
|
482
|
+
|
483
|
+
/**
|
484
|
+
* Returns union's configured lg_k value.
|
485
|
+
* @return Configured lg_k value.
|
486
|
+
*/
|
487
|
+
int get_lg_config_k() const;
|
488
|
+
|
489
|
+
/**
|
490
|
+
* Returns the union's target HLL mode (from #target_hll_type).
|
491
|
+
* @return The union's target HLL mode.
|
492
|
+
*/
|
493
|
+
target_hll_type get_target_type() const;
|
494
|
+
|
495
|
+
/**
|
496
|
+
* Indicates if the union is currently stored compacted.
|
497
|
+
* @return True if the union is stored in compact form.
|
498
|
+
*/
|
499
|
+
bool is_compact() const;
|
500
|
+
|
501
|
+
/**
|
502
|
+
* Indicates if the union is currently empty.
|
503
|
+
* @return True if the union is empty.
|
504
|
+
*/
|
505
|
+
bool is_empty() const;
|
506
|
+
|
507
|
+
/**
|
508
|
+
* Resets the union to an empty state in coupon collection mode.
|
509
|
+
* Does not re-use existing internal objects.
|
510
|
+
*/
|
511
|
+
void reset();
|
512
|
+
|
513
|
+
/**
|
514
|
+
* Returns the result of this union operator with the specified
|
515
|
+
* #target_hll_type.
|
516
|
+
* @param tgt_type The target_hll_type enum value of the desired result (Default: HLL_4)
|
517
|
+
* @return The result of this union with the specified target_hll_type
|
518
|
+
*/
|
519
|
+
hll_sketch_alloc<A> get_result(target_hll_type tgt_type = HLL_4) const;
|
520
|
+
|
521
|
+
/**
|
522
|
+
* Update this union operator with the given sketch.
|
523
|
+
* @param sketch The given sketch.
|
524
|
+
*/
|
525
|
+
void update(const hll_sketch_alloc<A>& sketch);
|
526
|
+
|
527
|
+
/**
|
528
|
+
* Update this union operator with the given temporary sketch.
|
529
|
+
* @param sketch The given sketch.
|
530
|
+
*/
|
531
|
+
void update(hll_sketch_alloc<A>&& sketch);
|
532
|
+
|
533
|
+
/**
|
534
|
+
* Present the given std::string as a potential unique item.
|
535
|
+
* The string is converted to a byte array using UTF8 encoding.
|
536
|
+
* If the string is null or empty no update attempt is made and the method returns.
|
537
|
+
* @param datum The given string.
|
538
|
+
*/
|
539
|
+
void update(const std::string& datum);
|
540
|
+
|
541
|
+
/**
|
542
|
+
* Present the given unsigned 64-bit integer as a potential unique item.
|
543
|
+
* @param datum The given integer.
|
544
|
+
*/
|
545
|
+
void update(uint64_t datum);
|
546
|
+
|
547
|
+
/**
|
548
|
+
* Present the given unsigned 32-bit integer as a potential unique item.
|
549
|
+
* @param datum The given integer.
|
550
|
+
*/
|
551
|
+
void update(uint32_t datum);
|
552
|
+
|
553
|
+
/**
|
554
|
+
* Present the given unsigned 16-bit integer as a potential unique item.
|
555
|
+
* @param datum The given integer.
|
556
|
+
*/
|
557
|
+
void update(uint16_t datum);
|
558
|
+
|
559
|
+
/**
|
560
|
+
* Present the given unsigned 8-bit integer as a potential unique item.
|
561
|
+
* @param datum The given integer.
|
562
|
+
*/
|
563
|
+
void update(uint8_t datum);
|
564
|
+
|
565
|
+
/**
|
566
|
+
* Present the given signed 64-bit integer as a potential unique item.
|
567
|
+
* @param datum The given integer.
|
568
|
+
*/
|
569
|
+
void update(int64_t datum);
|
570
|
+
|
571
|
+
/**
|
572
|
+
* Present the given signed 32-bit integer as a potential unique item.
|
573
|
+
* @param datum The given integer.
|
574
|
+
*/
|
575
|
+
void update(int32_t datum);
|
576
|
+
|
577
|
+
/**
|
578
|
+
* Present the given signed 16-bit integer as a potential unique item.
|
579
|
+
* @param datum The given integer.
|
580
|
+
*/
|
581
|
+
void update(int16_t datum);
|
582
|
+
|
583
|
+
/**
|
584
|
+
* Present the given signed 8-bit integer as a potential unique item.
|
585
|
+
* @param datum The given integer.
|
586
|
+
*/
|
587
|
+
void update(int8_t datum);
|
588
|
+
|
589
|
+
/**
|
590
|
+
* Present the given 64-bit floating point value as a potential unique item.
|
591
|
+
* @param datum The given double.
|
592
|
+
*/
|
593
|
+
void update(double datum);
|
594
|
+
|
595
|
+
/**
|
596
|
+
* Present the given 32-bit floating point value as a potential unique item.
|
597
|
+
* @param datum The given float.
|
598
|
+
*/
|
599
|
+
void update(float datum);
|
600
|
+
|
601
|
+
/**
|
602
|
+
* Present the given data array as a potential unique item.
|
603
|
+
* @param data The given array.
|
604
|
+
* @param length_bytes The array length in bytes.
|
605
|
+
*/
|
606
|
+
void update(const void* data, size_t length_bytes);
|
607
|
+
|
608
|
+
/**
|
609
|
+
* Returns the maximum size in bytes that this union operator can grow to given a lg_k.
|
610
|
+
*
|
611
|
+
* @param lg_k The maximum Log2 of k for this union operator. This value must be
|
612
|
+
* between 4 and 21 inclusively.
|
613
|
+
* @return the maximum size in bytes that this union operator can grow to.
|
614
|
+
*/
|
615
|
+
static int get_max_serialization_bytes(int lg_k);
|
616
|
+
|
617
|
+
/**
|
618
|
+
* Gets the current (approximate) Relative Error (RE) asymptotic values given several
|
619
|
+
* parameters. This is used primarily for testing.
|
620
|
+
* @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound.
|
621
|
+
* @param unioned set true if the sketch is the result of a union operation.
|
622
|
+
* @param lg_config_k the configured value for the sketch.
|
623
|
+
* @param num_std_dev the given number of Standard Deviations. This must be an integer between
|
624
|
+
* 1 and 3, inclusive.
|
625
|
+
* @return the current (approximate) RelativeError
|
626
|
+
*/
|
627
|
+
static double get_rel_err(bool upper_bound, bool unioned,
|
628
|
+
int lg_config_k, int num_std_dev);
|
629
|
+
|
630
|
+
private:
|
631
|
+
|
632
|
+
/**
|
633
|
+
* Union the given source and destination sketches. This method examines the state of
|
634
|
+
* the current internal gadget and the incoming sketch and determines the optimal way to
|
635
|
+
* perform the union. This may involve swapping, down-sampling, transforming, and / or
|
636
|
+
* copying one of the arguments and may completely replace the internals of the union.
|
637
|
+
*
|
638
|
+
* @param sketch the given incoming sketch, which may not be modified.
|
639
|
+
* @param lg_max_k the maximum value of log2 K for this union.
|
640
|
+
*/
|
641
|
+
inline void union_impl(const hll_sketch_alloc<A>& sketch, int lg_max_k);
|
642
|
+
|
643
|
+
static HllSketchImpl<A>* copy_or_downsample(const HllSketchImpl<A>* src_impl, int tgt_lg_k);
|
644
|
+
|
645
|
+
void coupon_update(int coupon);
|
646
|
+
|
647
|
+
hll_mode get_current_mode() const;
|
648
|
+
int get_serialization_version() const;
|
649
|
+
bool is_out_of_order_flag() const;
|
650
|
+
bool is_estimation_mode() const;
|
651
|
+
|
652
|
+
// calls couponUpdate on sketch, freeing the old sketch upon changes in hll_mode
|
653
|
+
static HllSketchImpl<A>* leak_free_coupon_update(HllSketchImpl<A>* impl, int coupon);
|
654
|
+
|
655
|
+
int lg_max_k;
|
656
|
+
hll_sketch_alloc<A> gadget;
|
657
|
+
};
|
658
|
+
|
659
|
+
/// convenience alias for hll_sketch with default allocator
|
660
|
+
typedef hll_sketch_alloc<> hll_sketch;
|
661
|
+
|
662
|
+
/// convenience alias for hll_union with default allocator
|
663
|
+
typedef hll_union_alloc<> hll_union;
|
664
|
+
|
665
|
+
} // namespace datasketches
|
666
|
+
|
667
|
+
#include "hll.private.hpp"
|
668
|
+
|
669
|
+
#endif // _HLL_HPP_
|