datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef _INTARRAYPAIRITERATOR_HPP_
|
|
21
|
+
#define _INTARRAYPAIRITERATOR_HPP_
|
|
22
|
+
|
|
23
|
+
namespace datasketches {
|
|
24
|
+
|
|
25
|
+
template<typename A>
|
|
26
|
+
class coupon_iterator: public std::iterator<std::input_iterator_tag, uint32_t> {
|
|
27
|
+
public:
|
|
28
|
+
coupon_iterator(const int* array, size_t array_size, size_t index, bool all); // parameter names match the private members declared below
|
|
29
|
+
coupon_iterator& operator++();
|
|
30
|
+
bool operator!=(const coupon_iterator& other) const;
|
|
31
|
+
uint32_t operator*() const;
|
|
32
|
+
private:
|
|
33
|
+
const int* array;
|
|
34
|
+
size_t array_size;
|
|
35
|
+
size_t index;
|
|
36
|
+
bool all;
|
|
37
|
+
};
|
|
38
|
+
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
#include "coupon_iterator-internal.hpp"
|
|
42
|
+
|
|
43
|
+
#endif /* _INTARRAYPAIRITERATOR_HPP_ */
|
|
@@ -0,0 +1,669 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef _HLL_HPP_
|
|
21
|
+
#define _HLL_HPP_
|
|
22
|
+
|
|
23
|
+
#include "common_defs.hpp"
|
|
24
|
+
#include "HllUtil.hpp"
|
|
25
|
+
|
|
26
|
+
#include <memory>
|
|
27
|
+
#include <iostream>
|
|
28
|
+
#include <vector>
|
|
29
|
+
|
|
30
|
+
namespace datasketches {
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* This is a high performance implementation of Philippe Flajolet’s HLL sketch but with
|
|
34
|
+
* significantly improved error behavior. If the ONLY use case for sketching is counting
|
|
35
|
+
* uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for
|
|
36
|
+
* storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to
|
|
37
|
+
* 16 times smaller than the Theta sketch family for the same accuracy.
|
|
38
|
+
*
|
|
39
|
+
* <p>This implementation offers three different types of HLL sketch, each with different
|
|
40
|
+
* trade-offs with accuracy, space and performance. These types are specified with the
|
|
41
|
+
* target_hll_type parameter.
|
|
42
|
+
*
|
|
43
|
+
* <p>In terms of accuracy, all three types, for the same <i>lg_config_k</i>, have the same error
|
|
44
|
+
* distribution as a function of <i>n</i>, the number of unique values fed to the sketch.
|
|
45
|
+
* The configuration parameter <i>lg_config_k</i> is the log-base-2 of <i>K</i>,
|
|
46
|
+
* where <i>K</i> is the number of buckets or slots for the sketch.
|
|
47
|
+
*
|
|
48
|
+
* <p>During warmup, when the sketch has only received a small number of unique items
|
|
49
|
+
* (up to about 10% of <i>K</i>), this implementation leverages a new class of estimator
|
|
50
|
+
* algorithms with significantly better accuracy.
|
|
51
|
+
*
|
|
52
|
+
* <p>This sketch also offers the capability of operating off-heap. Given a WritableMemory object
|
|
53
|
+
* created by the user, the sketch will perform all of its updates and internal phase transitions
|
|
54
|
+
* in that object, which can actually reside either on-heap or off-heap based on how it is
|
|
55
|
+
* configured. In large systems that must update and merge many millions of sketches, having the
|
|
56
|
+
* sketch operate off-heap avoids the serialization and deserialization costs of moving sketches
|
|
57
|
+
* to and from off-heap memory-mapped files, for example, and eliminates big garbage collection
|
|
58
|
+
* delays.
|
|
59
|
+
*
|
|
60
|
+
* author Jon Malkin
|
|
61
|
+
* author Lee Rhodes
|
|
62
|
+
* author Kevin Lang
|
|
63
|
+
*/
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* Specifies the target type of HLL sketch to be created. It is a target in that the actual
|
|
68
|
+
* allocation of the HLL array is deferred until a sufficient number of items have been received by
|
|
69
|
+
* the warm-up phases.
|
|
70
|
+
*
|
|
71
|
+
* <p>These three target types are isomorphic representations of the same underlying HLL algorithm.
|
|
72
|
+
* Thus, given the same value of <i>lg_config_k</i> and the same input, all three HLL target types
|
|
73
|
+
* will produce identical estimates and have identical error distributions.</p>
|
|
74
|
+
*
|
|
75
|
+
* <p>The memory (and also the serialization) of the sketch during this early warmup phase starts
|
|
76
|
+
* out very small (8 bytes, when empty) and then grows in increments of 4 bytes as required
|
|
77
|
+
* until the full HLL array is allocated. This transition point occurs at about 10% of K for
|
|
78
|
+
* sketches where lg_config_k is > 8.</p>
|
|
79
|
+
*
|
|
80
|
+
* <ul>
|
|
81
|
+
* <li><b>HLL_8</b> This uses an 8-bit byte per HLL bucket. It is generally the
|
|
82
|
+
* fastest in terms of update time, but has the largest storage footprint of about
|
|
83
|
+
* <i>K</i> bytes.</li>
|
|
84
|
+
*
|
|
85
|
+
* <li><b>HLL_6</b> This uses a 6-bit field per HLL bucket. It is generally the next fastest
|
|
86
|
+
* in terms of update time with a storage footprint of about <i>3/4 * K</i> bytes.</li>
|
|
87
|
+
*
|
|
88
|
+
* <li><b>HLL_4</b> This uses a 4-bit field per HLL bucket and for large counts may require
|
|
89
|
+
* the use of a small internal auxiliary array for storing statistical exceptions, which are rare.
|
|
90
|
+
* For the values of <i>lg_config_k > 13</i> (<i>K</i> = 8192),
|
|
91
|
+
* this additional array adds about 3% to the overall storage. It is generally the slowest in
|
|
92
|
+
* terms of update time, but has the smallest storage footprint of about
|
|
93
|
+
* <i>K/2 * 1.03</i> bytes.</li>
|
|
94
|
+
* </ul>
|
|
95
|
+
*/
|
|
96
|
+
enum target_hll_type {
|
|
97
|
+
HLL_4, ///< 4 bits per bucket: smallest footprint (about K/2 bytes); size may vary because a small auxiliary array can be needed
|
|
98
|
+
HLL_6, ///< 6 bits per bucket: fixed size of about 3/4 * K bytes
|
|
99
|
+
HLL_8 ///< 8 bits per bucket: fixed size of about K bytes; generally the fastest to update
|
|
100
|
+
};
|
|
101
|
+
|
|
102
|
+
template<typename A>
|
|
103
|
+
class HllSketchImpl;
|
|
104
|
+
|
|
105
|
+
template<typename A>
|
|
106
|
+
class hll_union_alloc;
|
|
107
|
+
|
|
108
|
+
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>; // allocator type A rebound to allocate uint8_t
|
|
109
|
+
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>; // std::vector of uint8_t that allocates through AllocU8<A>
|
|
110
|
+
|
|
111
|
+
template<typename A = std::allocator<char> >
|
|
112
|
+
class hll_sketch_alloc final {
|
|
113
|
+
public:
|
|
114
|
+
/**
|
|
115
|
+
* Constructs a new HLL sketch.
|
|
116
|
+
* @param lg_config_k Sketch can hold 2^lg_config_k rows
|
|
117
|
+
* @param tgt_type The HLL mode to use, if/when the sketch reaches that state
|
|
118
|
+
* @param start_full_size Indicates whether to start in HLL mode,
|
|
119
|
+
* keeping memory use constant (if HLL_6 or HLL_8) at the cost of
|
|
120
|
+
* starting out using much more memory
|
|
121
|
+
*/
|
|
122
|
+
explicit hll_sketch_alloc(int lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false);
|
|
123
|
+
|
|
124
|
+
/**
|
|
125
|
+
* Copy constructor
|
|
126
|
+
*/
|
|
127
|
+
hll_sketch_alloc(const hll_sketch_alloc<A>& that);
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Copy constructor to a new target type
|
|
131
|
+
*/
|
|
132
|
+
hll_sketch_alloc(const hll_sketch_alloc<A>& that, target_hll_type tgt_type);
|
|
133
|
+
|
|
134
|
+
/**
|
|
135
|
+
* Move constructor
|
|
136
|
+
*/
|
|
137
|
+
hll_sketch_alloc(hll_sketch_alloc<A>&& that) noexcept;
|
|
138
|
+
|
|
139
|
+
/**
|
|
140
|
+
* Reconstructs a sketch from a serialized image on a stream.
|
|
141
|
+
* @param is An input stream with a binary image of a sketch
|
|
142
|
+
*/
|
|
143
|
+
static hll_sketch_alloc deserialize(std::istream& is);
|
|
144
|
+
|
|
145
|
+
/**
|
|
146
|
+
* Reconstructs a sketch from a serialized image in a byte array.
|
|
147
|
+
* @param bytes An input array with a binary image of a sketch
|
|
148
|
+
* @param len Length of the input array, in bytes
|
|
149
|
+
*/
|
|
150
|
+
static hll_sketch_alloc deserialize(const void* bytes, size_t len);
|
|
151
|
+
|
|
152
|
+
//! Class destructor
|
|
153
|
+
virtual ~hll_sketch_alloc();
|
|
154
|
+
|
|
155
|
+
//! Copy assignment operator
|
|
156
|
+
hll_sketch_alloc operator=(const hll_sketch_alloc<A>& other);
|
|
157
|
+
|
|
158
|
+
//! Move assignment operator
|
|
159
|
+
hll_sketch_alloc operator=(hll_sketch_alloc<A>&& other);
|
|
160
|
+
|
|
161
|
+
/**
|
|
162
|
+
* Resets the sketch to an empty state in coupon collection mode.
|
|
163
|
+
* Does not re-use existing internal objects.
|
|
164
|
+
*/
|
|
165
|
+
void reset();
|
|
166
|
+
|
|
167
|
+
typedef vector_u8<A> vector_bytes; // alias for users
|
|
168
|
+
|
|
169
|
+
/**
|
|
170
|
+
* Serializes the sketch to a byte array, compacting data structures
|
|
171
|
+
* where feasible to eliminate unused storage in the serialized image.
|
|
172
|
+
* @param header_size_bytes Allows for PostgreSQL integration
|
|
173
|
+
*/
|
|
174
|
+
vector_bytes serialize_compact(unsigned header_size_bytes = 0) const;
|
|
175
|
+
|
|
176
|
+
/**
|
|
177
|
+
* Serializes the sketch to a byte array, retaining all internal
|
|
178
|
+
* data structures in their current form.
|
|
179
|
+
*/
|
|
180
|
+
vector_bytes serialize_updatable() const;
|
|
181
|
+
|
|
182
|
+
/**
|
|
183
|
+
* Serializes the sketch to an ostream, compacting data structures
|
|
184
|
+
* where feasible to eliminate unused storage in the serialized image.
|
|
185
|
+
* @param os std::ostream to use for output.
|
|
186
|
+
*/
|
|
187
|
+
void serialize_compact(std::ostream& os) const;
|
|
188
|
+
|
|
189
|
+
/**
|
|
190
|
+
* Serializes the sketch to an ostream, retaining all internal data
|
|
191
|
+
* structures in their current form.
|
|
192
|
+
* @param os std::ostream to use for output.
|
|
193
|
+
*/
|
|
194
|
+
void serialize_updatable(std::ostream& os) const;
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* Human readable summary with optional detail
|
|
198
|
+
* @param summary if true, output the sketch summary
|
|
199
|
+
* @param detail if true, output the internal data array
|
|
200
|
+
* @param aux_detail if true, output the internal Aux array, if it exists.
|
|
201
|
+
* @param all if true, outputs all entries including empty ones
|
|
202
|
+
* @return human readable string with optional detail.
|
|
203
|
+
*/
|
|
204
|
+
string<A> to_string(bool summary = true,
|
|
205
|
+
bool detail = false,
|
|
206
|
+
bool aux_detail = false,
|
|
207
|
+
bool all = false) const;
|
|
208
|
+
|
|
209
|
+
/**
|
|
210
|
+
* Present the given std::string as a potential unique item.
|
|
211
|
+
* The string is converted to a byte array using UTF8 encoding.
|
|
212
|
+
* If the string is empty no update attempt is made and the method returns.
|
|
213
|
+
* @param datum The given string.
|
|
214
|
+
*/
|
|
215
|
+
void update(const std::string& datum);
|
|
216
|
+
|
|
217
|
+
/**
|
|
218
|
+
* Present the given unsigned 64-bit integer as a potential unique item.
|
|
219
|
+
* @param datum The given integer.
|
|
220
|
+
*/
|
|
221
|
+
void update(uint64_t datum);
|
|
222
|
+
|
|
223
|
+
/**
|
|
224
|
+
* Present the given unsigned 32-bit integer as a potential unique item.
|
|
225
|
+
* @param datum The given integer.
|
|
226
|
+
*/
|
|
227
|
+
void update(uint32_t datum);
|
|
228
|
+
|
|
229
|
+
/**
|
|
230
|
+
* Present the given unsigned 16-bit integer as a potential unique item.
|
|
231
|
+
* @param datum The given integer.
|
|
232
|
+
*/
|
|
233
|
+
void update(uint16_t datum);
|
|
234
|
+
|
|
235
|
+
/**
|
|
236
|
+
* Present the given unsigned 8-bit integer as a potential unique item.
|
|
237
|
+
* @param datum The given integer.
|
|
238
|
+
*/
|
|
239
|
+
void update(uint8_t datum);
|
|
240
|
+
|
|
241
|
+
/**
|
|
242
|
+
* Present the given signed 64-bit integer as a potential unique item.
|
|
243
|
+
* @param datum The given integer.
|
|
244
|
+
*/
|
|
245
|
+
void update(int64_t datum);
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* Present the given signed 32-bit integer as a potential unique item.
|
|
249
|
+
* @param datum The given integer.
|
|
250
|
+
*/
|
|
251
|
+
void update(int32_t datum);
|
|
252
|
+
|
|
253
|
+
/**
|
|
254
|
+
* Present the given signed 16-bit integer as a potential unique item.
|
|
255
|
+
* @param datum The given integer.
|
|
256
|
+
*/
|
|
257
|
+
void update(int16_t datum);
|
|
258
|
+
|
|
259
|
+
/**
|
|
260
|
+
* Present the given signed 8-bit integer as a potential unique item.
|
|
261
|
+
* @param datum The given integer.
|
|
262
|
+
*/
|
|
263
|
+
void update(int8_t datum);
|
|
264
|
+
|
|
265
|
+
/**
|
|
266
|
+
* Present the given 64-bit floating point value as a potential unique item.
|
|
267
|
+
* @param datum The given double.
|
|
268
|
+
*/
|
|
269
|
+
void update(double datum);
|
|
270
|
+
|
|
271
|
+
/**
|
|
272
|
+
* Present the given 32-bit floating point value as a potential unique item.
|
|
273
|
+
* @param datum The given float.
|
|
274
|
+
*/
|
|
275
|
+
void update(float datum);
|
|
276
|
+
|
|
277
|
+
/**
|
|
278
|
+
* Present the given data array as a potential unique item.
|
|
279
|
+
* @param data The given array.
|
|
280
|
+
* @param length_bytes The array length in bytes.
|
|
281
|
+
*/
|
|
282
|
+
void update(const void* data, size_t length_bytes);
|
|
283
|
+
|
|
284
|
+
/**
|
|
285
|
+
* Returns the current cardinality estimate
|
|
286
|
+
* @return the cardinality estimate
|
|
287
|
+
*/
|
|
288
|
+
double get_estimate() const;
|
|
289
|
+
|
|
290
|
+
/**
|
|
291
|
+
* This is less accurate than the get_estimate() method
|
|
292
|
+
* and is automatically used when the sketch has gone through
|
|
293
|
+
* union operations where the more accurate HIP estimator cannot
|
|
294
|
+
* be used.
|
|
295
|
+
*
|
|
296
|
+
* This is made public only for error characterization software
|
|
297
|
+
* that exists in separate packages and is not intended for normal
|
|
298
|
+
* use.
|
|
299
|
+
* @return the composite cardinality estimate
|
|
300
|
+
*/
|
|
301
|
+
double get_composite_estimate() const;
|
|
302
|
+
|
|
303
|
+
/**
|
|
304
|
+
* Returns the approximate lower error bound given the specified
|
|
305
|
+
* number of standard deviations.
|
|
306
|
+
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
|
|
307
|
+
* @return The approximate lower bound.
|
|
308
|
+
*/
|
|
309
|
+
double get_lower_bound(int num_std_dev) const;
|
|
310
|
+
|
|
311
|
+
/**
|
|
312
|
+
* Returns the approximate upper error bound given the specified
|
|
313
|
+
* number of standard deviations.
|
|
314
|
+
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
|
|
315
|
+
* @return The approximate upper bound.
|
|
316
|
+
*/
|
|
317
|
+
double get_upper_bound(int num_std_dev) const;
|
|
318
|
+
|
|
319
|
+
/**
|
|
320
|
+
* Returns sketch's configured lg_k value.
|
|
321
|
+
* @return Configured lg_k value.
|
|
322
|
+
*/
|
|
323
|
+
int get_lg_config_k() const;
|
|
324
|
+
|
|
325
|
+
/**
|
|
326
|
+
* Returns the sketch's target HLL mode (from #target_hll_type).
|
|
327
|
+
* @return The sketch's target HLL mode.
|
|
328
|
+
*/
|
|
329
|
+
target_hll_type get_target_type() const;
|
|
330
|
+
|
|
331
|
+
/**
|
|
332
|
+
* Indicates if the sketch is currently stored compacted.
|
|
333
|
+
* @return True if the sketch is stored in compact form.
|
|
334
|
+
*/
|
|
335
|
+
bool is_compact() const;
|
|
336
|
+
|
|
337
|
+
/**
|
|
338
|
+
* Indicates if the sketch is currently empty.
|
|
339
|
+
* @return True if the sketch is empty.
|
|
340
|
+
*/
|
|
341
|
+
bool is_empty() const;
|
|
342
|
+
|
|
343
|
+
/**
|
|
344
|
+
* Returns the size of the sketch serialized in compact form.
|
|
345
|
+
* @return Size of the sketch serialized in compact form, in bytes.
|
|
346
|
+
*/
|
|
347
|
+
int get_compact_serialization_bytes() const;
|
|
348
|
+
|
|
349
|
+
/**
|
|
350
|
+
* Returns the size of the sketch serialized without compaction.
|
|
351
|
+
* @return Size of the sketch serialized without compaction, in bytes.
|
|
352
|
+
*/
|
|
353
|
+
int get_updatable_serialization_bytes() const;
|
|
354
|
+
|
|
355
|
+
/**
|
|
356
|
+
* Returns the maximum size in bytes that this sketch can grow to
|
|
357
|
+
* given lg_config_k. However, for the HLL_4 sketch type, this
|
|
358
|
+
* value can be exceeded in extremely rare cases. If exceeded, it
|
|
359
|
+
* will be larger by only a few percent.
|
|
360
|
+
*
|
|
361
|
+
* @param lg_k The Log2 of K for the target HLL sketch. This value must be
|
|
362
|
+
* between 4 and 21 inclusively.
|
|
363
|
+
* @param tgt_type the desired Hll type
|
|
364
|
+
* @return the maximum size in bytes that this sketch can grow to.
|
|
365
|
+
*/
|
|
366
|
+
static int get_max_updatable_serialization_bytes(int lg_k, target_hll_type tgt_type);
|
|
367
|
+
|
|
368
|
+
/**
|
|
369
|
+
* Gets the current (approximate) Relative Error (RE) asymptotic values given several
|
|
370
|
+
* parameters. This is used primarily for testing.
|
|
371
|
+
* @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound.
|
|
372
|
+
* @param unioned set true if the sketch is the result of a union operation.
|
|
373
|
+
* @param lg_config_k the configured value for the sketch.
|
|
374
|
+
* @param num_std_dev the given number of Standard Deviations. This must be an integer between
|
|
375
|
+
* 1 and 3, inclusive.
|
|
376
|
+
* @return the current (approximate) RelativeError
|
|
377
|
+
*/
|
|
378
|
+
static double get_rel_err(bool upper_bound, bool unioned,
|
|
379
|
+
int lg_config_k, int num_std_dev);
|
|
380
|
+
|
|
381
|
+
private:
|
|
382
|
+
explicit hll_sketch_alloc(HllSketchImpl<A>* that);
|
|
383
|
+
|
|
384
|
+
void coupon_update(int coupon);
|
|
385
|
+
|
|
386
|
+
std::string type_as_string() const;
|
|
387
|
+
std::string mode_as_string() const;
|
|
388
|
+
|
|
389
|
+
hll_mode get_current_mode() const;
|
|
390
|
+
int get_serialization_version() const;
|
|
391
|
+
bool is_out_of_order_flag() const;
|
|
392
|
+
bool is_estimation_mode() const;
|
|
393
|
+
|
|
394
|
+
// Allocator type obtained by rebinding the user-supplied allocator A so that
// it allocates hll_sketch_alloc objects.
typedef typename std::allocator_traits<A>::template rebind_alloc<hll_sketch_alloc> AllocHllSketch;
|
|
395
|
+
|
|
396
|
+
HllSketchImpl<A>* sketch_impl;
|
|
397
|
+
// Gives hll_union_alloc<A> access to this class's private members.
friend hll_union_alloc<A>;
|
|
398
|
+
};
|
|
399
|
+
|
|
400
|
+
/**
|
|
401
|
+
* This performs union operations for HLL sketches. This union operator is configured with a
|
|
402
|
+
* <i>lg_max_k</i> instead of the normal <i>lg_config_k</i>.
|
|
403
|
+
*
|
|
404
|
+
* <p>This union operator does permit the unioning of sketches with different values of
|
|
405
|
+
* <i>lg_config_k</i>. The user should be aware that the resulting accuracy of a sketch returned
|
|
406
|
+
* at the end of the unioning process will be a function of the smallest of <i>lg_max_k</i> and
|
|
407
|
+
* <i>lg_config_k</i> that the union operator has seen.
|
|
408
|
+
*
|
|
409
|
+
* <p>This union operator also permits unioning of any of the three different target hll_sketch
|
|
410
|
+
* types.
|
|
411
|
+
*
|
|
412
|
+
* <p>Although the API for this union operator parallels many of the methods of the
|
|
413
|
+
* <i>HllSketch</i>, the behavior of the union operator has some fundamental differences.
|
|
414
|
+
*
|
|
415
|
+
* <p>First, the user cannot specify the #target_hll_type as an input parameter.
|
|
416
|
+
* Instead, it is specified for the sketch returned with #get_result(target_hll_type).
|
|
417
|
+
*
|
|
418
|
+
* <p>Second, the internal effective value of log-base-2 of <i>k</i> for the union operation can
|
|
419
|
+
* change dynamically based on the smallest <i>lg_config_k</i> that the union operation has seen.
|
|
420
|
+
*
|
|
421
|
+
* author Jon Malkin
|
|
422
|
+
* author Lee Rhodes
|
|
423
|
+
* author Kevin Lang
|
|
424
|
+
*/
|
|
425
|
+
|
|
426
|
+
template<typename A = std::allocator<char> >
|
|
427
|
+
class hll_union_alloc {
|
|
428
|
+
public:
|
|
429
|
+
/**
|
|
430
|
+
* Construct an hll_union operator with the given maximum log2 of k.
|
|
431
|
+
* @param lg_max_k The maximum size, in log2, of k. The value must
|
|
432
|
+
* be between 7 and 21, inclusive.
|
|
433
|
+
*/
|
|
434
|
+
explicit hll_union_alloc(int lg_max_k);
|
|
435
|
+
|
|
436
|
+
/**
|
|
437
|
+
* Returns the current cardinality estimate
|
|
438
|
+
* @return the cardinality estimate
|
|
439
|
+
*/
|
|
440
|
+
double get_estimate() const;
|
|
441
|
+
|
|
442
|
+
/**
|
|
443
|
+
* This is less accurate than the get_estimate() method
|
|
444
|
+
* and is automatically used when the union has gone through
|
|
445
|
+
* union operations where the more accurate HIP estimator cannot
|
|
446
|
+
* be used.
|
|
447
|
+
*
|
|
448
|
+
* This is made public only for error characterization software
|
|
449
|
+
* that exists in separate packages and is not intended for normal
|
|
450
|
+
* use.
|
|
451
|
+
* @return the composite cardinality estimate
|
|
452
|
+
*/
|
|
453
|
+
double get_composite_estimate() const;
|
|
454
|
+
|
|
455
|
+
/**
|
|
456
|
+
* Returns the approximate lower error bound given the specified
|
|
457
|
+
* number of standard deviations.
|
|
458
|
+
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
|
|
459
|
+
* @return The approximate lower bound.
|
|
460
|
+
*/
|
|
461
|
+
double get_lower_bound(int num_std_dev) const;
|
|
462
|
+
|
|
463
|
+
/**
|
|
464
|
+
* Returns the approximate upper error bound given the specified
|
|
465
|
+
* number of standard deviations.
|
|
466
|
+
* @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}.
|
|
467
|
+
* @return The approximate upper bound.
|
|
468
|
+
*/
|
|
469
|
+
double get_upper_bound(int num_std_dev) const;
|
|
470
|
+
|
|
471
|
+
/**
|
|
472
|
+
* Returns the size of the union serialized in compact form.
|
|
473
|
+
* @return Size of the union serialized in compact form, in bytes.
|
|
474
|
+
*/
|
|
475
|
+
int get_compact_serialization_bytes() const;
|
|
476
|
+
|
|
477
|
+
/**
|
|
478
|
+
* Returns the size of the union serialized without compaction.
|
|
479
|
+
* @return Size of the union serialized without compaction, in bytes.
|
|
480
|
+
*/
|
|
481
|
+
int get_updatable_serialization_bytes() const;
|
|
482
|
+
|
|
483
|
+
/**
|
|
484
|
+
* Returns union's configured lg_k value.
|
|
485
|
+
* @return Configured lg_k value.
|
|
486
|
+
*/
|
|
487
|
+
int get_lg_config_k() const;
|
|
488
|
+
|
|
489
|
+
/**
|
|
490
|
+
* Returns the union's target HLL mode (from #target_hll_type).
|
|
491
|
+
* @return The union's target HLL mode.
|
|
492
|
+
*/
|
|
493
|
+
target_hll_type get_target_type() const;
|
|
494
|
+
|
|
495
|
+
/**
|
|
496
|
+
* Indicates if the union is currently stored compacted.
|
|
497
|
+
* @return True if the union is stored in compact form.
|
|
498
|
+
*/
|
|
499
|
+
bool is_compact() const;
|
|
500
|
+
|
|
501
|
+
/**
|
|
502
|
+
* Indicates if the union is currently empty.
|
|
503
|
+
* @return True if the union is empty.
|
|
504
|
+
*/
|
|
505
|
+
bool is_empty() const;
|
|
506
|
+
|
|
507
|
+
/**
|
|
508
|
+
* Resets the union to an empty state in coupon collection mode.
|
|
509
|
+
* Does not re-use existing internal objects.
|
|
510
|
+
*/
|
|
511
|
+
void reset();
|
|
512
|
+
|
|
513
|
+
/**
|
|
514
|
+
* Returns the result of this union operator with the specified
|
|
515
|
+
* #target_hll_type.
|
|
516
|
+
* @param tgt_type The target_hll_type enum value of the desired result (Default: HLL_4)
|
|
517
|
+
* @return The result of this union with the specified target_hll_type
|
|
518
|
+
*/
|
|
519
|
+
hll_sketch_alloc<A> get_result(target_hll_type tgt_type = HLL_4) const;
|
|
520
|
+
|
|
521
|
+
/**
|
|
522
|
+
* Update this union operator with the given sketch.
|
|
523
|
+
* @param sketch The given sketch.
|
|
524
|
+
*/
|
|
525
|
+
void update(const hll_sketch_alloc<A>& sketch);
|
|
526
|
+
|
|
527
|
+
/**
|
|
528
|
+
* Update this union operator with the given temporary sketch.
|
|
529
|
+
* @param sketch The given temporary sketch.
|
|
530
|
+
*/
|
|
531
|
+
void update(hll_sketch_alloc<A>&& sketch);
|
|
532
|
+
|
|
533
|
+
/**
|
|
534
|
+
* Present the given std::string as a potential unique item.
|
|
535
|
+
* The string is converted to a byte array using UTF8 encoding.
|
|
536
|
+
* If the string is empty, no update attempt is made and the method returns.
|
|
537
|
+
* @param datum The given string.
|
|
538
|
+
*/
|
|
539
|
+
void update(const std::string& datum);
|
|
540
|
+
|
|
541
|
+
/**
|
|
542
|
+
* Present the given unsigned 64-bit integer as a potential unique item.
|
|
543
|
+
* @param datum The given integer.
|
|
544
|
+
*/
|
|
545
|
+
void update(uint64_t datum);
|
|
546
|
+
|
|
547
|
+
/**
|
|
548
|
+
* Present the given unsigned 32-bit integer as a potential unique item.
|
|
549
|
+
* @param datum The given integer.
|
|
550
|
+
*/
|
|
551
|
+
void update(uint32_t datum);
|
|
552
|
+
|
|
553
|
+
/**
|
|
554
|
+
* Present the given unsigned 16-bit integer as a potential unique item.
|
|
555
|
+
* @param datum The given integer.
|
|
556
|
+
*/
|
|
557
|
+
void update(uint16_t datum);
|
|
558
|
+
|
|
559
|
+
/**
|
|
560
|
+
* Present the given unsigned 8-bit integer as a potential unique item.
|
|
561
|
+
* @param datum The given integer.
|
|
562
|
+
*/
|
|
563
|
+
void update(uint8_t datum);
|
|
564
|
+
|
|
565
|
+
/**
|
|
566
|
+
* Present the given signed 64-bit integer as a potential unique item.
|
|
567
|
+
* @param datum The given integer.
|
|
568
|
+
*/
|
|
569
|
+
void update(int64_t datum);
|
|
570
|
+
|
|
571
|
+
/**
|
|
572
|
+
* Present the given signed 32-bit integer as a potential unique item.
|
|
573
|
+
* @param datum The given integer.
|
|
574
|
+
*/
|
|
575
|
+
void update(int32_t datum);
|
|
576
|
+
|
|
577
|
+
/**
|
|
578
|
+
* Present the given signed 16-bit integer as a potential unique item.
|
|
579
|
+
* @param datum The given integer.
|
|
580
|
+
*/
|
|
581
|
+
void update(int16_t datum);
|
|
582
|
+
|
|
583
|
+
/**
|
|
584
|
+
* Present the given signed 8-bit integer as a potential unique item.
|
|
585
|
+
* @param datum The given integer.
|
|
586
|
+
*/
|
|
587
|
+
void update(int8_t datum);
|
|
588
|
+
|
|
589
|
+
/**
|
|
590
|
+
* Present the given 64-bit floating point value as a potential unique item.
|
|
591
|
+
* @param datum The given double.
|
|
592
|
+
*/
|
|
593
|
+
void update(double datum);
|
|
594
|
+
|
|
595
|
+
/**
|
|
596
|
+
* Present the given 32-bit floating point value as a potential unique item.
|
|
597
|
+
* @param datum The given float.
|
|
598
|
+
*/
|
|
599
|
+
void update(float datum);
|
|
600
|
+
|
|
601
|
+
/**
|
|
602
|
+
* Present the given data array as a potential unique item.
|
|
603
|
+
* @param data The given array.
|
|
604
|
+
* @param length_bytes The array length in bytes.
|
|
605
|
+
*/
|
|
606
|
+
void update(const void* data, size_t length_bytes);
|
|
607
|
+
|
|
608
|
+
/**
|
|
609
|
+
* Returns the maximum size in bytes that this union operator can grow to given a lg_k.
|
|
610
|
+
*
|
|
611
|
+
* @param lg_k The maximum Log2 of k for this union operator. This value must be
|
|
612
|
+
* between 4 and 21 inclusively.
|
|
613
|
+
* @return the maximum size in bytes that this union operator can grow to.
|
|
614
|
+
*/
|
|
615
|
+
static int get_max_serialization_bytes(int lg_k);
|
|
616
|
+
|
|
617
|
+
/**
|
|
618
|
+
* Gets the current (approximate) Relative Error (RE) asymptotic values given several
|
|
619
|
+
* parameters. This is used primarily for testing.
|
|
620
|
+
* @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound.
|
|
621
|
+
* @param unioned set true if the sketch is the result of a union operation.
|
|
622
|
+
* @param lg_config_k the configured value for the sketch.
|
|
623
|
+
* @param num_std_dev the given number of Standard Deviations. This must be an integer between
|
|
624
|
+
* 1 and 3, inclusive.
|
|
625
|
+
* @return the current (approximate) RelativeError
|
|
626
|
+
*/
|
|
627
|
+
static double get_rel_err(bool upper_bound, bool unioned,
|
|
628
|
+
int lg_config_k, int num_std_dev);
|
|
629
|
+
|
|
630
|
+
private:
|
|
631
|
+
|
|
632
|
+
/**
|
|
633
|
+
* Union the given source and destination sketches. This method examines the state of
|
|
634
|
+
* the current internal gadget and the incoming sketch and determines the optimal way to
|
|
635
|
+
* perform the union. This may involve swapping, down-sampling, transforming, and / or
|
|
636
|
+
* copying one of the arguments and may completely replace the internals of the union.
|
|
637
|
+
*
|
|
638
|
+
* @param sketch the given incoming sketch, which may not be modified.
|
|
639
|
+
* @param lg_max_k the maximum value of log2 K for this union.
|
|
640
|
+
*/
|
|
641
|
+
inline void union_impl(const hll_sketch_alloc<A>& sketch, int lg_max_k);
|
|
642
|
+
|
|
643
|
+
static HllSketchImpl<A>* copy_or_downsample(const HllSketchImpl<A>* src_impl, int tgt_lg_k);
|
|
644
|
+
|
|
645
|
+
void coupon_update(int coupon);
|
|
646
|
+
|
|
647
|
+
hll_mode get_current_mode() const;
|
|
648
|
+
int get_serialization_version() const;
|
|
649
|
+
bool is_out_of_order_flag() const;
|
|
650
|
+
bool is_estimation_mode() const;
|
|
651
|
+
|
|
652
|
+
// calls couponUpdate on sketch, freeing the old sketch upon changes in hll_mode
|
|
653
|
+
static HllSketchImpl<A>* leak_free_coupon_update(HllSketchImpl<A>* impl, int coupon);
|
|
654
|
+
|
|
655
|
+
// Maximum log2 of k for this union operator.
// NOTE(review): presumably the lg_max_k value passed to the constructor — confirm in the implementation.
int lg_max_k;
|
|
656
|
+
// Internal sketch owned by the union (the "internal gadget" mentioned in the union_impl documentation).
// NOTE(review): presumably holds the accumulated union state — confirm in the implementation.
hll_sketch_alloc<A> gadget;
|
|
657
|
+
};
|
|
658
|
+
|
|
659
|
+
/// convenience alias for hll_sketch with default allocator
|
|
660
|
+
typedef hll_sketch_alloc<> hll_sketch;
|
|
661
|
+
|
|
662
|
+
/// convenience alias for hll_union with default allocator
|
|
663
|
+
typedef hll_union_alloc<> hll_union;
|
|
664
|
+
|
|
665
|
+
} // namespace datasketches
|
|
666
|
+
|
|
667
|
+
#include "hll.private.hpp"
|
|
668
|
+
|
|
669
|
+
#endif // _HLL_HPP_
|