datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,484 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef FREQUENT_ITEMS_SKETCH_IMPL_HPP_
|
|
21
|
+
#define FREQUENT_ITEMS_SKETCH_IMPL_HPP_
|
|
22
|
+
|
|
23
|
+
#include <cstring>
|
|
24
|
+
#include <limits>
|
|
25
|
+
#include <sstream>
|
|
26
|
+
|
|
27
|
+
#include "memory_operations.hpp"
|
|
28
|
+
|
|
29
|
+
namespace datasketches {
|
|
30
|
+
|
|
31
|
+
// clang++ seems to require this declaration for CMAKE_BUILD_TYPE='Debug"
|
|
32
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
33
|
+
const uint8_t frequent_items_sketch<T, W, H, E, S, A>::LG_MIN_MAP_SIZE;
|
|
34
|
+
|
|
35
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
36
|
+
frequent_items_sketch<T, W, H, E, S, A>::frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size):
|
|
37
|
+
total_weight(0),
|
|
38
|
+
offset(0),
|
|
39
|
+
map(std::max(lg_start_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE), std::max(lg_max_map_size, frequent_items_sketch::LG_MIN_MAP_SIZE))
|
|
40
|
+
{
|
|
41
|
+
if (lg_start_map_size > lg_max_map_size) throw std::invalid_argument("starting size must not be greater than maximum size");
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
45
|
+
void frequent_items_sketch<T, W, H, E, S, A>::update(const T& item, W weight) {
|
|
46
|
+
check_weight(weight);
|
|
47
|
+
if (weight == 0) return;
|
|
48
|
+
total_weight += weight;
|
|
49
|
+
offset += map.adjust_or_insert(item, weight);
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
53
|
+
void frequent_items_sketch<T, W, H, E, S, A>::update(T&& item, W weight) {
|
|
54
|
+
check_weight(weight);
|
|
55
|
+
if (weight == 0) return;
|
|
56
|
+
total_weight += weight;
|
|
57
|
+
offset += map.adjust_or_insert(std::move(item), weight);
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
61
|
+
void frequent_items_sketch<T, W, H, E, S, A>::merge(const frequent_items_sketch& other) {
|
|
62
|
+
if (other.is_empty()) return;
|
|
63
|
+
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
|
64
|
+
for (auto &it: other.map) {
|
|
65
|
+
update(it.first, it.second);
|
|
66
|
+
}
|
|
67
|
+
offset += other.offset;
|
|
68
|
+
total_weight = merged_total_weight;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
72
|
+
void frequent_items_sketch<T, W, H, E, S, A>::merge(frequent_items_sketch&& other) {
|
|
73
|
+
if (other.is_empty()) return;
|
|
74
|
+
const W merged_total_weight = total_weight + other.get_total_weight(); // for correction at the end
|
|
75
|
+
for (auto &it: other.map) {
|
|
76
|
+
update(std::move(it.first), it.second);
|
|
77
|
+
}
|
|
78
|
+
offset += other.offset;
|
|
79
|
+
total_weight = merged_total_weight;
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
83
|
+
bool frequent_items_sketch<T, W, H, E, S, A>::is_empty() const {
|
|
84
|
+
return map.get_num_active() == 0;
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
88
|
+
uint32_t frequent_items_sketch<T, W, H, E, S, A>::get_num_active_items() const {
|
|
89
|
+
return map.get_num_active();
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
93
|
+
W frequent_items_sketch<T, W, H, E, S, A>::get_total_weight() const {
|
|
94
|
+
return total_weight;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
98
|
+
W frequent_items_sketch<T, W, H, E, S, A>::get_estimate(const T& item) const {
|
|
99
|
+
// if item is tracked estimate = weight + offset, otherwise 0
|
|
100
|
+
const W weight = map.get(item);
|
|
101
|
+
if (weight > 0) return weight + offset;
|
|
102
|
+
return 0;
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
106
|
+
W frequent_items_sketch<T, W, H, E, S, A>::get_lower_bound(const T& item) const {
|
|
107
|
+
return map.get(item);
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
111
|
+
W frequent_items_sketch<T, W, H, E, S, A>::get_upper_bound(const T& item) const {
|
|
112
|
+
return map.get(item) + offset;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
116
|
+
W frequent_items_sketch<T, W, H, E, S, A>::get_maximum_error() const {
|
|
117
|
+
return offset;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
121
|
+
double frequent_items_sketch<T, W, H, E, S, A>::get_epsilon() const {
|
|
122
|
+
return EPSILON_FACTOR / (1 << map.get_lg_max_size());
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
126
|
+
double frequent_items_sketch<T, W, H, E, S, A>::get_epsilon(uint8_t lg_max_map_size) {
|
|
127
|
+
return EPSILON_FACTOR / (1 << lg_max_map_size);
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
131
|
+
double frequent_items_sketch<T, W, H, E, S, A>::get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight) {
|
|
132
|
+
return get_epsilon(lg_max_map_size) * estimated_total_weight;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
137
|
+
typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
|
|
138
|
+
frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type) const {
|
|
139
|
+
return get_frequent_items(err_type, get_maximum_error());
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
143
|
+
typename frequent_items_sketch<T, W, H, E, S, A>::vector_row
|
|
144
|
+
frequent_items_sketch<T, W, H, E, S, A>::get_frequent_items(frequent_items_error_type err_type, W threshold) const {
|
|
145
|
+
vector_row items;
|
|
146
|
+
for (auto &it: map) {
|
|
147
|
+
const W lb = it.second;
|
|
148
|
+
const W ub = it.second + offset;
|
|
149
|
+
if ((err_type == NO_FALSE_NEGATIVES && ub > threshold) || (err_type == NO_FALSE_POSITIVES && lb > threshold)) {
|
|
150
|
+
items.push_back(row(&it.first, it.second, offset));
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
// sort by estimate in descending order
|
|
154
|
+
std::sort(items.begin(), items.end(), [](row a, row b){ return a.get_estimate() > b.get_estimate(); });
|
|
155
|
+
return items;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
159
|
+
void frequent_items_sketch<T, W, H, E, S, A>::serialize(std::ostream& os) const {
|
|
160
|
+
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
|
|
161
|
+
os.write((char*)&preamble_longs, sizeof(preamble_longs));
|
|
162
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
|
163
|
+
os.write((char*)&serial_version, sizeof(serial_version));
|
|
164
|
+
const uint8_t family = FAMILY_ID;
|
|
165
|
+
os.write((char*)&family, sizeof(family));
|
|
166
|
+
const uint8_t lg_max_size = map.get_lg_max_size();
|
|
167
|
+
os.write((char*)&lg_max_size, sizeof(lg_max_size));
|
|
168
|
+
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
|
169
|
+
os.write((char*)&lg_cur_size, sizeof(lg_cur_size));
|
|
170
|
+
const uint8_t flags_byte(
|
|
171
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
172
|
+
);
|
|
173
|
+
os.write((char*)&flags_byte, sizeof(flags_byte));
|
|
174
|
+
const uint16_t unused16 = 0;
|
|
175
|
+
os.write((char*)&unused16, sizeof(unused16));
|
|
176
|
+
if (!is_empty()) {
|
|
177
|
+
const uint32_t num_items = map.get_num_active();
|
|
178
|
+
os.write((char*)&num_items, sizeof(num_items));
|
|
179
|
+
const uint32_t unused32 = 0;
|
|
180
|
+
os.write((char*)&unused32, sizeof(unused32));
|
|
181
|
+
os.write((char*)&total_weight, sizeof(total_weight));
|
|
182
|
+
os.write((char*)&offset, sizeof(offset));
|
|
183
|
+
|
|
184
|
+
// copy active items and their weights to use batch serialization
|
|
185
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<W> AllocW;
|
|
186
|
+
W* weights = AllocW().allocate(num_items);
|
|
187
|
+
T* items = A().allocate(num_items);
|
|
188
|
+
uint32_t i = 0;
|
|
189
|
+
for (auto &it: map) {
|
|
190
|
+
new (&items[i]) T(it.first);
|
|
191
|
+
weights[i++] = it.second;
|
|
192
|
+
}
|
|
193
|
+
os.write((char*)weights, sizeof(W) * num_items);
|
|
194
|
+
AllocW().deallocate(weights, num_items);
|
|
195
|
+
S().serialize(os, items, num_items);
|
|
196
|
+
for (unsigned i = 0; i < num_items; i++) items[i].~T();
|
|
197
|
+
A().deallocate(items, num_items);
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
202
|
+
size_t frequent_items_sketch<T, W, H, E, S, A>::get_serialized_size_bytes() const {
|
|
203
|
+
if (is_empty()) return PREAMBLE_LONGS_EMPTY * sizeof(uint64_t);
|
|
204
|
+
size_t size = PREAMBLE_LONGS_NONEMPTY * sizeof(uint64_t) + map.get_num_active() * sizeof(W);
|
|
205
|
+
for (auto &it: map) size += S().size_of_item(it.first);
|
|
206
|
+
return size;
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
210
|
+
vector_u8<A> frequent_items_sketch<T, W, H, E, S, A>::serialize(unsigned header_size_bytes) const {
|
|
211
|
+
const size_t size = header_size_bytes + get_serialized_size_bytes();
|
|
212
|
+
vector_u8<A> bytes(size);
|
|
213
|
+
uint8_t* ptr = bytes.data() + header_size_bytes;
|
|
214
|
+
uint8_t* end_ptr = ptr + size;
|
|
215
|
+
|
|
216
|
+
const uint8_t preamble_longs = is_empty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NONEMPTY;
|
|
217
|
+
ptr += copy_to_mem(&preamble_longs, ptr, sizeof(uint8_t));
|
|
218
|
+
const uint8_t serial_version = SERIAL_VERSION;
|
|
219
|
+
ptr += copy_to_mem(&serial_version, ptr, sizeof(uint8_t));
|
|
220
|
+
const uint8_t family = FAMILY_ID;
|
|
221
|
+
ptr += copy_to_mem(&family, ptr, sizeof(uint8_t));
|
|
222
|
+
const uint8_t lg_max_size = map.get_lg_max_size();
|
|
223
|
+
ptr += copy_to_mem(&lg_max_size, ptr, sizeof(uint8_t));
|
|
224
|
+
const uint8_t lg_cur_size = map.get_lg_cur_size();
|
|
225
|
+
ptr += copy_to_mem(&lg_cur_size, ptr, sizeof(uint8_t));
|
|
226
|
+
const uint8_t flags_byte(
|
|
227
|
+
(is_empty() ? 1 << flags::IS_EMPTY : 0)
|
|
228
|
+
);
|
|
229
|
+
ptr += copy_to_mem(&flags_byte, ptr, sizeof(uint8_t));
|
|
230
|
+
const uint16_t unused16 = 0;
|
|
231
|
+
ptr += copy_to_mem(&unused16, ptr, sizeof(uint16_t));
|
|
232
|
+
if (!is_empty()) {
|
|
233
|
+
const uint32_t num_items = map.get_num_active();
|
|
234
|
+
ptr += copy_to_mem(&num_items, ptr, sizeof(uint32_t));
|
|
235
|
+
const uint32_t unused32 = 0;
|
|
236
|
+
ptr += copy_to_mem(&unused32, ptr, sizeof(uint32_t));
|
|
237
|
+
ptr += copy_to_mem(&total_weight, ptr, sizeof(total_weight));
|
|
238
|
+
ptr += copy_to_mem(&offset, ptr, sizeof(offset));
|
|
239
|
+
|
|
240
|
+
// copy active items and their weights to use batch serialization
|
|
241
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<W> AllocW;
|
|
242
|
+
W* weights = AllocW().allocate(num_items);
|
|
243
|
+
T* items = A().allocate(num_items);
|
|
244
|
+
uint32_t i = 0;
|
|
245
|
+
for (auto &it: map) {
|
|
246
|
+
new (&items[i]) T(it.first);
|
|
247
|
+
weights[i++] = it.second;
|
|
248
|
+
}
|
|
249
|
+
ptr += copy_to_mem(weights, ptr, sizeof(W) * num_items);
|
|
250
|
+
AllocW().deallocate(weights, num_items);
|
|
251
|
+
const size_t bytes_remaining = end_ptr - ptr;
|
|
252
|
+
ptr += S().serialize(ptr, bytes_remaining, items, num_items);
|
|
253
|
+
for (unsigned i = 0; i < num_items; i++) items[i].~T();
|
|
254
|
+
A().deallocate(items, num_items);
|
|
255
|
+
}
|
|
256
|
+
return bytes;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
260
|
+
class frequent_items_sketch<T, W, H, E, S, A>::items_deleter {
|
|
261
|
+
public:
|
|
262
|
+
items_deleter(uint32_t num, bool destroy): num(num), destroy(destroy) {}
|
|
263
|
+
void set_destroy(bool destroy) { this->destroy = destroy; }
|
|
264
|
+
void operator() (T* ptr) const {
|
|
265
|
+
if (ptr != nullptr) {
|
|
266
|
+
if (destroy) {
|
|
267
|
+
for (uint32_t i = 0; i < num; ++i) ptr[i].~T();
|
|
268
|
+
}
|
|
269
|
+
A().deallocate(ptr, num);
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
private:
|
|
273
|
+
uint32_t num;
|
|
274
|
+
bool destroy;
|
|
275
|
+
};
|
|
276
|
+
|
|
277
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
278
|
+
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(std::istream& is) {
|
|
279
|
+
uint8_t preamble_longs;
|
|
280
|
+
is.read((char*)&preamble_longs, sizeof(preamble_longs));
|
|
281
|
+
uint8_t serial_version;
|
|
282
|
+
is.read((char*)&serial_version, sizeof(serial_version));
|
|
283
|
+
uint8_t family_id;
|
|
284
|
+
is.read((char*)&family_id, sizeof(family_id));
|
|
285
|
+
uint8_t lg_max_size;
|
|
286
|
+
is.read((char*)&lg_max_size, sizeof(lg_max_size));
|
|
287
|
+
uint8_t lg_cur_size;
|
|
288
|
+
is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
|
|
289
|
+
uint8_t flags_byte;
|
|
290
|
+
is.read((char*)&flags_byte, sizeof(flags_byte));
|
|
291
|
+
uint16_t unused16;
|
|
292
|
+
is.read((char*)&unused16, sizeof(unused16));
|
|
293
|
+
|
|
294
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
295
|
+
|
|
296
|
+
check_preamble_longs(preamble_longs, is_empty);
|
|
297
|
+
check_serial_version(serial_version);
|
|
298
|
+
check_family_id(family_id);
|
|
299
|
+
check_size(lg_cur_size, lg_max_size);
|
|
300
|
+
|
|
301
|
+
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size);
|
|
302
|
+
if (!is_empty) {
|
|
303
|
+
uint32_t num_items;
|
|
304
|
+
is.read((char*)&num_items, sizeof(num_items));
|
|
305
|
+
uint32_t unused32;
|
|
306
|
+
is.read((char*)&unused32, sizeof(unused32));
|
|
307
|
+
W total_weight;
|
|
308
|
+
is.read((char*)&total_weight, sizeof(total_weight));
|
|
309
|
+
W offset;
|
|
310
|
+
is.read((char*)&offset, sizeof(offset));
|
|
311
|
+
|
|
312
|
+
// batch deserialization with intermediate array of items and weights
|
|
313
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<W> AllocW;
|
|
314
|
+
std::vector<W, AllocW> weights(num_items);
|
|
315
|
+
is.read((char*)weights.data(), sizeof(W) * num_items);
|
|
316
|
+
std::unique_ptr<T, items_deleter> items(A().allocate(num_items), items_deleter(num_items, false));
|
|
317
|
+
S().deserialize(is, items.get(), num_items);
|
|
318
|
+
items.get_deleter().set_destroy(true); // serde did not throw, so the items must be constructed
|
|
319
|
+
for (uint32_t i = 0; i < num_items; i++) {
|
|
320
|
+
sketch.update(std::move(items.get()[i]), weights[i]);
|
|
321
|
+
}
|
|
322
|
+
sketch.total_weight = total_weight;
|
|
323
|
+
sketch.offset = offset;
|
|
324
|
+
}
|
|
325
|
+
if (!is.good())
|
|
326
|
+
throw std::runtime_error("error reading from std::istream");
|
|
327
|
+
return sketch;
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
331
|
+
frequent_items_sketch<T, W, H, E, S, A> frequent_items_sketch<T, W, H, E, S, A>::deserialize(const void* bytes, size_t size) {
|
|
332
|
+
ensure_minimum_memory(size, 8);
|
|
333
|
+
const char* ptr = static_cast<const char*>(bytes);
|
|
334
|
+
const char* base = static_cast<const char*>(bytes);
|
|
335
|
+
uint8_t preamble_longs;
|
|
336
|
+
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(uint8_t));
|
|
337
|
+
uint8_t serial_version;
|
|
338
|
+
ptr += copy_from_mem(ptr, &serial_version, sizeof(uint8_t));
|
|
339
|
+
uint8_t family_id;
|
|
340
|
+
ptr += copy_from_mem(ptr, &family_id, sizeof(uint8_t));
|
|
341
|
+
uint8_t lg_max_size;
|
|
342
|
+
ptr += copy_from_mem(ptr, &lg_max_size, sizeof(uint8_t));
|
|
343
|
+
uint8_t lg_cur_size;
|
|
344
|
+
ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(uint8_t));
|
|
345
|
+
uint8_t flags_byte;
|
|
346
|
+
ptr += copy_from_mem(ptr, &flags_byte, sizeof(uint8_t));
|
|
347
|
+
uint16_t unused16;
|
|
348
|
+
ptr += copy_from_mem(ptr, &unused16, sizeof(uint16_t));
|
|
349
|
+
|
|
350
|
+
const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
|
|
351
|
+
|
|
352
|
+
check_preamble_longs(preamble_longs, is_empty);
|
|
353
|
+
check_serial_version(serial_version);
|
|
354
|
+
check_family_id(family_id);
|
|
355
|
+
check_size(lg_cur_size, lg_max_size);
|
|
356
|
+
ensure_minimum_memory(size, 1 << preamble_longs);
|
|
357
|
+
|
|
358
|
+
frequent_items_sketch<T, W, H, E, S, A> sketch(lg_max_size, lg_cur_size);
|
|
359
|
+
if (!is_empty) {
|
|
360
|
+
uint32_t num_items;
|
|
361
|
+
ptr += copy_from_mem(ptr, &num_items, sizeof(uint32_t));
|
|
362
|
+
uint32_t unused32;
|
|
363
|
+
ptr += copy_from_mem(ptr, &unused32, sizeof(uint32_t));
|
|
364
|
+
W total_weight;
|
|
365
|
+
ptr += copy_from_mem(ptr, &total_weight, sizeof(total_weight));
|
|
366
|
+
W offset;
|
|
367
|
+
ptr += copy_from_mem(ptr, &offset, sizeof(offset));
|
|
368
|
+
|
|
369
|
+
ensure_minimum_memory(size, ptr - base + (sizeof(W) * num_items));
|
|
370
|
+
// batch deserialization with intermediate array of items and weights
|
|
371
|
+
typedef typename std::allocator_traits<A>::template rebind_alloc<W> AllocW;
|
|
372
|
+
std::vector<W, AllocW> weights(num_items);
|
|
373
|
+
ptr += copy_from_mem(ptr, weights.data(), sizeof(W) * num_items);
|
|
374
|
+
std::unique_ptr<T, items_deleter> items(A().allocate(num_items), items_deleter(num_items, false));
|
|
375
|
+
const size_t bytes_remaining = size - (ptr - base);
|
|
376
|
+
ptr += S().deserialize(ptr, bytes_remaining, items.get(), num_items);
|
|
377
|
+
items.get_deleter().set_destroy(true); // serde did not throw, so the items must be constructed
|
|
378
|
+
for (uint32_t i = 0; i < num_items; i++) {
|
|
379
|
+
sketch.update(std::move(items.get()[i]), weights[i]);
|
|
380
|
+
}
|
|
381
|
+
|
|
382
|
+
sketch.total_weight = total_weight;
|
|
383
|
+
sketch.offset = offset;
|
|
384
|
+
}
|
|
385
|
+
return sketch;
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
389
|
+
void frequent_items_sketch<T, W, H, E, S, A>::check_preamble_longs(uint8_t preamble_longs, bool is_empty) {
|
|
390
|
+
if (is_empty) {
|
|
391
|
+
if (preamble_longs != PREAMBLE_LONGS_EMPTY) {
|
|
392
|
+
throw std::invalid_argument("Possible corruption: preamble longs of an empty sketch must be " + std::to_string(PREAMBLE_LONGS_EMPTY) + ": " + std::to_string(preamble_longs));
|
|
393
|
+
}
|
|
394
|
+
} else {
|
|
395
|
+
if (preamble_longs != PREAMBLE_LONGS_NONEMPTY) {
|
|
396
|
+
throw std::invalid_argument("Possible corruption: preamble longs of an non-empty sketch must be " + std::to_string(PREAMBLE_LONGS_NONEMPTY) + ": " + std::to_string(preamble_longs));
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
402
|
+
void frequent_items_sketch<T, W, H, E, S, A>::check_serial_version(uint8_t serial_version) {
|
|
403
|
+
if (serial_version != SERIAL_VERSION) {
|
|
404
|
+
throw std::invalid_argument("Possible corruption: serial version must be " + std::to_string(SERIAL_VERSION) + ": " + std::to_string(serial_version));
|
|
405
|
+
}
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
409
|
+
void frequent_items_sketch<T, W, H, E, S, A>::check_family_id(uint8_t family_id) {
|
|
410
|
+
if (family_id != FAMILY_ID) {
|
|
411
|
+
throw std::invalid_argument("Possible corruption: family ID must be " + std::to_string(FAMILY_ID) + ": " + std::to_string(family_id));
|
|
412
|
+
}
|
|
413
|
+
}
|
|
414
|
+
|
|
415
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
416
|
+
void frequent_items_sketch<T, W, H, E, S, A>::check_size(uint8_t lg_cur_size, uint8_t lg_max_size) {
|
|
417
|
+
if (lg_cur_size > lg_max_size) {
|
|
418
|
+
throw std::invalid_argument("Possible corruption: expected lg_cur_size <= lg_max_size: " + std::to_string(lg_cur_size) + " <= " + std::to_string(lg_max_size));
|
|
419
|
+
}
|
|
420
|
+
if (lg_cur_size < LG_MIN_MAP_SIZE) {
|
|
421
|
+
throw std::invalid_argument("Possible corruption: lg_cur_size must not be less than " + std::to_string(LG_MIN_MAP_SIZE) + ": " + std::to_string(lg_cur_size));
|
|
422
|
+
}
|
|
423
|
+
}
|
|
424
|
+
|
|
425
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
426
|
+
string<A> frequent_items_sketch<T, W, H, E, S, A>::to_string(bool print_items) const {
|
|
427
|
+
std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
|
|
428
|
+
os << "### Frequent items sketch summary:" << std::endl;
|
|
429
|
+
os << " lg cur map size : " << (int) map.get_lg_cur_size() << std::endl;
|
|
430
|
+
os << " lg max map size : " << (int) map.get_lg_max_size() << std::endl;
|
|
431
|
+
os << " num active items : " << get_num_active_items() << std::endl;
|
|
432
|
+
os << " total weight : " << get_total_weight() << std::endl;
|
|
433
|
+
os << " max error : " << get_maximum_error() << std::endl;
|
|
434
|
+
os << "### End sketch summary" << std::endl;
|
|
435
|
+
if (print_items) {
|
|
436
|
+
vector_row items;
|
|
437
|
+
for (auto &it: map) {
|
|
438
|
+
items.push_back(row(&it.first, it.second, offset));
|
|
439
|
+
}
|
|
440
|
+
// sort by estimate in descending order
|
|
441
|
+
std::sort(items.begin(), items.end(), [](row a, row b){ return a.get_estimate() > b.get_estimate(); });
|
|
442
|
+
os << "### Items in descending order by estimate" << std::endl;
|
|
443
|
+
os << " item, estimate, lower bound, upper bound" << std::endl;
|
|
444
|
+
for (auto &it: items) {
|
|
445
|
+
os << " " << it.get_item() << ", " << it.get_estimate() << ", "
|
|
446
|
+
<< it.get_lower_bound() << ", " << it.get_upper_bound() << std::endl;
|
|
447
|
+
}
|
|
448
|
+
os << "### End items" << std::endl;
|
|
449
|
+
}
|
|
450
|
+
return os.str();
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
// version for integral signed type
|
|
454
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
455
|
+
template<typename WW, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type>
|
|
456
|
+
void frequent_items_sketch<T, W, H, E, S, A>::check_weight(WW weight) {
|
|
457
|
+
if (weight < 0) {
|
|
458
|
+
throw std::invalid_argument("weight must be non-negative");
|
|
459
|
+
}
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
// version for integral unsigned type - no-op
|
|
463
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
464
|
+
template<typename WW, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type>
|
|
465
|
+
void frequent_items_sketch<T, W, H, E, S, A>::check_weight(WW) {}
|
|
466
|
+
|
|
467
|
+
// version for floating point type
|
|
468
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
469
|
+
template<typename WW, typename std::enable_if<std::is_floating_point<WW>::value, int>::type>
|
|
470
|
+
void frequent_items_sketch<T, W, H, E, S, A>::check_weight(WW weight) {
|
|
471
|
+
if (weight < 0) {
|
|
472
|
+
throw std::invalid_argument("weight must be non-negative");
|
|
473
|
+
}
|
|
474
|
+
if (std::isnan(weight)) {
|
|
475
|
+
throw std::invalid_argument("weight must be a valid number");
|
|
476
|
+
}
|
|
477
|
+
if (std::isinf(weight)) {
|
|
478
|
+
throw std::invalid_argument("weight must be finite");
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
#endif
|