datasketches 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE +310 -0
- data/NOTICE +11 -0
- data/README.md +126 -0
- data/ext/datasketches/cpc_wrapper.cpp +50 -0
- data/ext/datasketches/ext.cpp +12 -0
- data/ext/datasketches/extconf.rb +11 -0
- data/ext/datasketches/hll_wrapper.cpp +69 -0
- data/lib/datasketches.rb +9 -0
- data/lib/datasketches/version.rb +3 -0
- data/vendor/datasketches-cpp/CMakeLists.txt +126 -0
- data/vendor/datasketches-cpp/LICENSE +311 -0
- data/vendor/datasketches-cpp/MANIFEST.in +19 -0
- data/vendor/datasketches-cpp/NOTICE +11 -0
- data/vendor/datasketches-cpp/README.md +42 -0
- data/vendor/datasketches-cpp/common/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/common/include/MurmurHash3.h +173 -0
- data/vendor/datasketches-cpp/common/include/binomial_bounds.hpp +458 -0
- data/vendor/datasketches-cpp/common/include/bounds_binomial_proportions.hpp +291 -0
- data/vendor/datasketches-cpp/common/include/ceiling_power_of_2.hpp +41 -0
- data/vendor/datasketches-cpp/common/include/common_defs.hpp +51 -0
- data/vendor/datasketches-cpp/common/include/conditional_back_inserter.hpp +68 -0
- data/vendor/datasketches-cpp/common/include/conditional_forward.hpp +70 -0
- data/vendor/datasketches-cpp/common/include/count_zeros.hpp +114 -0
- data/vendor/datasketches-cpp/common/include/inv_pow2_table.hpp +107 -0
- data/vendor/datasketches-cpp/common/include/memory_operations.hpp +57 -0
- data/vendor/datasketches-cpp/common/include/serde.hpp +196 -0
- data/vendor/datasketches-cpp/common/test/CMakeLists.txt +38 -0
- data/vendor/datasketches-cpp/common/test/catch.hpp +17618 -0
- data/vendor/datasketches-cpp/common/test/catch_runner.cpp +7 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.cpp +31 -0
- data/vendor/datasketches-cpp/common/test/test_allocator.hpp +108 -0
- data/vendor/datasketches-cpp/common/test/test_runner.cpp +29 -0
- data/vendor/datasketches-cpp/common/test/test_type.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/CMakeLists.txt +74 -0
- data/vendor/datasketches-cpp/cpc/include/compression_data.hpp +6022 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_common.hpp +62 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor.hpp +147 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_compressor_impl.hpp +742 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_confidence.hpp +167 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch.hpp +311 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_sketch_impl.hpp +810 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union.hpp +102 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_union_impl.hpp +346 -0
- data/vendor/datasketches-cpp/cpc/include/cpc_util.hpp +137 -0
- data/vendor/datasketches-cpp/cpc/include/icon_estimator.hpp +274 -0
- data/vendor/datasketches-cpp/cpc/include/kxp_byte_lookup.hpp +81 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table.hpp +84 -0
- data/vendor/datasketches-cpp/cpc/include/u32_table_impl.hpp +266 -0
- data/vendor/datasketches-cpp/cpc/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/cpc/test/compression_test.cpp +67 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_sketch_test.cpp +381 -0
- data/vendor/datasketches-cpp/cpc/test/cpc_union_test.cpp +149 -0
- data/vendor/datasketches-cpp/fi/CMakeLists.txt +54 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch.hpp +319 -0
- data/vendor/datasketches-cpp/fi/include/frequent_items_sketch_impl.hpp +484 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map.hpp +114 -0
- data/vendor/datasketches-cpp/fi/include/reverse_purge_hash_map_impl.hpp +345 -0
- data/vendor/datasketches-cpp/fi/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_custom_type_test.cpp +84 -0
- data/vendor/datasketches-cpp/fi/test/frequent_items_sketch_test.cpp +360 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/items_sketch_string_utf8_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/longs_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/fi/test/reverse_purge_hash_map_test.cpp +47 -0
- data/vendor/datasketches-cpp/hll/CMakeLists.txt +92 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap-internal.hpp +303 -0
- data/vendor/datasketches-cpp/hll/include/AuxHashMap.hpp +83 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable-internal.hpp +811 -0
- data/vendor/datasketches-cpp/hll/include/CompositeInterpolationXTable.hpp +40 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet-internal.hpp +291 -0
- data/vendor/datasketches-cpp/hll/include/CouponHashSet.hpp +59 -0
- data/vendor/datasketches-cpp/hll/include/CouponList-internal.hpp +417 -0
- data/vendor/datasketches-cpp/hll/include/CouponList.hpp +91 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation-internal.hpp +233 -0
- data/vendor/datasketches-cpp/hll/include/CubicInterpolation.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers-internal.hpp +90 -0
- data/vendor/datasketches-cpp/hll/include/HarmonicNumbers.hpp +48 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array-internal.hpp +335 -0
- data/vendor/datasketches-cpp/hll/include/Hll4Array.hpp +69 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array-internal.hpp +124 -0
- data/vendor/datasketches-cpp/hll/include/Hll6Array.hpp +55 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array-internal.hpp +158 -0
- data/vendor/datasketches-cpp/hll/include/Hll8Array.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/HllArray-internal.hpp +706 -0
- data/vendor/datasketches-cpp/hll/include/HllArray.hpp +136 -0
- data/vendor/datasketches-cpp/hll/include/HllSketch-internal.hpp +462 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl-internal.hpp +149 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImpl.hpp +85 -0
- data/vendor/datasketches-cpp/hll/include/HllSketchImplFactory.hpp +170 -0
- data/vendor/datasketches-cpp/hll/include/HllUnion-internal.hpp +287 -0
- data/vendor/datasketches-cpp/hll/include/HllUtil.hpp +239 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables-internal.hpp +112 -0
- data/vendor/datasketches-cpp/hll/include/RelativeErrorTables.hpp +46 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator-internal.hpp +56 -0
- data/vendor/datasketches-cpp/hll/include/coupon_iterator.hpp +43 -0
- data/vendor/datasketches-cpp/hll/include/hll.hpp +669 -0
- data/vendor/datasketches-cpp/hll/include/hll.private.hpp +32 -0
- data/vendor/datasketches-cpp/hll/test/AuxHashMapTest.cpp +79 -0
- data/vendor/datasketches-cpp/hll/test/CMakeLists.txt +51 -0
- data/vendor/datasketches-cpp/hll/test/CouponHashSetTest.cpp +130 -0
- data/vendor/datasketches-cpp/hll/test/CouponListTest.cpp +181 -0
- data/vendor/datasketches-cpp/hll/test/CrossCountingTest.cpp +93 -0
- data/vendor/datasketches-cpp/hll/test/HllArrayTest.cpp +191 -0
- data/vendor/datasketches-cpp/hll/test/HllSketchTest.cpp +389 -0
- data/vendor/datasketches-cpp/hll/test/HllUnionTest.cpp +313 -0
- data/vendor/datasketches-cpp/hll/test/IsomorphicTest.cpp +141 -0
- data/vendor/datasketches-cpp/hll/test/TablesTest.cpp +44 -0
- data/vendor/datasketches-cpp/hll/test/ToFromByteArrayTest.cpp +168 -0
- data/vendor/datasketches-cpp/hll/test/array6_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/compact_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/list_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_array4_from_java.sk +0 -0
- data/vendor/datasketches-cpp/hll/test/updatable_set_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/CMakeLists.txt +58 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper.hpp +150 -0
- data/vendor/datasketches-cpp/kll/include/kll_helper_impl.hpp +319 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator.hpp +67 -0
- data/vendor/datasketches-cpp/kll/include/kll_quantile_calculator_impl.hpp +169 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch.hpp +559 -0
- data/vendor/datasketches-cpp/kll/include/kll_sketch_impl.hpp +1131 -0
- data/vendor/datasketches-cpp/kll/test/CMakeLists.txt +44 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_custom_type_test.cpp +154 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_float_one_item_v1.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_from_java.sk +0 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_test.cpp +685 -0
- data/vendor/datasketches-cpp/kll/test/kll_sketch_validation.cpp +229 -0
- data/vendor/datasketches-cpp/pyproject.toml +17 -0
- data/vendor/datasketches-cpp/python/CMakeLists.txt +61 -0
- data/vendor/datasketches-cpp/python/README.md +78 -0
- data/vendor/datasketches-cpp/python/jupyter/CPCSketch.ipynb +345 -0
- data/vendor/datasketches-cpp/python/jupyter/FrequentItemsSketch.ipynb +354 -0
- data/vendor/datasketches-cpp/python/jupyter/HLLSketch.ipynb +346 -0
- data/vendor/datasketches-cpp/python/jupyter/KLLSketch.ipynb +463 -0
- data/vendor/datasketches-cpp/python/jupyter/ThetaSketchNotebook.ipynb +396 -0
- data/vendor/datasketches-cpp/python/src/__init__.py +2 -0
- data/vendor/datasketches-cpp/python/src/cpc_wrapper.cpp +90 -0
- data/vendor/datasketches-cpp/python/src/datasketches.cpp +40 -0
- data/vendor/datasketches-cpp/python/src/fi_wrapper.cpp +123 -0
- data/vendor/datasketches-cpp/python/src/hll_wrapper.cpp +136 -0
- data/vendor/datasketches-cpp/python/src/kll_wrapper.cpp +209 -0
- data/vendor/datasketches-cpp/python/src/theta_wrapper.cpp +162 -0
- data/vendor/datasketches-cpp/python/src/vector_of_kll.cpp +488 -0
- data/vendor/datasketches-cpp/python/src/vo_wrapper.cpp +140 -0
- data/vendor/datasketches-cpp/python/tests/__init__.py +0 -0
- data/vendor/datasketches-cpp/python/tests/cpc_test.py +64 -0
- data/vendor/datasketches-cpp/python/tests/fi_test.py +110 -0
- data/vendor/datasketches-cpp/python/tests/hll_test.py +131 -0
- data/vendor/datasketches-cpp/python/tests/kll_test.py +119 -0
- data/vendor/datasketches-cpp/python/tests/theta_test.py +121 -0
- data/vendor/datasketches-cpp/python/tests/vector_of_kll_test.py +148 -0
- data/vendor/datasketches-cpp/python/tests/vo_test.py +101 -0
- data/vendor/datasketches-cpp/sampling/CMakeLists.txt +48 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch.hpp +392 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_sketch_impl.hpp +1752 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union.hpp +239 -0
- data/vendor/datasketches-cpp/sampling/include/var_opt_union_impl.hpp +645 -0
- data/vendor/datasketches-cpp/sampling/test/CMakeLists.txt +43 -0
- data/vendor/datasketches-cpp/sampling/test/binaries_from_java.txt +67 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_sketch_test.cpp +509 -0
- data/vendor/datasketches-cpp/sampling/test/var_opt_union_test.cpp +358 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_long_sampling.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_sketch_string_exact.sk +0 -0
- data/vendor/datasketches-cpp/sampling/test/varopt_union_double_sampling.sk +0 -0
- data/vendor/datasketches-cpp/setup.py +94 -0
- data/vendor/datasketches-cpp/theta/CMakeLists.txt +57 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b.hpp +73 -0
- data/vendor/datasketches-cpp/theta/include/theta_a_not_b_impl.hpp +83 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection.hpp +88 -0
- data/vendor/datasketches-cpp/theta/include/theta_intersection_impl.hpp +130 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch.hpp +533 -0
- data/vendor/datasketches-cpp/theta/include/theta_sketch_impl.hpp +939 -0
- data/vendor/datasketches-cpp/theta/include/theta_union.hpp +122 -0
- data/vendor/datasketches-cpp/theta/include/theta_union_impl.hpp +109 -0
- data/vendor/datasketches-cpp/theta/test/CMakeLists.txt +45 -0
- data/vendor/datasketches-cpp/theta/test/theta_a_not_b_test.cpp +244 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_intersection_test.cpp +218 -0
- data/vendor/datasketches-cpp/theta/test/theta_sketch_test.cpp +438 -0
- data/vendor/datasketches-cpp/theta/test/theta_union_test.cpp +97 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/theta/test/theta_update_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/CMakeLists.txt +104 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_a_not_b_impl.hpp +32 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection.hpp +52 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_intersection_impl.hpp +31 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch.hpp +179 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_sketch_impl.hpp +238 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union.hpp +81 -0
- data/vendor/datasketches-cpp/tuple/include/array_of_doubles_union_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_sampled_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/bounds_on_ratios_in_theta_sketched_sets.hpp +135 -0
- data/vendor/datasketches-cpp/tuple/include/jaccard_similarity.hpp +172 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental.hpp +53 -0
- data/vendor/datasketches-cpp/tuple/include/theta_a_not_b_experimental_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/theta_comparators.hpp +48 -0
- data/vendor/datasketches-cpp/tuple/include/theta_constants.hpp +34 -0
- data/vendor/datasketches-cpp/tuple/include/theta_helpers.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base.hpp +59 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_base_impl.hpp +121 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental.hpp +78 -0
- data/vendor/datasketches-cpp/tuple/include/theta_intersection_experimental_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base.hpp +54 -0
- data/vendor/datasketches-cpp/tuple/include/theta_set_difference_base_impl.hpp +80 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental.hpp +393 -0
- data/vendor/datasketches-cpp/tuple/include/theta_sketch_experimental_impl.hpp +481 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base.hpp +60 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_base_impl.hpp +84 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental.hpp +88 -0
- data/vendor/datasketches-cpp/tuple/include/theta_union_experimental_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base.hpp +259 -0
- data/vendor/datasketches-cpp/tuple/include/theta_update_sketch_base_impl.hpp +389 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b.hpp +57 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_a_not_b_impl.hpp +33 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection.hpp +104 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_intersection_impl.hpp +43 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch.hpp +496 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_sketch_impl.hpp +587 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union.hpp +109 -0
- data/vendor/datasketches-cpp/tuple/include/tuple_union_impl.hpp +47 -0
- data/vendor/datasketches-cpp/tuple/test/CMakeLists.txt +53 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_1_compact_non_empty_no_entries_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_2_compact_exact_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/aod_3_compact_empty_from_java.sk +1 -0
- data/vendor/datasketches-cpp/tuple/test/array_of_doubles_sketch_test.cpp +298 -0
- data/vendor/datasketches-cpp/tuple/test/theta_a_not_b_experimental_test.cpp +250 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_empty_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_estimation_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_compact_single_item_from_java.sk +0 -0
- data/vendor/datasketches-cpp/tuple/test/theta_intersection_experimental_test.cpp +224 -0
- data/vendor/datasketches-cpp/tuple/test/theta_jaccard_similarity_test.cpp +144 -0
- data/vendor/datasketches-cpp/tuple/test/theta_sketch_experimental_test.cpp +247 -0
- data/vendor/datasketches-cpp/tuple/test/theta_union_experimental_test.cpp +44 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_a_not_b_test.cpp +289 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_intersection_test.cpp +235 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_jaccard_similarity_test.cpp +98 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_allocation_test.cpp +102 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_sketch_test.cpp +249 -0
- data/vendor/datasketches-cpp/tuple/test/tuple_union_test.cpp +187 -0
- metadata +302 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include <catch.hpp>
|
|
21
|
+
|
|
22
|
+
#include "cpc_union.hpp"
|
|
23
|
+
|
|
24
|
+
namespace datasketches {
|
|
25
|
+
|
|
26
|
+
// Relative error tolerance used by the tests below when comparing estimates:
// each Approx margin is the expected value multiplied by this factor.
static const double RELATIVE_ERROR_FOR_LG_K_11 = 0.02;
|
|
27
|
+
|
|
28
|
+
// The union constructor accepts both boundary lg_k values and rejects the
// values immediately outside them with std::invalid_argument.
TEST_CASE("cpc union: lg k limits", "[cpc_union]") {
  // both boundary values must construct without throwing
  cpc_union at_lower_bound(CPC_MIN_LG_K);
  cpc_union at_upper_bound(CPC_MAX_LG_K);

  // one step past either boundary must be rejected
  REQUIRE_THROWS_AS(cpc_union(CPC_MIN_LG_K - 1), std::invalid_argument);
  REQUIRE_THROWS_AS(cpc_union(CPC_MAX_LG_K + 1), std::invalid_argument);
}
|
|
34
|
+
|
|
35
|
+
// A union that never received an update produces an empty result whose
// estimate is exactly zero.
TEST_CASE("cpc union: empty", "[cpc_union]") {
  cpc_union untouched(11);
  auto result = untouched.get_result();
  REQUIRE(result.is_empty());
  REQUIRE(result.get_estimate() == 0.0);
}
|
|
41
|
+
|
|
42
|
+
// Exercises the copy constructor and copy assignment of the union: the copy
// must carry the state of its source at the time of the copy.
TEST_CASE("cpc union: copy", "[cpc_union]") {
  cpc_sketch sketch(11);
  sketch.update(1);
  cpc_union first(11);
  first.update(sketch);

  // copy constructor
  cpc_union second(first);
  auto after_copy = second.get_result();
  REQUIRE_FALSE(after_copy.is_empty());
  REQUIRE(after_copy.get_estimate() == Approx(1).margin(RELATIVE_ERROR_FOR_LG_K_11));

  // grow the copy, then assign it back over the original
  sketch.update(2);
  second.update(sketch);
  first = second; // operator=
  auto after_assign = first.get_result();
  REQUIRE_FALSE(after_assign.is_empty());
  REQUIRE(after_assign.get_estimate() == Approx(2).margin(2 * RELATIVE_ERROR_FOR_LG_K_11));
}
|
|
59
|
+
|
|
60
|
+
// A union built with the same seed as the sketch merges it normally; a union
// built with a different seed rejects the sketch with std::invalid_argument.
TEST_CASE("cpc union: custom seed", "[cpc_union]") {
  cpc_sketch seeded(11, 123);
  for (int value = 1; value <= 3; value++) {
    seeded.update(value);
  }

  // matching seed
  cpc_union same_seed(11, 123);
  same_seed.update(seeded);
  auto merged = same_seed.get_result();
  REQUIRE_FALSE(merged.is_empty());
  REQUIRE(merged.get_estimate() == Approx(3).margin(3 * RELATIVE_ERROR_FOR_LG_K_11));

  // incompatible seed
  cpc_union other_seed(11, 234);
  REQUIRE_THROWS_AS(other_seed.update(seeded), std::invalid_argument);
}
|
|
77
|
+
|
|
78
|
+
// Feeds one stream of distinct keys both to a single reference sketch and, in
// batches, to temporary sketches that are merged through a union. The union
// result must report the same number of coupons as the reference sketch and an
// estimate within the relative tolerance.
TEST_CASE("cpc union: large", "[cpc_union]") {
  const int num_batches = 1000;
  const int keys_per_batch = 10000;

  int key = 0; // running key shared by all batches, so every key is distinct
  cpc_sketch s(11);
  cpc_union u(11);
  // the two loop indices have distinct names; the original reused `i` for both,
  // so the inner index shadowed the outer one
  for (int batch = 0; batch < num_batches; batch++) {
    cpc_sketch tmp(11);
    for (int j = 0; j < keys_per_batch; j++) {
      s.update(key);
      tmp.update(key);
      key++;
    }
    u.update(tmp);
  }
  cpc_sketch r = u.get_result();
  REQUIRE(r.get_num_coupons() == s.get_num_coupons());
  REQUIRE(r.get_estimate() == Approx(s.get_estimate()).margin(s.get_estimate() * RELATIVE_ERROR_FOR_LG_K_11));
}
|
|
95
|
+
|
|
96
|
+
// Merging a lg_k = 11 sketch into a fresh lg_k = 12 union yields a result
// with lg_k = 11.
TEST_CASE("cpc union: reduce k empty", "[cpc_union]") {
  cpc_sketch small_k(11);
  for (int value = 0; value < 10000; value++) {
    small_k.update(value);
  }

  cpc_union big_k_union(12);
  big_k_union.update(small_k);

  cpc_sketch result = big_k_union.get_result();
  REQUIRE(result.get_lg_k() == 11);
  REQUIRE(result.get_estimate() == Approx(10000).margin(10000 * RELATIVE_ERROR_FOR_LG_K_11));
}
|
|
105
|
+
|
|
106
|
+
// A lg_k = 12 union first takes a 100-key lg_k = 12 sketch and then a
// 1000-key lg_k = 11 sketch; the result must come out with lg_k = 11.
TEST_CASE("cpc union: reduce k sparse", "[cpc_union]") {
  cpc_union target(12);

  cpc_sketch first(12);
  for (int value = 0; value < 100; value++) {
    first.update(value);
  }
  target.update(first);

  cpc_sketch second(11);
  for (int value = 0; value < 1000; value++) {
    second.update(value);
  }
  target.update(second);

  cpc_sketch result = target.get_result();
  REQUIRE(result.get_lg_k() == 11);
  REQUIRE(result.get_estimate() == Approx(1000).margin(1000 * RELATIVE_ERROR_FOR_LG_K_11));
}
|
|
121
|
+
|
|
122
|
+
// A lg_k = 12 union first takes a 500-key lg_k = 12 sketch and then a
// 1000-key lg_k = 11 sketch; the result must come out with lg_k = 11.
TEST_CASE("cpc union: reduce k window", "[cpc_union]") {
  cpc_union target(12);

  cpc_sketch first(12);
  for (int value = 0; value < 500; value++) {
    first.update(value);
  }
  target.update(first);

  cpc_sketch second(11);
  for (int value = 0; value < 1000; value++) {
    second.update(value);
  }
  target.update(second);

  cpc_sketch result = target.get_result();
  REQUIRE(result.get_lg_k() == 11);
  REQUIRE(result.get_estimate() == Approx(1000).margin(1000 * RELATIVE_ERROR_FOR_LG_K_11));
}
|
|
137
|
+
|
|
138
|
+
// Updates the union from an rvalue sketch (std::move) and checks the estimate
// of the merged result.
TEST_CASE("cpc union: moving update", "[cpc_union]") {
  cpc_union target(11);
  cpc_sketch source(11);
  for (int value = 0; value < 100; value++) {
    source.update(value); // sparse
  }
  target.update(std::move(source));

  cpc_sketch result = target.get_result();
  REQUIRE(result.get_estimate() == Approx(100).margin(100 * RELATIVE_ERROR_FOR_LG_K_11));
}
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
} /* namespace datasketches */
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
|
3
|
+
# distributed with this work for additional information
|
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
|
6
|
+
# "License"); you may not use this file except in compliance
|
|
7
|
+
# with the License. You may obtain a copy of the License at
|
|
8
|
+
#
|
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
#
|
|
11
|
+
# Unless required by applicable law or agreed to in writing,
|
|
12
|
+
# software distributed under the License is distributed on an
|
|
13
|
+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
14
|
+
# KIND, either express or implied. See the License for the
|
|
15
|
+
# specific language governing permissions and limitations
|
|
16
|
+
# under the License.
|
|
17
|
+
|
|
18
|
+
# Header-only (INTERFACE) target for the frequent items sketch, plus its
# namespaced alias.
add_library(fi INTERFACE)
add_library(${PROJECT_NAME}::FI ALIAS fi)

# Tests are only configured when BUILD_TESTS is set.
if (BUILD_TESTS)
  add_subdirectory(test)
endif()

# Headers come from the source tree while building and from the install
# prefix once installed.
target_include_directories(fi
  INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:$<INSTALL_PREFIX>/include>
)

target_link_libraries(fi INTERFACE common)
target_compile_features(fi INTERFACE cxx_std_11)

# Headers copied by the install step (paths relative to this directory).
set(fi_HEADERS
  "include/frequent_items_sketch.hpp"
  "include/frequent_items_sketch_impl.hpp"
  "include/reverse_purge_hash_map.hpp"
  "include/reverse_purge_hash_map_impl.hpp"
)

install(TARGETS fi EXPORT ${PROJECT_NAME})

install(FILES ${fi_HEADERS}
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/DataSketches")

# The same headers, as absolute paths, attached to the target as interface sources.
target_sources(fi
  INTERFACE
    ${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/frequent_items_sketch_impl.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map.hpp
    ${CMAKE_CURRENT_SOURCE_DIR}/include/reverse_purge_hash_map_impl.hpp
)
|
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Licensed to the Apache Software Foundation (ASF) under one
|
|
3
|
+
* or more contributor license agreements. See the NOTICE file
|
|
4
|
+
* distributed with this work for additional information
|
|
5
|
+
* regarding copyright ownership. The ASF licenses this file
|
|
6
|
+
* to you under the Apache License, Version 2.0 (the
|
|
7
|
+
* "License"); you may not use this file except in compliance
|
|
8
|
+
* with the License. You may obtain a copy of the License at
|
|
9
|
+
*
|
|
10
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
*
|
|
12
|
+
* Unless required by applicable law or agreed to in writing,
|
|
13
|
+
* software distributed under the License is distributed on an
|
|
14
|
+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
15
|
+
* KIND, either express or implied. See the License for the
|
|
16
|
+
* specific language governing permissions and limitations
|
|
17
|
+
* under the License.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#ifndef FREQUENT_ITEMS_SKETCH_HPP_
|
|
21
|
+
#define FREQUENT_ITEMS_SKETCH_HPP_
|
|
22
|
+
|
|
23
|
+
#include <memory>
|
|
24
|
+
#include <vector>
|
|
25
|
+
#include <iostream>
|
|
26
|
+
#include <functional>
|
|
27
|
+
#include <type_traits>
|
|
28
|
+
|
|
29
|
+
#include "reverse_purge_hash_map.hpp"
|
|
30
|
+
#include "common_defs.hpp"
|
|
31
|
+
#include "serde.hpp"
|
|
32
|
+
|
|
33
|
+
namespace datasketches {
|
|
34
|
+
|
|
35
|
+
/*
|
|
36
|
+
* Based on Java implementation here:
|
|
37
|
+
* https://github.com/DataSketches/sketches-core/blob/master/src/main/java/com/yahoo/sketches/frequencies/ItemsSketch.java
|
|
38
|
+
* author Alexander Saydakov
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
// Selects the kind of error allowed when listing frequent items against a threshold:
// NO_FALSE_POSITIVES includes an item only if its lower bound exceeds the threshold,
// NO_FALSE_NEGATIVES includes an item if its upper bound exceeds the threshold.
enum frequent_items_error_type { NO_FALSE_POSITIVES, NO_FALSE_NEGATIVES };
|
|
42
|
+
|
|
43
|
+
// for serialization as raw bytes
|
|
44
|
+
// allocator type A rebound to allocate uint8_t
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
|
|
45
|
+
// vector of uint8_t that allocates through AllocU8<A>
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
|
|
46
|
+
|
|
47
|
+
// type W for weight must be an arithmetic type (integral or floating point)
|
|
48
|
+
template<typename T, typename W = uint64_t, typename H = std::hash<T>, typename E = std::equal_to<T>, typename S = serde<T>, typename A = std::allocator<T>>
|
|
49
|
+
class frequent_items_sketch {
|
|
50
|
+
public:
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Construct this sketch with parameters lg_max_map_size and lg_start_map_size.
|
|
54
|
+
*
|
|
55
|
+
* @param lg_max_map_size Log2 of the physical size of the internal hash map managed by this
|
|
56
|
+
* sketch. The maximum capacity of this internal hash map is 0.75 times 2^lg_max_map_size.
|
|
57
|
+
* Both the ultimate accuracy and size of this sketch are functions of lg_max_map_size.
|
|
58
|
+
*
|
|
59
|
+
* @param lg_start_map_size Log2 of the starting physical size of the internal hash
|
|
60
|
+
* map managed by this sketch.
|
|
61
|
+
*/
|
|
62
|
+
explicit frequent_items_sketch(uint8_t lg_max_map_size, uint8_t lg_start_map_size = LG_MIN_MAP_SIZE);
|
|
63
|
+
|
|
64
|
+
/**
|
|
65
|
+
* Update this sketch with an item and a positive weight (frequency count).
|
|
66
|
+
* @param item for which the weight should be increased (lvalue)
|
|
67
|
+
* @param weight the amount by which the weight of the item should be increased
|
|
68
|
+
* A count of zero is a no-op, and a negative count will throw an exception.
|
|
69
|
+
*/
|
|
70
|
+
void update(const T& item, W weight = 1);
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Update this sketch with an item and a positive weight (frequency count).
|
|
74
|
+
* @param item for which the weight should be increased (rvalue)
|
|
75
|
+
* @param weight the amount by which the weight of the item should be increased
|
|
76
|
+
* A count of zero is a no-op, and a negative count will throw an exception.
|
|
77
|
+
*/
|
|
78
|
+
void update(T&& item, W weight = 1);
|
|
79
|
+
|
|
80
|
+
/**
|
|
81
|
+
* This function merges the other sketch into this one.
|
|
82
|
+
* The other sketch may be of a different size.
|
|
83
|
+
* @param other sketch to be merged into this (lvalue)
|
|
84
|
+
*/
|
|
85
|
+
void merge(const frequent_items_sketch& other);
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* This function merges the other sketch into this one.
|
|
89
|
+
* The other sketch may be of a different size.
|
|
90
|
+
* @param other sketch to be merged into this (rvalue)
|
|
91
|
+
*/
|
|
92
|
+
void merge(frequent_items_sketch&& other);
|
|
93
|
+
|
|
94
|
+
/**
|
|
95
|
+
* @return true if this sketch is empty
|
|
96
|
+
*/
|
|
97
|
+
bool is_empty() const;
|
|
98
|
+
|
|
99
|
+
/**
|
|
100
|
+
* @return the number of active items in the sketch
|
|
101
|
+
*/
|
|
102
|
+
uint32_t get_num_active_items() const;
|
|
103
|
+
|
|
104
|
+
/**
|
|
105
|
+
* Returns the sum of the weights (frequencies) in the stream seen so far by the sketch
|
|
106
|
+
*
|
|
107
|
+
* @return the total weight of all items in the stream seen so far by the sketch
|
|
108
|
+
*/
|
|
109
|
+
W get_total_weight() const;
|
|
110
|
+
|
|
111
|
+
/**
|
|
112
|
+
* Returns the estimate of the weight (frequency) of the given item.
|
|
113
|
+
* Note: The true frequency of an item would be the sum of the counts as a result of the
|
|
114
|
+
* two update functions.
|
|
115
|
+
*
|
|
116
|
+
* @param item the given item
|
|
117
|
+
* @return the estimate of the weight (frequency) of the given item
|
|
118
|
+
*/
|
|
119
|
+
W get_estimate(const T& item) const;
|
|
120
|
+
|
|
121
|
+
/**
|
|
122
|
+
* Returns the guaranteed lower bound weight (frequency) of the given item.
|
|
123
|
+
*
|
|
124
|
+
* @param item the given item.
|
|
125
|
+
* @return the guaranteed lower bound weight of the given item. That is, a number which
|
|
126
|
+
* is guaranteed to be no larger than the real weight.
|
|
127
|
+
*/
|
|
128
|
+
W get_lower_bound(const T& item) const;
|
|
129
|
+
|
|
130
|
+
/**
|
|
131
|
+
* Returns the guaranteed upper bound weight (frequency) of the given item.
|
|
132
|
+
*
|
|
133
|
+
* @param item the given item
|
|
134
|
+
* @return the guaranteed upper bound weight of the given item. That is, a number which
|
|
135
|
+
* is guaranteed to be no smaller than the real frequency.
|
|
136
|
+
*/
|
|
137
|
+
W get_upper_bound(const T& item) const;
|
|
138
|
+
|
|
139
|
+
/**
|
|
140
|
+
* @return An upper bound on the maximum error of get_estimate(item) for any item.
|
|
141
|
+
* This is equivalent to the maximum distance between the upper bound and the lower bound
|
|
142
|
+
* for any item.
|
|
143
|
+
*/
|
|
144
|
+
W get_maximum_error() const;
|
|
145
|
+
|
|
146
|
+
/**
|
|
147
|
+
* Returns epsilon value of this sketch.
|
|
148
|
+
* This is just the value <i>3.5 / max_map_size</i>.
|
|
149
|
+
* @return epsilon used by the sketch to compute error.
|
|
150
|
+
*/
|
|
151
|
+
double get_epsilon() const;
|
|
152
|
+
|
|
153
|
+
/**
|
|
154
|
+
* Returns epsilon used to compute <i>a priori</i> error.
|
|
155
|
+
* This is just the value <i>3.5 / max_map_size</i>, where max_map_size = 2^lg_max_map_size.
|
|
156
|
+
* @param lg_max_map_size base-2 logarithm of the planned map size to be used when constructing this sketch.
|
|
157
|
+
* @return epsilon used to compute <i>a priori</i> error.
|
|
158
|
+
*/
|
|
159
|
+
static double get_epsilon(uint8_t lg_max_map_size);
|
|
160
|
+
|
|
161
|
+
/**
|
|
162
|
+
* Returns the estimated <i>a priori</i> error given the max_map_size for the sketch and the
|
|
163
|
+
* estimated_total_stream_weight.
|
|
164
|
+
* @param lg_max_map_size base-2 logarithm of the planned map size to be used when constructing this sketch.
|
|
165
|
+
* @param estimated_total_weight the estimated total stream weight.
|
|
166
|
+
* @return the estimated <i>a priori</i> error.
|
|
167
|
+
*/
|
|
168
|
+
static double get_apriori_error(uint8_t lg_max_map_size, W estimated_total_weight);
|
|
169
|
+
|
|
170
|
+
class row;
|
|
171
|
+
typedef typename std::vector<row, typename std::allocator_traits<A>::template rebind_alloc<row>> vector_row; // alias for users
|
|
172
|
+
|
|
173
|
+
/**
|
|
174
|
+
* Returns an array of rows that include frequent items, estimates, upper and lower bounds
|
|
175
|
+
* given an error_type and using get_maximum_error() as a threshold.
|
|
176
|
+
*
|
|
177
|
+
* <p>The method first examines all active items in the sketch (items that have a counter).</p>
|
|
178
|
+
*
|
|
179
|
+
* <p>If <i>error_type = NO_FALSE_NEGATIVES</i>, this will include an item in the result
|
|
180
|
+
* list if get_upper_bound(item) > threshold.
|
|
181
|
+
* There will be no false negatives, i.e., no Type II error.
|
|
182
|
+
* There may be items in the set with true frequencies less than the threshold
|
|
183
|
+
* (false positives).</p>
|
|
184
|
+
*
|
|
185
|
+
* <p>If <i>error_type = NO_FALSE_POSITIVES</i>, this will include an item in the result
|
|
186
|
+
* list if get_lower_bound(item) > threshold.
|
|
187
|
+
* There will be no false positives, i.e., no Type I error.
|
|
188
|
+
* There may be items omitted from the set with true frequencies greater than the
|
|
189
|
+
* threshold (false negatives).</p>
|
|
190
|
+
*
|
|
191
|
+
* @param err_type determines whether no false positives or no false negatives are desired.
|
|
192
|
+
* @return an array of frequent items
|
|
193
|
+
*/
|
|
194
|
+
vector_row get_frequent_items(frequent_items_error_type err_type) const;
|
|
195
|
+
|
|
196
|
+
/**
|
|
197
|
+
* Returns an array of rows that include frequent items, estimates, upper and lower bounds
|
|
198
|
+
* given an error_type and a threshold.
|
|
199
|
+
*
|
|
200
|
+
* <p>The method first examines all active items in the sketch (items that have a counter).</p>
|
|
201
|
+
*
|
|
202
|
+
* <p>If <i>error_type = NO_FALSE_NEGATIVES</i>, this will include an item in the result
|
|
203
|
+
* list if get_upper_bound(item) > threshold.
|
|
204
|
+
* There will be no false negatives, i.e., no Type II error.
|
|
205
|
+
* There may be items in the set with true frequencies less than the threshold
|
|
206
|
+
* (false positives).</p>
|
|
207
|
+
*
|
|
208
|
+
* <p>If <i>error_type = NO_FALSE_POSITIVES</i>, this will include an item in the result
|
|
209
|
+
* list if get_lower_bound(item) > threshold.
|
|
210
|
+
* There will be no false positives, i.e., no Type I error.
|
|
211
|
+
* There may be items omitted from the set with true frequencies greater than the
|
|
212
|
+
* threshold (false negatives).</p>
|
|
213
|
+
*
|
|
214
|
+
* @param err_type determines whether no false positives or no false negatives are desired.
|
|
215
|
+
* @param threshold to include items in the result list
|
|
216
|
+
* @return an array of frequent items
|
|
217
|
+
*/
|
|
218
|
+
vector_row get_frequent_items(frequent_items_error_type err_type, W threshold) const;
|
|
219
|
+
|
|
220
|
+
/**
|
|
221
|
+
* Computes size needed to serialize the current state of the sketch.
|
|
222
|
+
* This can be expensive since every item needs to be looked at.
|
|
223
|
+
* @return size in bytes needed to serialize this sketch
|
|
224
|
+
*/
|
|
225
|
+
size_t get_serialized_size_bytes() const;
|
|
226
|
+
|
|
227
|
+
/**
|
|
228
|
+
* This method serializes the sketch into a given stream in a binary form
|
|
229
|
+
* @param os output stream
|
|
230
|
+
*/
|
|
231
|
+
void serialize(std::ostream& os) const;
|
|
232
|
+
|
|
233
|
+
// This is a convenience alias for users
|
|
234
|
+
// The type returned by the following serialize method
|
|
235
|
+
typedef vector_u8<A> vector_bytes;
|
|
236
|
+
|
|
237
|
+
/**
|
|
238
|
+
* This method serializes the sketch as a vector of bytes.
|
|
239
|
+
* An optional header can be reserved in front of the sketch.
|
|
240
|
+
* It is a blank space of a given size.
|
|
241
|
+
* This header is used in Datasketches PostgreSQL extension.
|
|
242
|
+
* @param header_size_bytes space to reserve in front of the sketch
|
|
243
|
+
* @return serialized sketch as a vector of bytes
|
|
244
|
+
*/
|
|
245
|
+
vector_bytes serialize(unsigned header_size_bytes = 0) const;
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* This method deserializes a sketch from a given stream.
|
|
249
|
+
* @param is input stream
|
|
250
|
+
* @return an instance of the sketch
|
|
251
|
+
*/
|
|
252
|
+
static frequent_items_sketch deserialize(std::istream& is);
|
|
253
|
+
|
|
254
|
+
/**
|
|
255
|
+
* This method deserializes a sketch from a given array of bytes.
|
|
256
|
+
* @param bytes pointer to the array of bytes
|
|
257
|
+
* @param size the size of the array
|
|
258
|
+
* @return an instance of the sketch
|
|
259
|
+
*/
|
|
260
|
+
static frequent_items_sketch deserialize(const void* bytes, size_t size);
|
|
261
|
+
|
|
262
|
+
/**
|
|
263
|
+
* Returns a human readable summary of this sketch
|
|
264
|
+
* @param print_items if true include the list of items retained by the sketch
|
|
265
|
+
*/
|
|
266
|
+
string<A> to_string(bool print_items = false) const;
|
|
267
|
+
|
|
268
|
+
private:
|
|
269
|
+
static const uint8_t LG_MIN_MAP_SIZE = 3;
|
|
270
|
+
static const uint8_t SERIAL_VERSION = 1;
|
|
271
|
+
static const uint8_t FAMILY_ID = 10;
|
|
272
|
+
static const uint8_t PREAMBLE_LONGS_EMPTY = 1;
|
|
273
|
+
static const uint8_t PREAMBLE_LONGS_NONEMPTY = 4;
|
|
274
|
+
static constexpr double EPSILON_FACTOR = 3.5;
|
|
275
|
+
enum flags { IS_EMPTY };
|
|
276
|
+
W total_weight;
|
|
277
|
+
W offset;
|
|
278
|
+
reverse_purge_hash_map<T, W, H, E, A> map;
|
|
279
|
+
static void check_preamble_longs(uint8_t preamble_longs, bool is_empty);
|
|
280
|
+
static void check_serial_version(uint8_t serial_version);
|
|
281
|
+
static void check_family_id(uint8_t family_id);
|
|
282
|
+
static void check_size(uint8_t lg_cur_size, uint8_t lg_max_size);
|
|
283
|
+
|
|
284
|
+
// version for integral signed type
|
|
285
|
+
template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_signed<WW>::value, int>::type = 0>
|
|
286
|
+
static inline void check_weight(WW weight);
|
|
287
|
+
|
|
288
|
+
// version for integral unsigned type
|
|
289
|
+
template<typename WW = W, typename std::enable_if<std::is_integral<WW>::value && std::is_unsigned<WW>::value, int>::type = 0>
|
|
290
|
+
static inline void check_weight(WW weight);
|
|
291
|
+
|
|
292
|
+
// version for floating point type
|
|
293
|
+
template<typename WW = W, typename std::enable_if<std::is_floating_point<WW>::value, int>::type = 0>
|
|
294
|
+
static inline void check_weight(WW weight);
|
|
295
|
+
|
|
296
|
+
// for deserialize
|
|
297
|
+
class items_deleter;
|
|
298
|
+
};
|
|
299
|
+
|
|
300
|
+
template<typename T, typename W, typename H, typename E, typename S, typename A>
|
|
301
|
+
class frequent_items_sketch<T, W, H, E, S, A>::row { // one entry of the vector_row returned by get_frequent_items(): an item pointer plus the weight and offset its estimate and bounds are computed from
|
|
302
|
+
public:
|
|
303
|
+
row(const T* item, W weight, W offset): // stores the pointer as given; the item itself is not copied
|
|
304
|
+
item(item), weight(weight), offset(offset) {}
|
|
305
|
+
const T& get_item() const { return *item; } // dereferences the stored pointer; NOTE(review): presumably the pointee must outlive this row - confirm who owns it
|
|
306
|
+
W get_estimate() const { return weight + offset; } // estimate = weight + offset
|
|
307
|
+
W get_lower_bound() const { return weight; } // lower bound = weight alone
|
|
308
|
+
W get_upper_bound() const { return weight + offset; } // upper bound = weight + offset (same value as the estimate)
|
|
309
|
+
private:
|
|
310
|
+
const T* item; // raw pointer; this class never frees it
|
|
311
|
+
W weight; // NOTE(review): presumably the item's counter value held by the sketch - confirm
|
|
312
|
+
W offset; // NOTE(review): presumably the sketch-wide offset at the time of the query - confirm
|
|
313
|
+
};
|
|
314
|
+
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
#include "frequent_items_sketch_impl.hpp"
|
|
318
|
+
|
|
319
|
+
# endif
|